-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethods.py
301 lines (242 loc) · 11.3 KB
/
methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import io
import pandas as pd
import numpy as np
from IPython.core.display_functions import display
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib.dates import MonthLocator, DateFormatter
def df_to_dtm(df):
    """
    Build a Document-Term Matrix (DTM) from a DataFrame of preprocessed text
    (obtained using preprocessing() ).

    Parameters:
        df (pandas DataFrame): Must contain a 'lemmatised_text' column of
            preprocessed strings and a 'content' column with the source text.

    Returns:
        pandas DataFrame: One row per document and one column per term,
        holding raw term counts, with the original 'content' column appended.
    """
    count_vec = CountVectorizer()
    counts = count_vec.fit_transform(df['lemmatised_text'])
    # Turn the sparse count matrix into a dataframe keyed by vocabulary term
    dtm_frame = pd.DataFrame(counts.toarray(),
                             columns=count_vec.get_feature_names_out())
    # Re-attach the original text column for reference
    dtm_frame['content'] = df['content']
    return dtm_frame
def df_to_tfidf(df):
    """
    Build a TF-IDF matrix from a DataFrame of preprocessed text (obtained
    using preprocessing() ), display it, and return it.

    Parameters:
        df (pandas DataFrame): Must contain a 'lemmatised_text' column of
            preprocessed strings and a 'content' column with the source text.

    Returns:
        pandas DataFrame: One row per document and one column per term
        (ordered by descending total TF-IDF score), holding TF-IDF scores,
        with the original 'content' column appended.
    """
    count_vec = CountVectorizer()
    term_counts = count_vec.fit_transform(df['lemmatised_text'])
    tfidf_matrix = TfidfTransformer().fit_transform(term_counts)
    tfidf_frame = pd.DataFrame(tfidf_matrix.toarray(),
                               columns=count_vec.get_feature_names_out())
    # Reorder columns so the highest-scoring terms come first
    column_order = tfidf_frame.sum().sort_values(ascending=False).index
    tfidf_frame = tfidf_frame[column_order]
    # Re-attach the original text column for reference
    tfidf_frame['content'] = df['content']
    # Widen the pandas display so more term columns are visible, then show it
    pd.set_option('display.max_columns', 100)
    display(tfidf_frame)
    return tfidf_frame
def csv_to_tfidf(file_path):
    """
    Load a CSV file of preprocessed text data (obtained using preprocessing() )
    and convert it into a TF-IDF matrix.

    Parameters:
        file_path (str): Path to a CSV file containing a 'lemmatised_text'
            column of preprocessed strings and a 'content' column.

    Returns:
        pandas DataFrame: One row per document and one column per term
        (ordered by descending total TF-IDF score), holding TF-IDF scores,
        with the original 'content' column appended.
    """
    frame = pd.read_csv(file_path)
    count_vec = CountVectorizer()
    term_counts = count_vec.fit_transform(frame['lemmatised_text'])
    tfidf_matrix = TfidfTransformer().fit_transform(term_counts)
    tfidf_frame = pd.DataFrame(tfidf_matrix.toarray(),
                               columns=count_vec.get_feature_names_out())
    # Reorder columns so the highest-scoring terms come first
    column_order = tfidf_frame.sum().sort_values(ascending=False).index
    tfidf_frame = tfidf_frame[column_order]
    # Re-attach the original text column for reference
    tfidf_frame['content'] = frame['content']
    return tfidf_frame
def read_csv_files():
    """
    Read the CSV files for The Guardian, Daily Mail, The Times, and The Sun
    and return them as dataframes.

    Returns:
        tuple: A tuple containing two elements:
            1. A list of pandas dataframes, one for each CSV file.
            2. A list of colour-name strings, one per dataframe, used when
               plotting that dataframe's data.
    """
    # (filename, plot colour) pairs, one per newspaper
    sources = [
        ('guardian.csv', 'blue'),
        ('mail.csv', 'red'),
        ('times.csv', 'green'),
        ('sun.csv', 'orange'),
    ]
    frames = [pd.read_csv(name) for name, _ in sources]
    palette = [colour for _, colour in sources]
    return frames, palette
def plot_tfidf(term: str, save: bool = False) -> bool:
    """
    Plot the development of the normalised TF-IDF score for a given term
    across four UK newspapers: The Times, Daily Mail, The Sun, and The
    Guardian, for the period between September 2022 and February 2023.

    Args:
        term (str): The term for which to plot the TF-IDF score.
        save (bool, optional): Whether to save the plot as a JPEG image file.
            Defaults to False.

    Returns:
        bool: True if the plot was created successfully, False if the term
        was not found in any newspaper's vocabulary.
    """
    vectorizer = TfidfVectorizer()
    dataframes, colors = read_csv_files()
    # Collect per-newspaper monthly frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, so build a list for concat.
    monthly_frames = []
    for i, dataframe in enumerate(dataframes):
        # Calculate the TF-IDF for all text in this newspaper's corpus
        tfidf = vectorizer.fit_transform(dataframe['lemmatised_text'])
        # Skip newspapers that never use the term instead of raising KeyError,
        # so the docstring's "otherwise False" contract can actually be met.
        term_index = vectorizer.vocabulary_.get(term)
        if term_index is None:
            continue
        # Extract the TF-IDF scores for the selected term
        tfidf_term = tfidf[:, term_index].toarray().ravel()
        # Normalise the scores to [0, 1] so the newspapers are comparable
        scaler = MinMaxScaler()
        tfidf_term = scaler.fit_transform(tfidf_term.reshape(-1, 1)).ravel()
        # Convert the date column to timezone-aware datetime objects
        dataframe['date'] = pd.to_datetime(dataframe['date'], utc=True)
        # Group by month and calculate the mean TF-IDF score for each month
        tfidf_monthly = dataframe.groupby(dataframe['date'].dt.to_period('M'))['lemmatised_text'].agg(
            ['count', lambda x: np.nanmean(tfidf_term[x.index])])
        tfidf_monthly.columns = ['count', 'tfidf']
        # Tag each row with the colour assigned to this newspaper
        tfidf_monthly['color'] = colors[i]
        monthly_frames.append(tfidf_monthly)
    if not monthly_frames:
        # Term absent from every corpus: nothing to plot
        return False
    df = pd.concat(monthly_frames)
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 8))
    # Set the x-axis to display months
    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%b %Y'))
    # Map each colour back to its newspaper label for the legend
    labels = {'blue': 'The Guardian', 'red': 'Daily Mail',
              'green': 'The Times', 'orange': 'The Sun'}
    # Plot each newspaper's monthly series in its assigned colour
    for color, group in df.groupby('color'):
        group.index = group.index.to_timestamp()
        ax.plot(group.index, group['tfidf'], label=labels[color], color=color)
    # Add labels and title
    ax.set_xlabel('Month')
    ax.set_ylabel('Normalised TF-IDF score')
    ax.set_title(f'TF-IDF score development Sep 22 - Feb 23 for term "{term}"')
    ax.legend()
    if save:
        plt.savefig(f"{term}.jpg")
    plt.show()
    plt.pause(0.001)
    return True
def get_vocab_from_csv(csv_file, lemma_col='lemmas'):
    """
    Read a CSV file containing lemmas and return a set of unique lemmas.

    Parameters
    ----------
    csv_file : str
        The path to the CSV file to read.
    lemma_col : str, optional
        The name of the column in the CSV file containing the lemmas.
        Each cell is expected to hold a Python list literal such as
        "['word1', 'word2']". Default is 'lemmas'.

    Returns
    -------
    set of str
        A set of unique lemmas from the specified column of the CSV file.

    Raises
    ------
    ValueError
        If the specified column does not exist in the CSV file, or a cell
        does not contain a valid Python literal.
    FileNotFoundError
        If the specified file does not exist or cannot be read.

    Examples
    --------
    >>> vocab = get_vocab_from_csv('my_corpus.csv')
    >>> print(vocab)
    {'word1', 'word2', 'word3', ...}
    """
    import ast  # local import: only needed here to parse the list literals

    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {csv_file}")
    if lemma_col not in df.columns:
        raise ValueError(f"Column '{lemma_col}' not found in CSV file.")
    # Use ast.literal_eval instead of eval: CSV content is untrusted input,
    # and literal_eval only accepts Python literals (no code execution).
    lemmas = df[lemma_col].apply(ast.literal_eval).explode().unique()
    return set(lemmas)
def get_avg_token_length(vocab):
    """
    Calculate the average length of tokens in a vocabulary.

    Parameters
    ----------
    vocab : set of str
        The set of strings representing the vocabulary.

    Returns
    -------
    float or None
        The average length of tokens in the vocabulary, or None if the
        input set is empty.

    Raises
    ------
    TypeError
        If the input vocabulary is not a set of strings.

    Examples
    --------
    >>> vocab = {"hello", "world", "blue", "why", "a", "bee"}
    >>> get_avg_token_length(vocab)
    3.5
    """
    # Validate the container type and every element before any arithmetic
    if not isinstance(vocab, set):
        raise TypeError("The input vocabulary must be a set of strings.")
    for token in vocab:
        if not isinstance(token, str):
            raise TypeError("The input vocabulary must be a set of strings.")
    # An empty vocabulary has no meaningful average
    if len(vocab) == 0:
        return None
    lengths = [len(token) for token in vocab]
    return sum(lengths) / len(lengths)
def get_term_position(df_tfidf, term):
    """
    Get the column index of a given term in a TF-IDF matrix represented by
    a pandas DataFrame.

    Parameters:
        df_tfidf (pandas DataFrame): TF-IDF matrix with document IDs as the
            index and individual terms as columns; cells hold the TF-IDF
            scores for each document.
        term (str): The term whose column index is to be obtained.

    Returns:
        int: The column index of the given term in the TF-IDF matrix.
    """
    # Delegate directly to pandas' column-index lookup
    return df_tfidf.columns.get_loc(term)
def compare_term_position(term):
    """
    Compare the positions of a given term in the TF-IDF matrices of four
    different CSV files and write the results to a new CSV file.

    Parameters:
        term (str): The term whose position is to be compared.

    Returns:
        None
    """
    guardian = csv_to_tfidf("guardian.csv").columns.get_loc(term)
    times = csv_to_tfidf("times.csv").columns.get_loc(term)
    sun = csv_to_tfidf("sun.csv").columns.get_loc(term)
    mail = csv_to_tfidf("mail.csv").columns.get_loc(term)
    # Build the summary once; it is both written to disk and printed
    summary = f"guardian, {guardian}, times, {times}, sun, {sun}, mail, {mail}"
    # Built-in open() accepts an encoding directly; io.open is a legacy alias
    with open(f"{term}compare.csv", "w", encoding="utf8") as file:
        file.write(summary)
    # Print for interactive use; function intentionally returns None
    print(summary)