# mit_topicGlobal.py
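# Driver script: preprocesses the yearly .txt files in data/ and runs the text
# mining method chosen by the user (LDA, LSA, Top2Vec, word cloud, DBSCAN-based
# GTD clustering, or STD), writing the results under output/.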
import csv
import logging
import coloredlogs
import os
import threading
from datetime import datetime
import time
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.metrics.pairwise import cosine_similarity
import text_preprocessing as tp
import embedding_word as ew
import centroid_topic as ct
import CSV_complie as cc
import PCA_plot3D as pca
import DBSCAN_topic as db
from tqdm import tqdm
import top_2_vec as t2v
import multiprocessing
from csv import writer
import pandas as pd
import operator
import sys
import lda
import lsa
import result_visualization as rv
from STD import STD

# GLOBAL VARIABLES
choose = ""
logger = logging.getLogger(__name__)
coloredlogs.install(level='INFO', logger=logger,
fmt="- Process -> pid[%(process)d], name[%(processName)s] Function -> [%(funcName)s]\n%(asctime)s --- %(levelname)s log -> [%(message)s]")
best_topic_number = {}
with open('best_num_topic.txt', 'r', encoding='UTF8') as file:
    for line in file:
        parts = line.split()
        best_topic_number[parts[0]] = parts[1]
def parallelized_function(file):  # used to clean and prepare the source text of each file
    """Multiprocess worker that preprocesses the text of a single file."""
    if file.endswith(".txt"):
        with open(f"data/{file}", encoding="utf8") as input_file:
            file_text = input_file.read()
        file_text = tp.remove_whitespace(file_text)  # remove double spaces
        file_text = tp.tokenization(file_text)  # tokenize
        file_text = tp.stopword_removing(file_text)  # remove the stopwords
        file_text = tp.pos_tagging(file_text)  # tag each word with its part of speech
        file_text = tp.lemmatization(file_text)  # reduce each word to its base form
        logger.info("Subprocess for file -> [%s]", file)
        return file_text
def choice_lda_method(data, n_topic, n_words, year):
lda_model, dictionary, corpus = lda.lda(data, n_topic, year)
# lda.print_coherence(lda_model, dictionary, corpus, data)
# lda.print_topics(lda_model, n_words)
# lda.print_documents(lda_model, n_words)
return
def choice_lsa_method(data, n_topic, n_words, year):
    # # uncomment to search for the best number of topics
# start, stop, step = round((n_topic / 4) - 2), n_topic, 1
# lsa.plot_graph(data, start, stop, step, year)
model = lsa.create_gensim_lsa_model(data, n_topic, n_words, year)
return
def frequency_analysis_file(year, filtered_docs_list, results):
    header = ['file_name', 'word_frequency']
    title_header = year + "_"
    if not os.path.exists(f"output/{year}/wordcloud"):
        os.makedirs(f"output/{year}/wordcloud")
    with open(f'output/{year}/wordcloud/{title_header}files_word_frequency.csv', 'w', encoding='UTF8',
              newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)  # write the header
        frequency_list = []
        for i in range(len(filtered_docs_list)):
            # write the word frequencies of each file
            frequency_list.append(rv.word_count(results[0][i]))
            data = [filtered_docs_list[i],
                    str(frequency_list[i]).replace(",", "").replace("[", "").replace("]", "")]
            writer.writerow(data)
    return
def word_cloud_method(file_text, year):
    words = list(file_text)
    rv.tag_cloud(words, year, "-")
    return
def choice_wordcloud_method(file_text, year, filtered_docs_list, results):
# tag cloud of most frequent words of the year
word_cloud_method(file_text, year)
    # histogram
rv.histogram(year, file_text)
# frequency analysis of the most frequent words of each file in the selected year
frequency_analysis_file(year, filtered_docs_list, results)
return
def choice_clustering_method(tot_vectors, year, file_text):
    bigClusters = db.DBSCAN_Topic(tot_vectors, year)
    # keep the words of the dense clusters that also appear in the source text
    words = []
    for word in file_text:
        for cluster in bigClusters:
            for clustered_word in cluster:
                if clustered_word == word:
                    words.append(clustered_word)
    return
def choice_top2vec(list_files):
topic_words, word_scores, topic_nums = t2v.top_2_vec(list_files)
return topic_words, word_scores, topic_nums
def printToFile(topicResults, mode, year):
    if mode == "group":
        out_dir = "output/top2vec/grouped"
        out_path = f"{out_dir}/grouped_results.csv"
    else:
        out_dir = f"output/top2vec/single/{year}"
        out_path = f"{out_dir}/{year}_results.csv"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(out_path, 'w', encoding='UTF8') as csvfile:
        fieldnames = ['File', 'TopicWords', 'WordScore', 'TopicNumber']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in topicResults:
            writer.writerow({'File': result[0], 'TopicWords': result[1],
                             'WordScore': result[2], 'TopicNumber': result[3]})
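# printToFile emits one CSV row per entry of topicResults, with the columns
# File, TopicWords, WordScore and TopicNumber.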
if __name__ == "__main__":
    arg_from_command_line = len(sys.argv) == 4
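    # When three command line arguments are given they are read, in order, as the
    # method, the year and the number of cores, e.g. (illustrative values):
    #   python mit_topicGlobal.py lda 1995 4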
    while True:
        if not arg_from_command_line:
            choose = input('Insert:\n\n'
                           '- LDA -> Type "lda" to run this text mining method.\n\n'
                           '- LSA -> Type "lsa" to run this text mining method.\n\n'
                           '- TOP_2_VEC -> Type "top2vec" to run this text mining method.\n\n'
                           '- WORD_CLOUD -> Type "wordcloud" to run this text mining method.\n\n'
                           '- GTD -> Type "gtd-cluster" to run this text mining method on a year.\n\n'
                           '- STD -> Type "std" to run this text mining method for each file in a year.\n\n'
                           '- EXIT -> Type "exit" to exit the program.\n\n')
        else:
            choose = str(sys.argv[1])
        if choose in ('lda', 'lsa', 'top2vec', 'wordcloud', 'gtd-cluster', 'pca', 'std'):
            break
        else:
            exit()  # the selected choice was exit (or not recognized)
    # get the folder with all the files to analyze
    # recommendation: put the files in the data folder and make sure that each
    # file has a name like string_string_year and is saved in .txt format
    listDoc = os.listdir("data")
    # extract the years from the file names
    years_list = []
    for doc in listDoc:
        if not doc.endswith(".txt"):
            continue
        year = os.path.splitext(doc)[0].split("_")[2]
        if year not in years_list:
            years_list.append(year)
    years_list.sort()
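    # e.g. a file named "mit_speech_1995.txt" (hypothetical name) yields the year "1995"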
    if choose == 'std':
        STD()
    else:
        if choose != "top2vec":  # every method except top2vec works on a single year
            # get the year to analyze
            if not arg_from_command_line:
                year = input("Insert the year to analyze:\n(you can choose from the list: " + str(years_list) + ")\n")
            else:
                year = str(sys.argv[2])
            # check that the year is in the list of years
            if year not in years_list:
                print("Year not in the list")
                exit()
            # collect the names of the files
            filtered_docs_list = []  # files to analyze, filtered by year
            all_docs = []  # all the files
            for doc in listDoc:
                if doc.endswith(".txt"):
                    all_docs.append(doc)
                if doc.endswith(".txt") and year in doc:
                    filtered_docs_list.append(doc)
            if filtered_docs_list == []:  # no files found for the selected year
                print("No documents found for this year")
                exit()
            # choose how many subprocesses to use
            if not arg_from_command_line:
                print("You have", multiprocessing.cpu_count(), "cores")
                core_number = input('How many cores do you want to use? (Do not overdo it)\n')
            else:
                core_number = str(sys.argv[3])
            # create a pool of processes to clean the source text
            logger.info("Start Time : %s", datetime.now())
            start_time = datetime.utcnow()
            pool = multiprocessing.Pool(processes=int(core_number))  # create the pool of processes
            results = [pool.map(parallelized_function, filtered_docs_list)]  # list of preprocessed documents
            pool.close()  # close the pool of processes
            concat_results = np.concatenate(results[0])  # concatenate all the processed documents
            # log the total processing time for the selected year
            logger.info("End Time : %s", datetime.now())
            end_time = datetime.utcnow()
            total_time = end_time - start_time
            logger.info("Total Time : %s", total_time)
if choose == "lda":
choice_lda_method(results[0], int(best_topic_number[year]), 10, year)
if choose == "lsa":
choice_lsa_method(results[0], int(best_topic_number[year]), 10, year)
if choose == "wordcloud":
clear_results = [
list(dict.fromkeys(concat_results))] # remove duplicates for clustering method
tot_vectors = {}
for word in clear_results[0]:
tot_vectors[str(word)] = ew.get_embedding(str(word)) # get the embedding of each word
choice_wordcloud_method(concat_results, year, filtered_docs_list, results)
if choose == "gtd-cluster":
clear_results = [list(dict.fromkeys(concat_results))] # remove duplicates for clustering method
tot_vectors = {}
for word in clear_results[0]:
tot_vectors[str(word)] = ew.get_embedding(str(word)) # get the embedding of each word
# get the top 50 words of the most dense cluster
if not os.path.exists(f"output/{year}/PCA/"):
os.makedirs(f"output/{year}/PCA/")
pca.pca_clustering_3D(list(tot_vectors.values()), list(tot_vectors.keys()),
f"output/{year}/PCA/InitialCluster__nWords_{len(tot_vectors)}")
choice_clustering_method(tot_vectors, year, clear_results[0])
        else:  # choose is top2vec
            singleORgrouped = input("Do you want to analyze a single year or a group of years?\n"
                                    "-> Type 'single' to analyze a single year (may not work for every year)\n"
                                    "-> Type 'group' to analyze a group of years\n")
            if singleORgrouped == "single":
                print("single")
                year = input("Insert the year to analyze:\n(you can choose from the list: " + str(years_list) + ")\n")
                # check that the year is in the list of years
                if year not in years_list:
                    print("Year not in the list")
                    exit()
                # collect the names of the files
                filtered_docs_list = []  # files to analyze, filtered by year
                for doc in listDoc:
                    if doc.endswith(".txt") and year in doc:
                        filtered_docs_list.append(doc)
                if filtered_docs_list == []:  # no files found for the selected year
                    print("No documents found for this year")
                    exit()
                list_files = []
                for doc in filtered_docs_list:
                    with open(f"data/{doc}", encoding="utf8") as input_file:
                        list_files.append(input_file.read())
                logger.info("Start Top2Vec analysis for documents contained in the year: %s. Number of documents: %s",
                            year, len(list_files))
                topic_words, word_scores, topic_nums = choice_top2vec(list_files)
                partial_results = [year, topic_words, word_scores, topic_nums]
                logger.info("End Top2Vec analysis for documents contained in the year: %s", year)
                printToFile([partial_results], "single", year)
            else:
                # run top_2_vec on documents grouped by decade
                # extract ten-year intervals from the list of years
                year_list_10 = []
                for i in range(0, len(years_list) - 9, 10):
                    year_list_10.append(years_list[i:i + 10])
                remainder = len(years_list) % 10
                if remainder:
                    year_list_10.append(years_list[-remainder:])  # take the remaining years
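                # e.g. 24 sorted years split into two full decades plus a 4-year
                # remainder group; no remainder group is added when the number of
                # years is an exact multiple of ten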
                resultsForFile = []
                for group in year_list_10:
                    list_files = []
                    for year in group:
                        for doc in listDoc:
                            if doc.endswith(".txt") and year in doc:
                                with open(f"data/{doc}", encoding="utf8") as input_file:
                                    list_files.append(input_file.read())
                    logger.info(
                        "Start Top2Vec analysis for documents contained in the years: %s. Number of documents: %s",
                        group, len(list_files))
                    topic_words, word_scores, topic_nums = choice_top2vec(list_files)
                    partial_results = [group, topic_words, word_scores, topic_nums]
                    logger.info("End Top2Vec analysis for documents contained in the years: %s", group)
                    resultsForFile.append(partial_results)
                printToFile(resultsForFile, "group", "")