helpers.py
# from data.test_data import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import os

# Download the NLTK resources needed for stopword removal and tokenization.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Display colors keyed by political-leaning group.
group_colors = {
    "mostly left": "#4E79A7",
    "somewhat left": "#5FA2CE",
    "center": "#9467BD",
    "somewhat right": "#FF9D9A",
    "mostly right": "#E15759"
}


def remove_stopwords(sent):
    """Return `sent` with English stopwords removed, remaining tokens re-joined by spaces."""
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(sent)
    filtered_words = [w for w in word_tokens if w.lower() not in stop_words]
    return ' '.join(filtered_words)
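
# Illustrative example (not part of the original module): remove_stopwords drops
# common English words such as "the" and "on" before downstream text processing.
# e.g. remove_stopwords("The senators voted on the new budget proposal")
# typically yields "senators voted new budget proposal" with NLTK's English list.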


def get_data(data_path):
    """Read every weekly .jsonl cluster file in `data_path` and return a dict
    mapping week name -> list of (at most 10) cluster dicts, each annotated with
    a display color, an article count, and a per-leaning article distribution."""
    keys = [
        'id',
        'name',
        'articles',
        'mostly_left_summary',
        'somewhat_left_summary',
        'center_summary',
        'somewhat_right_summary',
        'mostly_right_summary'
    ]
    processed_clusters = {}
    clusters = {}
    base_path_to_all_data = data_path
    json_files = [pos_json for pos_json in os.listdir(base_path_to_all_data) if pos_json.endswith('.jsonl')]
    json_files.sort()

    # Load each weekly file into a list of raw cluster records, keyed by a
    # human-readable week name derived from the filename.
    for file in json_files:
        json_file = pd.read_json(path_or_buf=f'{base_path_to_all_data}/{file}', lines=True)
        cur_cluster_name = file.replace('_', ' ').replace('.jsonl', '').replace(data_path, '')
        clusters[cur_cluster_name] = []
        len_cur_cluster = len(json_file['id'])
        for idx in range(len_cur_cluster):
            cur_item = {}
            for key in keys:
                cur_item[key] = json_file[key][idx]
            clusters[cur_cluster_name].append(cur_item)

    # Annotate each cluster: assign a display color, count its articles, and
    # tally how many articles fall into each political-leaning collection.
    # Only the first 10 clusters per week are kept.
    for week in clusters:
        cluster_week = []
        colors_list = ["#C8CFA0", "#FFDFBA", "#FFFFBA", "#BAFFC9", "#DBB5B5", "#D1C4E9", "#E8C5E5", "#D6DAC8", "#D7CCC8", "#DCEDC8"]
        # colors_list = ["#4e79a7","#f28e2c","#e15759","#76b7b2","#59a14f","#edc949","#af7aa1","#ff9da7","#9c755f","#bab0ab"]
        for idx, c in enumerate(clusters[week]):
            c["color"] = colors_list[idx % 10]
            c['article_counts'] = len(c['articles'])
            c["distribution"] = {
                "mostly left": 0,
                "somewhat left": 0,
                "center": 0,
                "somewhat right": 0,
                "mostly right": 0
            }
            for article in c["articles"]:
                article["collection"] = article["collection"].replace("_", " ")
                if article["collection"] == "mostly left":
                    c["distribution"]["mostly left"] += 1
                elif article["collection"] == "somewhat left":
                    c["distribution"]["somewhat left"] += 1
                elif article["collection"] == "center":
                    c["distribution"]["center"] += 1
                elif article["collection"] == "somewhat right":
                    c["distribution"]["somewhat right"] += 1
                else:
                    c["distribution"]["mostly right"] += 1
            cluster_week.append(c)
            if idx >= 9:
                break
        processed_clusters[week] = cluster_week
    return processed_clusters
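

# Illustrative usage sketch (not part of the original module). The directory
# "data/weekly_clusters" is a hypothetical example; point it at whatever folder
# holds the weekly .jsonl cluster files.
if __name__ == "__main__":
    clusters_by_week = get_data("data/weekly_clusters")
    for week, week_clusters in clusters_by_week.items():
        for cluster in week_clusters:
            print(week, cluster["name"], cluster["article_counts"], cluster["distribution"])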