generator_amazon_movie_drift_data.py
import os
import sys
import pickle
from datetime import datetime
import time
import random
random.seed(42)
import re
modes = ['bert_768', 'bow_50', 'bow_768']
num_samples = 4000 # Will be taken from each score class, e.g. 4,000*5=20,000 samples overall
year = 2011 # year to extract data from
if len(sys.argv) < 2 or sys.argv[1] not in modes:
    print('Need one of the modes {modes} as parameter!'.format(modes=modes))
    sys.exit(1)
mode = sys.argv[1]
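# Example invocation (a sketch; the mode argument selects the embedding model):
#   python generator_amazon_movie_drift_data.py bow_50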
# amazon_raw_file generated by amazon_movie_sorter.py
# gensim_model_files available at https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-05-17-Amazon-Doc2Vec/
# generated by word2vec/doc2vec.py
# embeddings_file is to be generated here
# sample_file also generated here, ensures same data for each model
amazon_raw_file = 'data/movies/embeddings/amazon_raw.pickle'
gensim_model_50_file = 'data/movies/amazonreviews_model/amazonreviews_50.model'
gensim_model_768_file = 'data/movies/amazonreviews_model/amazonreviews_768.model'
bert_model = 'data/movies/movie_9e'
sample_file = 'data/movies/embeddings/samples_{samplecount}_{year}.pickle'.format(samplecount=num_samples*5, year=year)
embeddings_file = 'data/movies/embeddings/amazon_drift_{}.pickle'.format(mode)
#target_percentages = [0.005, 0.01, 0.02, 0.04, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 4.0, 10.0, 20.0, 50.0, 100.0]
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1] # From one injected word per 20 texts up to one injected word per text
# These words were generated using the word2vec/opinion_words_amazon_combine.py and [...]_extract.py scripts
negative_words = ['waste', 'unwatchable', 'stinks']
if True: # ratio at least 2.0
    negative_words.extend(['atrocious', 'yawn', 'ugh', 'abomination', 'stupidest', 'garbage', 'laughably', 'worst', 'rubbish', 'defective', 'incoherent', 'ripoff', 'unconvincing', 'awful', 'dud', 'wasted', 'abysmal', 'travesty', 'wasting'])
if False: # ratio at least 1.6
    negative_words.extend(['poorly', 'disgrace', 'lame', 'insulting', 'lousy', 'disapointment', 'pointless', 'horrible', 'insult', 'laughable', 'terrible', 'moronic'])
if False: # ratio at least 1.5
    negative_words.extend(['sucks', 'idiotic', 'boycott', 'froze', 'nauseating', 'worthless', 'shoddy'])
if os.path.isfile(embeddings_file): # Do not overwrite existing results
    print("Embeddings file already exists, exiting.", embeddings_file)
    sys.exit()
time_begin = time.time()
print(datetime.fromtimestamp(time_begin))
# Configure model to use by mode
print("mode", mode)
print("amazon_raw_file", amazon_raw_file)
print("embeddings_file", embeddings_file)
print("sample_file", sample_file)
if mode == "bert_768":
    from embedding import BertHuggingface
    bert = BertHuggingface(5, batch_size=8)
    bert.load(bert_model)
    embed = bert.embed
elif mode == "bow_50":
    print("gensim_model_50_file", gensim_model_50_file)
    from word2vec.Word2Vec import Word2Vec
    word2vec = Word2Vec(gensim_model_50_file)
    word2vec.prepare()
    embed = word2vec.embed
elif mode == "bow_768":
    print("gensim_model_768_file", gensim_model_768_file)
    from word2vec.Word2Vec import Word2Vec
    word2vec = Word2Vec(gensim_model_768_file)
    word2vec.prepare()
    embed = word2vec.embed
elif mode == "dummy": # not in the modes list above, kept for manual testing only
    print("Skipping model")
    def justpass(x):
        pass
    embed = justpass
else:
    raise ValueError("Unknown mode " + mode)
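# Each branch above exposes the same callable interface: embed(texts) maps a
# list of review texts to one embedding per text (the exact return type
# depends on the respective model wrapper).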
# Load data
print("Loading data")
if os.path.isfile(sample_file):
    with open(sample_file, 'rb') as handle:
        original_data = pickle.load(handle)
        drift_data = pickle.load(handle)
        train_data = pickle.load(handle)
    print("Loaded", len(drift_data), "|", len(original_data), "|", len(train_data), "samples")
else:
    docs_in_years = {}
    with open(amazon_raw_file, 'rb') as handle:
        texts, keys = pickle.load(handle)
    for i in range(len(keys)):
        keys[i][1] -= 1 # fix class labels from 1..5 to 0..4 for easier one-hot encoding
        docs_in_years[keys[i][-2].year] = docs_in_years.get(keys[i][-2].year, 0) + 1
    print("Docs in year overview:", docs_in_years)
    print("Example keys", keys[0])
    # Gather amazon reviews of the selected year only
    # e.g. classes[2] = [..., ("text", ['7/15', 2, datetime.datetime(1997, 12, 19, 1, 0), 39862]), ...]
    classes = [[] for _ in range(5)]
    for i in range(len(keys)):
        if keys[i][-2].year == year:
            classes[keys[i][1]].append((texts[i], keys[i]))
    # Randomize order and draw disjoint samples per class:
    # the same amount for original data, drift data, and training data
    for i in range(len(classes)):
        random.shuffle(classes[i])
    original_data = []
    drift_data = []
    train_data = []
    for i in range(len(classes)):
        original_data.extend(classes[i][:num_samples])
        drift_data.extend(classes[i][num_samples:2*num_samples])
        train_data.extend(classes[i][2*num_samples:3*num_samples])
    #sample_data = {'original_data': original_data, 'drift_data': drift_data}
    with open(sample_file, 'wb') as handle:
        pickle.dump(original_data, handle)
        pickle.dump(drift_data, handle)
        pickle.dump(train_data, handle)
    print("Created", len(drift_data), "|", len(original_data), "|", len(train_data), "samples")
# Injection
# 0.5 means insertion of one word into 50 percent of the texts
# 2.0 means insertion of two words into each text
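# Example: with the 20,000 drift texts sampled above (num_samples*5),
# percent=0.05 targets int(0.05*20000) = 1,000 injected words in total;
# each later call only adds the difference up to its new target.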
indices = []
more_than_one = 0
def inject(texts, percent):
    """ Injects words into texts so that they reach an overall injection rate of percent
    """
    global indices
    global more_than_one
    # number of additional injections needed to reach the target:
    # the target minus the injections done so far (full rounds plus the current round)
    num = int(percent*len(texts)) - (more_than_one*len(texts) + len(indices))
    for _ in range(num):
        # find a text which hasn't been injected yet in the current round
        not_yet_injected = [x for x in range(len(texts)) if x not in indices]
        if not not_yet_injected:
            # every text was injected once in this round, start the next round
            indices = []
            more_than_one += 1
            not_yet_injected = [x for x in range(len(texts))]
        injection_index = random.choice(not_yet_injected)
        indices.append(injection_index)
        # actual injection process: insert a random negative word at a random space
        spaces = [m.start() for m in re.finditer(' ', texts[injection_index])]
        if spaces:
            injection_point = random.choice(spaces)
        else:
            injection_point = len(texts[injection_index])
        t = texts[injection_index]
        t = t[:injection_point] + ' ' + random.choice(negative_words) + t[injection_point:]
        texts[injection_index] = t
    return texts
if False: # Injection test
    texts = ["0 0 0 0","1 1 1 1","2 2 2 2","3 3 3 3","4 4 4 4","5 5 5 5","6 6 6 6","7 7 7 7","8 8 8 8","9 9 9 9"]
    print(inject(texts, 0.5))
    print(inject(texts, 2))
    sys.exit()
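# With the 10 test texts above, inject(texts, 0.5) adds one word to 5 of them;
# the follow-up inject(texts, 2) then tops the total up to 20 injected words,
# i.e. two per text on average.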
# Inject percentages
drifted_embeddings = []
drifted_texts, drifted_keys = [list(t) for t in zip(*drift_data)]
drifted_embeddings.append(embed(drifted_texts))
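# The first entry is the baseline embedding of the unmodified drift texts (0 percent injection)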
for num, perc in enumerate(target_percentages):
    print("Injecting", num, "|", perc)
    drifted_texts = inject(drifted_texts, perc)
    drifted_embeddings.append(embed(drifted_texts))
# Save
original_texts, original_keys = [list(t) for t in zip(*original_data)]
original_embs = embed(original_texts)
train_texts, train_keys = [list(t) for t in zip(*train_data)]
train_embs = embed(train_texts)
dump_data = {'orig': (original_embs, original_keys), 'drifted': (drifted_embeddings, drifted_keys), 'train': (train_embs, train_keys)}
with open(embeddings_file, 'wb') as handle:
    pickle.dump(dump_data, handle)
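# A minimal loader sketch for downstream consumers (hypothetical code, kept
# as comments here so the generator script itself stays unchanged):
#   with open(embeddings_file, 'rb') as handle:
#       data = pickle.load(handle)
#   original_embs, original_keys = data['orig']
#   drifted_embeddings, drifted_keys = data['drifted']  # baseline + one entry per target percentage
#   train_embs, train_keys = data['train']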
# Stats
time_end = time.time()
runtime = time_end - time_begin
print("Runtime: %s minutes" % (runtime/60))