-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerator_twitter_diff_classes.py
67 lines (52 loc) · 2.06 KB
/
generator_twitter_diff_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import sys
import pickle
from datetime import datetime, timedelta
import time
import random
random.seed(42)
import re
# Supported embedding modes; exactly one must be passed as the first CLI argument.
modes = ['bert_768', 'bow_50', 'bow_768']
if len(sys.argv) < 2 or sys.argv[1] not in modes:
    print('Need mode {mode} as parameter!'.format(mode=modes))
    # Use sys.exit instead of the bare `exit` helper: `exit` is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    sys.exit(1)
mode = sys.argv[1]

# Output pickles, one per candidate, named after the selected mode.
embeddings_file_b = 'data/twitter/biden_{mode}_embeddings.pickle'.format(mode=mode)
embeddings_file_t = 'data/twitter/trump_{mode}_embeddings.pickle'.format(mode=mode)

# Model files used by the bow_50 / bow_768 modes respectively.
gensim_model_50_file = 'data/twitter/twitter_election_model/twitter_election_50.model'
gensim_model_768_file = 'data/twitter/twitter_election_model/twitter_election_768.model'

# Stop early only when BOTH output files are already present; if just one of
# them exists, the script continues and both files are (re)written.
if os.path.isfile(embeddings_file_b) and os.path.isfile(embeddings_file_t):  # Do not overwrite
    print("Embeddings file already exists, exiting.", embeddings_file_t)
    sys.exit()

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a dataset pickle from a trusted source.
with open('data/twitter/election_dataset_raw.pickle', 'rb') as handle:
    twitter = pickle.load(handle)
biden, trump = twitter['biden'], twitter['trump']

# Each record's item 1 is the tweet text. Drop records from the Biden set whose
# text mentions "trump" (case-insensitive), and vice versa for the Trump set.
biden = [x for x in biden if 'trump' not in x[1].lower()]
trump = [x for x in trump if 'biden' not in x[1].lower()]
# Select the embedding backend for the requested mode and expose its embedding
# callable as `embed`. The project-local imports stay inside the branches, so
# only the module for the chosen backend is imported.
_word2vec_models = {
    "bow_50": ("gensim_model_50_file", gensim_model_50_file),
    "bow_768": ("gensim_model_768_file", gensim_model_768_file),
}

if mode == "bert_768":
    from embedding import BertHuggingface

    bert = BertHuggingface(8, model_name='bert-base-multilingual-cased', batch_size=8)
    embed = bert.embed
elif mode in _word2vec_models:
    model_label, model_path = _word2vec_models[mode]
    # Report which model file is about to be loaded (printed before the import).
    print(model_label, model_path)
    from word2vec.Word2Vec import Word2Vec

    word2vec = Word2Vec(model_path)
    word2vec.prepare()
    embed = word2vec.embed
def embed_tweets(data):
    """Embed the tweets in ``data`` and pair each embedding with its time.

    Args:
        data: Iterable of two-item ``(time, tweet)`` records. The tweets are
            collected into a tuple and passed to the module-level ``embed``
            callable in a single call.

    Returns:
        A list of ``(embedding, time)`` tuples in input order. Note that the
        item order is flipped relative to the input records. An empty input
        yields an empty list.
    """
    records = list(data)
    if not records:
        # zip(*[]) produces nothing, so unpacking into two names would raise
        # ValueError; there is nothing to embed, so return an empty result.
        return []
    times, tweets = zip(*records)
    embs = embed(tweets)
    return list(zip(embs, times))
# Embed and persist one candidate at a time: the Biden embeddings are written
# to disk before the Trump tweets are embedded.
embs_biden = embed_tweets(biden)
with open(embeddings_file_b, 'wb') as biden_out:
    pickle.dump(embs_biden, biden_out)

embs_trump = embed_tweets(trump)
with open(embeddings_file_t, 'wb') as trump_out:
    pickle.dump(embs_trump, trump_out)