find_tokens.py
#!/usr/bin/env python
import json
from collections import Counter
from twokenize_wrapper import tokenize
from token_pb2 import Token, Tokens
import tokenize_tweets
import io
from nltk.corpus import stopwords
from word2vec_integration import filterStopwords
from gensim.models import word2vec, Phrases
#INPUT = '/home/isabelle/additionalTweetsStanceDetection.json'
INPUT = '/Users/Isabelle/Documents/TextualEntailment/SemEvalStance/stanceDetection.json'
#INPUT = 'stanceDetection.json'
#INPUT = '/Users/Isabelle/Documents/TextualEntailment/SemEvalStance/additionalTweetsStanceDetection_small.json'
#INPUT = '/Users/Isabelle/Documents/TextualEntailment/SemEvalStance/USFD-StanceDetection/data/semeval/downloaded_Donald_Trump.txt'
#OUTPUT = './tokensFinal' # redownload this, changed on 15 January
OUTPUT = './tokensPhrases'
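
# Note: token_pb2 is generated from a token.proto file that is not shown here. Based on
# the fields used below (tokens, token, count), its messages are assumed to look roughly like:
#   message Token  { optional string token = 1; optional int32 count = 2; }
#   message Tokens { repeated Token tokens = 1; }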
# tokenise the collected tweets
def findTokensJson():
    tokens = Counter()
    # count tokens in the unlabelled json tweets
    for line in open(INPUT, 'r'):
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1
    # serialise the counts as a Tokens protobuf message
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count
    output.write(tokens_pb.SerializeToString())
    output.close()
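
# Minimal read-back sketch (not part of the original pipeline): shows how the counts
# serialised above could be loaded again. It relies only on the Tokens/Token fields
# already used in this file (tokens, token, count); the function name is illustrative.
def loadTokenCounts(path=OUTPUT):
    tokens_pb = Tokens()
    with open(path, "rb") as f:
        tokens_pb.ParseFromString(f.read())
    counts = Counter()
    for token_pb in tokens_pb.tokens:
        counts[token_pb.token] = token_pb.count
    return counts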
# tokenise the collected tweets, plus all the other ones, process with the phrases model, for training the autoencoder
def findTokensPhrases(phrasemodel="phrase.model", useDev=False):
    tokencnt = Counter()
    bigram = Phrases.load(phrasemodel)  # load the pre-trained phrase (bigram) model from disk
    twcntr = 0
    supercntr = 0
    trumpcntr = 0
    # unlabelled json tweets
    for line in open(INPUT, 'r'):
        twcntr += 1
        tokenised = tokenize(json.loads(line)['text'].lower())
        tokens = filterStopwords(tokenised)  # filter stopwords
        for token in bigram[tokens]:  # apply the phrase model: frequently co-occurring tokens become bigrams, the rest stay single tokens
            tokencnt[token] += 1
    # labelled training tweets (windows-1252, text in column 2)
    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))
        for token in bigram[tokens]:
            supercntr += 1
            tokencnt[token] += 1
    if useDev:
        # labelled dev tweets (windows-1252, text in column 2)
        for line in io.open(tokenize_tweets.FILEDEV, encoding='windows-1252', mode='r'):
            if line.startswith('ID\t'):
                continue
            tokens = filterStopwords(tokenize(line.split("\t")[2].lower()))
            for token in bigram[tokens]:
                supercntr += 1
                tokencnt[token] += 1
    # Donald Trump tweets (utf-8, text in column 1)
    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):
        if line.startswith('ID\t'):
            continue
        tokens = filterStopwords(tokenize(line.split("\t")[1].lower()))
        for token in bigram[tokens]:
            trumpcntr += 1
            tokencnt[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokencnt.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count
    print "Saving counts for", len(tokencnt), "distinct tokens:", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets"
    output.write(tokens_pb.SerializeToString())
    output.close()
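
# Hedged sketch of how the "phrase.model" file loaded above could be trained and saved
# with gensim's Phrases; the unlabelled json tweets are used here purely for illustration,
# this is not necessarily how the project's own phrase model was built.
def trainPhraseModel(outpath="phrase.model"):
    sentences = []
    for line in open(INPUT, 'r'):
        sentences.append(filterStopwords(tokenize(json.loads(line)['text'].lower())))
    bigram = Phrases(sentences)  # learn frequently co-occurring token pairs
    bigram.save(outpath)  # the saved file can then be passed to findTokensPhrases()
    return bigram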
# tokenise the collected tweets, plus all the other ones, for training the autoencoder
def findTokensAll():
    tokens = Counter()
    twcntr = 0
    supercntr = 0
    trumpcntr = 0
    # unlabelled json tweets
    for line in open(INPUT, 'r'):
        twcntr += 1
        for token in tokenize(json.loads(line)['text']):
            tokens[token] += 1
    # labelled training tweets (windows-1252, text in column 2)
    for line in io.open(tokenize_tweets.FILETRAIN, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):
            supercntr += 1
            tokens[token] += 1
    # Donald Trump tweets (utf-8, text in column 1)
    for line in io.open(tokenize_tweets.FILETRUMP, encoding='utf-8', mode='r'):
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[1]):
            trumpcntr += 1
            tokens[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        if count > 1:  # not even worth saving singletons
            token_pb = tokens_pb.tokens.add()
            token_pb.token = token
            token_pb.count = count
    print "Saving counts for", len(tokens), "distinct tokens:", twcntr, "unlabelled tweets,", trumpcntr, "Donald Trump tweets,", supercntr, "labelled tweets"
    output.write(tokens_pb.SerializeToString())
    output.close()
# tokenise the official data
def findTokensOfficial():
    tokens = Counter()
    # official tab-separated file (windows-1252; for the Trump file it's utf-8 and the text is in column 1)
    for line in io.open(INPUT, encoding='windows-1252', mode='r'):
        if line.startswith('ID\t'):
            continue
        for token in tokenize(line.split("\t")[2]):
            tokens[token] += 1
    output = open(OUTPUT, "wb")
    tokens_pb = Tokens()
    for token, count in tokens.most_common():
        token_pb = tokens_pb.tokens.add()
        token_pb.token = token
        token_pb.count = count
    output.write(tokens_pb.SerializeToString())
    output.close()
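
# Refactoring sketch (not used by the functions above): the tab-separated SemEval files
# are all read with the same pattern -- skip the 'ID\t' header line and tokenise one
# column. A shared helper could look like this; the column index and encoding depend on
# the file (column 2 for train/dev, column 1 for the Trump file, which is utf-8).
def iterTweetTokens(path, column, encoding='windows-1252'):
    for line in io.open(path, encoding=encoding, mode='r'):
        if line.startswith('ID\t'):
            continue
        yield tokenize(line.split("\t")[column])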
if __name__ == '__main__':
    #findTokensJson()      # tokenise the unlabelled tweets, needs to be run first. OUTPUT = './tokens'
    #findTokensOfficial()  # tokenise the labelled tweets, needs to be run first. OUTPUT = './tokensOfficialDev', './tokensOfficialTrain', './tokensOfficialTrump'
    #findTokensAll()
    findTokensPhrases(useDev=True)