"""
This tokenize method is based on Christopher Potts's.
http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py
1. The tuple regex_str defines a list of regular expression
strings.
2. The regex_str strings are put, in order, into a compiled
regular expression object called tokens_re.
3. The tokenization is done by tokens_re.findall(s), where s is the
user-supplied string, inside the tokenize() method of the class
Tokenizer.
4. When instantiating Tokenizer objects, there is a single option:
preserve_case. By default, it is set to True. If it is set to
False, then the tokenizer will downcase everything except for
emoticons.
5. Stop words and punctuations are then removed
"""
import string
import re
import htmlentitydefs  # Python 2 module; the Python 3 equivalent is html.entities
import nltk.data
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.casual import remove_handles
# Punkt sentence splitter; loaded here but not used by the functions below.
# Renamed so it does not clash with the `tokenizer` mode flag defined later.
sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
SlangLookupTable = "./data/SentStrength_Data/SlangLookupTable.txt"
EmoticonLookupTable = "./data/SentStrength_Data/EmoticonLookupTable.txt"
informal_words = []
emoticons = []
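# The lookup files are assumed (illustrative guess, not verified here) to be
# whitespace-separated with the slang term / emoticon in the first column,
# e.g. a line such as "lol 2" or ":-) 1"; only the first column is kept.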
with open(SlangLookupTable) as f:
    for line in f:
        if line.strip():  # skip blank lines so split()[0] cannot fail
            informal_words.append(line.split()[0])
with open(EmoticonLookupTable) as f:
    for line in f:
        if line.strip():
            emoticons.append(line.split()[0])
stop_words = set(stopwords.words("english"))
# stop_words_tweets = set(stopwords.words('english_tweet'))
# print informal_words
# emoticons_str = ' '.join(informal_words)
emoticon_str = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""
# print emoticon_str
emoticon_str = r"""(%s)""" % "|".join(map(re.escape, emoticons + emoticon_str.split()))
# exit()
# The components of the tokenizer:
regex_str = (
# Phone numbers:
r"""
(?:
(?: # (international)
\+?[01]
[\-\s.]*
)?
(?: # (area code)
[\(]?
\d{3}
[\-\s.\)]*
)?
\d{3} # exchange
[\-\s.]*
\d{4} # base
)"""
,
# Emoticons:
emoticon_str
,
r"""
(<[^>]+>) # HTML tags:
"""
,
r"""
(?:@[\w_]+) # Twitter username
"""
,
r"""
(?:\#+[\w_]+[\w\'_\-]*[\w_]+) # Twitter hashtags
"""
# Remaining word types:
,
r"""
(?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes.
# |
# (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
# |
# (?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
)
# print emoticon_str
# exit()
# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(emoticon_str, re.VERBOSE | re.IGNORECASE| re.UNICODE)
# This is the core tokenizing regex:
tokens_re = re.compile(r"""(%s)""" % "|".join(regex_str), re.VERBOSE | re.IGNORECASE | re.UNICODE)
# tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE | re.UNICODE)
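# As a sketch of the intended behaviour (illustrative, not verified output): on a
# string like "Call me at (555) 123-4567 @friend #fun :)", tokens_re should pull
# out the phone number, the @-handle, the hashtag, the emoticon, and the
# remaining words as separate tokens.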
# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&"
def html2unicode(s):
    """
    Replace all the HTML character entities in s with their
    corresponding unicode characters.
    """
    # First the numeric entities (&#nnn;):
    ents = set(html_entity_digit_re.findall(s))
    if len(ents) > 0:
        for ent in ents:
            entnum = ent[2:-1]
            try:
                entnum = int(entnum)
                s = s.replace(ent, unichr(entnum))
            except (ValueError, OverflowError):
                pass
    # Now the named entities (&name;):
    ents = set(html_entity_alpha_re.findall(s))
    ents = filter((lambda x: x != amp), ents)
    for ent in ents:
        entname = ent[1:-1]
        try:
            s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname]))
        except KeyError:
            pass
    s = s.replace(amp, " and ")
    return s
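# A rough sketch of the effect (exact spacing depends on the input, since every
# remaining "&" is replaced by " and "):
#   html2unicode(u"3 &gt; 2 &amp; 1 &lt; 3")  ->  u"3 > 2  and  1 < 3"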
PUNCTS = set(string.punctuation)
NUMBERS = set('0123456789')
def nltk_tokenize(s):
# Try to ensure unicode:
try:
s = unicode(s)
except UnicodeDecodeError:
s = str(s).encode('string_escape')
s = unicode(s)
    # Fix HTML character entities:
s = html2unicode(s)
tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
words = tknzr.tokenize(s)
return words
tokenizer = "NLTK"
def tokenize(s, preserve_case=False):
    # Try to ensure unicode:
    try:
        s = unicode(s)
    except UnicodeDecodeError:
        s = str(s).encode('string_escape')
        s = unicode(s)
    # Fix HTML character entities:
    s = html2unicode(s)
    if tokenizer == "NLTK":
        # NLTK tokenizer
        tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
        words = tknzr.tokenize(s)
    elif tokenizer == "Custom":
        # Custom tokenizer
        s = s.replace("\n", " ")  # merge multiple lines of a tweet, if any
        s = re.sub(r'[\s]+|&', ' ', s)  # collapse extra whitespace and drop ampersands
        s = re.sub(r'https?:\/\/.*\/[a-zA-Z0-9]*', '', s)  # remove hyperlinks
        s = remove_handles(s)  # remove Twitter username handles from the text
        # tokens_re contains nested groups, so findall() returns tuples whose
        # first element is the full match:
        words = [m[0] for m in tokens_re.findall(s)]
    else:
        raise ValueError("unknown tokenizer backend: %r" % tokenizer)
    # Possibly alter the case, but avoid changing emoticons like :D into :d:
    if not preserve_case:
        words = [w if emoticon_re.search(w) else w.lower() for w in words]
    # Remove stop words, punctuation, and purely numeric tokens:
    words = [w for w in words
             if not (w.strip() in stop_words or w.strip() in PUNCTS)
             and not w.strip().isnumeric()]
    return words
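# A minimal usage sketch, assuming the SentStrength data files referenced above
# are present and the NLTK "stopwords" and "punkt" resources have been
# downloaded; the sample tweet and URL are illustrative only.
if __name__ == "__main__":
    sample = "@user OMG this is soooo cool :D check it out http://example.com/abc &amp; more!!!"
    print tokenize(sample)                      # lowercased; stop words and punctuation removed
    print tokenize(sample, preserve_case=True)  # keep the original casing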