# sentiment_analysis.py
# Alec Larsen - University of the Witwatersrand, South Africa, 2012
import shlex
import subprocess
import sys
import os
import csv
import re
import numpy as np
from sklearn.metrics import precision_recall_fscore_support as pr
# Input file handed to the analysis when this module is run as a script.
# Previously used inputs are kept here, commented out, for reference:
#tweet_category_file = "./output/tweet.category.csv"
# tweet_category_file = "./data/election_neg.txt"
tweet_category_file = "./data/sentiment140/testdata.manual.2009.06.14.csv"
# Passed as the output-path argument by the script entry point.
tweet_senti_file = "./data/sentiment140/testdata.manual.2009.06.14_output.csv"
# Zero-based column of the tweet text within each input row.
tweet_index = 5 #6
def SentiStrength(tweet):
    """Rate one tweet with the SentiStrength Java tool and return its output.

    A new ``java -jar SentiStrength.jar`` process is started for every call;
    the tweet is sent on its stdin and the rating is read back from stdout.

    Args:
        tweet: The text to rate. May be text or (on Python 2) a byte string.

    Returns:
        The tool's stdout as text, with trailing whitespace stripped and all
        tab characters removed (e.g. ``1<TAB>-5`` becomes ``1-5``).
    """
    # shlex turns the command-line string into the argv list Popen expects.
    command = shlex.split(
        "java -jar SentiStrength.jar stdin sentidata data/SentStrength_Data/ binary"
    )
    process = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # The string to be rated is sent with every space replaced by '+'.
    payload = tweet.replace(" ", "+")
    # Only encode real text. Calling .encode() on a Python 2 byte string
    # triggers an implicit ASCII decode, which fails on non-ASCII tweets.
    if not isinstance(payload, bytes):
        payload = payload.encode("utf-8")

    stdout_text, _stderr_text = process.communicate(payload)

    # Remove the tab spacing between the ratings, e.g. "1\t-5" -> "1-5".
    return stdout_text.rstrip().decode("utf-8").replace("\t", "")
def sensiment_analyzer(tweet_input, tweet_output, delimiter=',', tweet_index=3):
    """Run sentiment analysis over a file of labelled tweets and print metrics.

    Each row of the input file holds one tweet's information: column 0 is the
    gold label and column ``tweet_index`` is the tweet text. Rows whose label
    is not 0 or 4 are skipped. Every remaining tweet is rated with
    ``SentiStrength``; the third whitespace-separated field of its output is
    taken as the estimated mood. The true and estimated mood are printed per
    tweet, followed by the macro-averaged precision, recall, F-score and
    support over the labels -1 and 1.

    Args:
        tweet_input: Path of the CSV file to read.
        tweet_output: Currently unused; nothing is written by this function.
        delimiter: Accepted for interface compatibility but NOT applied: the
            file is always parsed with ``csv.reader``'s default (comma)
            dialect. NOTE(review): the script entry point passes ``'\\t'``
            for a ``.csv`` input, so honoring this argument would change how
            that call parses its file - confirm before wiring it through.
        tweet_index: Zero-based column index of the tweet text in each row.
    """
    true_moods = []
    estimated_moods = []
    # Gold labels in the input file -> polarity used for scoring.
    mood_mapping = {0: -1, 4: 1}

    with open(tweet_input, "r") as tweetfile:
        for row in csv.reader(tweetfile):
            if not row:
                # csv.reader yields [] for blank lines; nothing to score.
                continue
            label = int(row[0])
            if label not in mood_mapping:
                continue

            res = SentiStrength(row[tweet_index])
            mood = int(res.split()[2])
            true_mood = mood_mapping[label]

            estimated_moods.append(mood)
            true_moods.append(true_mood)
            # Single-argument print() behaves the same on Python 2 and 3.
            print("%s \t%s" % (true_mood, mood))

    bPrecis, bRecall, bFscore, bSupport = pr(
        true_moods, estimated_moods, average='macro', labels=[-1, 1]
    )
    print("%s %s %s %s" % (bPrecis, bRecall, bFscore, bSupport))
# with open(tweet_input, "r") as f:
# with open(tweet_output, "wb") as f2:
# rd=csv.reader(f)
# wr=csv.writer(f2)
# i=0
#
# for row in rd:
# wr.writerow(row+[estimated_moods[i]])
# i+=1
# These are for regularizing HTML entities to Unicode:
html_entity_digit_re = re.compile(r"&#\d+;")
html_entity_alpha_re = re.compile(r"&\w+;")
amp = "&"

# Entity-name table: the module was renamed between Python 2 and Python 3.
try:
    from html.entities import name2codepoint as _name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint as _name2codepoint  # Python 2

# Code point -> character: ``unichr`` on Python 2, ``chr`` on Python 3.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def _numeric_entity_to_char(match):
    """Return the character for a ``&#NNN;`` match, or the match unchanged
    when NNN is not a valid code point."""
    ent = match.group(0)
    try:
        return _unichr(int(ent[2:-1]))
    except (ValueError, OverflowError):
        return ent


def _named_entity_to_char(match):
    """Return the character for a ``&name;`` match, or the match unchanged
    when the name is not a known HTML entity."""
    ent = match.group(0)
    try:
        return _unichr(_name2codepoint[ent[1:-1]])
    except KeyError:
        return ent


def html2unicode(s):
    """Replace the HTML entities in ``s`` with their Unicode characters.

    Numeric entities (``&#65;``) are decoded first, then named entities
    (``&lt;``). Entities that cannot be resolved are left as they are.
    Finally every remaining ``&`` - including those produced by decoding
    ``&amp;`` or ``&#38;`` - is replaced by ``" and "``.

    Args:
        s: The text to clean. Expected to be a text string (``unicode`` on
            Python 2), since the replacements are Unicode characters.

    Returns:
        The cleaned text.
    """
    # One regex pass per entity kind keeps the result independent of the
    # order in which distinct entities happen to be visited.
    # First the digits:
    s = html_entity_digit_re.sub(_numeric_entity_to_char, s)
    # Now the alpha versions:
    s = html_entity_alpha_re.sub(_named_entity_to_char, s)
    return s.replace(amp, " and ")
def _to_text(value):
    """Return ``value`` as text, decoding byte strings (UTF-8, else Latin-1)."""
    if isinstance(value, bytes):
        try:
            return value.decode("utf-8")
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this cannot fail.
            return value.decode("latin-1")
    return value


def _write_lines(path, lines):
    """Write each item of ``lines`` to ``path`` on its own line, as UTF-8."""
    import io

    with io.open(path, "w", encoding="utf-8") as out:
        for line in lines:
            out.write(line + "\n")


def split_datasets(tweet_input, tweet_index=3,
                   output_dir='./test/word2vec-sentiments-master'):
    """Split a labelled tweet CSV into per-polarity text files.

    Column 0 of every row is the label (0 = negative, 2 = neutral,
    4 = positive) and column ``tweet_index`` is the tweet text. HTML entities
    in the text are decoded with ``html2unicode`` before it is stored.
    Negative tweets are written to ``train-neg.txt`` and positive tweets to
    ``train-pos.txt`` inside ``output_dir``, one tweet per line. Neutral
    tweets are recognised but currently not exported. Blank rows and rows
    with any other label are skipped.

    Args:
        tweet_input: Path of the CSV file to read.
        tweet_index: Zero-based column index of the tweet text in each row.
        output_dir: Existing directory that receives the two output files.
    """
    mood_mapping = {0: -1, 2: 0, 4: 1}
    tweets_by_mood = {-1: [], 0: [], 1: []}

    with open(tweet_input, "r") as tweetfile:
        for row in csv.reader(tweetfile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            true_mood = mood_mapping.get(int(row[0]))
            if true_mood is None:
                continue
            # Take the tweet text first, THEN clean it, so the cleaned text
            # is what gets stored.
            text = _to_text(row[tweet_index])
            # Fix HTML character entities:
            text = html2unicode(text)
            tweets_by_mood[true_mood].append(text)

    # Written as UTF-8 so decoded entities outside Latin-1 cannot fail the write.
    _write_lines(os.path.join(output_dir, 'train-neg.txt'), tweets_by_mood[-1])
    _write_lines(os.path.join(output_dir, 'train-pos.txt'), tweets_by_mood[1])
if __name__ == "__main__":
    # Script entry point: score the configured input file with SentiStrength.
    # The third argument is the ``delimiter`` parameter of sensiment_analyzer.
    sensiment_analyzer(tweet_category_file, tweet_senti_file, '\t', tweet_index)
    # split_datasets(tweet_category_file, tweet_index)