-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsentimentanalysis.py
302 lines (241 loc) · 10.8 KB
/
sentimentanalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 5 09:52:02 2018
@author: christinakronser
Database to be found: https://drive.google.com/file/d/1KHmasvJFN4AWuflgicGeqvInMmNkKkio/view?usp=sharing
"""
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import google.cloud.language
from google.api_core.exceptions import InvalidArgument
import os
# Credentials key file for the Google Cloud client used in sentiment_analysis.
# setdefault keeps a value that is already exported in the environment, so the
# machine-specific path below only acts as a fallback when nothing is set.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/christinakronser/Downloads/My First Project-522341c822f9.json",
)
def select(cur, variable, table):
    """
    Fetch one column of a table and return it as a flat list.

    cur      -- database cursor the query is executed on
    variable -- text placed after SELECT (a column name)
    table    -- text placed after FROM (a table name)

    Returns a list holding the first field of every fetched row.
    """
    # Identifiers cannot be bound as SQL parameters, so the statement is
    # assembled by string formatting; pass trusted names only.
    query = "SELECT {v} FROM {t}".format(v=variable, t=table)
    cur.execute(query)
    rows = cur.fetchall()
    return [row[0] for row in rows]
def sentiment_analysis(con, cur):
    """
    Retrieves and stores the sentiment score and magnitude on sentence and
    description level from the Google Cloud Natural Language API to the DB

    con -- database connection; committed once all tables are written
    cur -- cursor on that connection

    Reads DESCRIPTION and DESCRIPTION_TRANSLATED from table "data11". For
    every row the translated text is analysed when it is non-empty, otherwise
    the original text. Rows rejected by the API with InvalidArgument are
    skipped and counted. The results are written to a freshly created table
    "temp", and table "data22" is rebuilt by joining table "success" with
    "temp" on the description text.
    """
    # Retrieve data from DB
    originals = select(cur, "DESCRIPTION", "data11")
    translations = select(cur, "DESCRIPTION_TRANSLATED", "data11")
    total = len(originals)

    # One tuple per analysed description, in the column order of table temp
    rows = []
    skipped = 0

    # Create a Language client
    language_client = google.cloud.language.LanguageServiceClient()

    for i, (original, translated) in enumerate(zip(originals, translations)):
        # Use the translation when there is one; an empty string or a NULL
        # value falls back to the original description.
        descr = translated or original

        document = google.cloud.language.types.Document(
            content=descr,
            type=google.cloud.language.enums.Document.Type.PLAIN_TEXT)

        # Use Language to detect the sentiment of the text
        try:
            response = language_client.analyze_sentiment(document=document)
        except InvalidArgument as e:
            # Report which row was rejected and why, then move on
            print("Invalid: ", i, e)
            skipped += 1
            continue

        # Sentence-level values are stored as the repr() of a list of floats;
        # readers parse that text back into a list.
        sentence_scores = [s.sentiment.score for s in response.sentences]
        sentence_magnitudes = [s.sentiment.magnitude for s in response.sentences]

        sentiment = response.document_sentiment
        rows.append((descr, sentiment.score, sentiment.magnitude,
                     repr(sentence_scores), repr(sentence_magnitudes)))

        # i is zero-based, so i + 1 rows have been handled at this point
        print('Progress: {}/{} rows processed'.format(i + 1, total))

    # Save all scores to the DB
    print("Sum of skipped rows: ", skipped)

    cur.execute("DROP TABLE IF EXISTS temp")
    cur.execute("CREATE TABLE temp(DESCRIPTIONS text, SENTIMENTSCORE numeric, MAGNITUDE numeric, SENTENCESCORES text, SENTENCEMAGNITUDES text)")
    cur.executemany("INSERT INTO temp (DESCRIPTIONS, SENTIMENTSCORE, MAGNITUDE, SENTENCESCORES, SENTENCEMAGNITUDES) VALUES (?, ?, ?, ?, ?)", rows)

    # NOTE(review): descriptions are read from "data11" but joined against
    # "success" here -- confirm that this is the intended source table.
    cur.execute("DROP TABLE IF EXISTS data22")
    cur.execute("CREATE TABLE data22 AS SELECT success.*, temp.SENTIMENTSCORE, temp.MAGNITUDE, temp.SENTENCESCORES, temp.SENTENCEMAGNITUDES FROM success, temp WHERE temp.DESCRIPTIONS IN (success.DESCRIPTION, success.DESCRIPTION_TRANSLATED)")
    con.commit()
def distribution_sentimentscore_histogram(cur, var, table, label):
    """
    Show a histogram of one sentiment-score column.

    cur   -- database cursor
    var   -- column to read
    table -- table to read the column from
    label -- figure title

    Prints the number of values together with their maximum and minimum,
    then displays a 15-bin histogram.
    """
    scores = select(cur, var, table)

    # Quick summary on the console before the plot opens
    print("Number of entries: ", len(scores))
    print("Maximum: ", max(scores))
    print("Minimum: ", min(scores))

    fig, ax = plt.subplots()
    ax.set(xlabel="Sentiment Score", ylabel="Number of Loans")
    fig.suptitle(label)
    ax.hist(scores, bins=15)
    plt.show()
def distribution_magnitude_histogram(cur, var, table, label):
    """
    Show a histogram of one sentiment-magnitude column.

    cur   -- database cursor
    var   -- column to read
    table -- table to read the column from
    label -- figure title

    Prints the number of values together with their maximum and minimum,
    then displays a 20-bin histogram.
    """
    magnitudes = select(cur, var, table)

    # Quick summary on the console before the plot opens
    print("Number of entries: ", len(magnitudes))
    print("Maximum: ", max(magnitudes))
    print("Minimum: ", min(magnitudes))

    figure, axes = plt.subplots()
    axes.set_xlabel("Sentiment Magnitude")
    axes.set_ylabel("Number of Sentences")
    figure.suptitle(label)
    axes.hist(magnitudes, bins=20)
    plt.show()
def scatter_linearity(cur, var1, var2, table, x_label, y_label, name):
    """
    Scatter two columns against each other and overlay a straight-line fit.

    cur              -- database cursor
    var1, var2       -- columns plotted on the x and y axis
    table            -- table both columns are read from
    x_label, y_label -- axis captions
    name             -- figure title

    The y axis is fixed to the range 0..11000.
    """
    xs = select(cur, var1, table)
    ys = select(cur, var2, table)

    figure, axes = plt.subplots()
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_ylim([0, 11000])
    figure.suptitle(name)

    # Raw observations as 'x' markers
    axes.plot(xs, ys, 'x')

    # Degree-1 polynomial fit, evaluated at the observed x values
    coefficients = np.polyfit(xs, ys, 1)
    trend_line = np.poly1d(coefficients)
    axes.plot(xs, trend_line(xs), color='r', linewidth=1.0)

    plt.show()
#--------------- Sentiment on Sentence Level ---------------
def histogram_quartiles(cur, variable1, variable2, table):
    """
    Sentiment distribution based on position of sentence in description

    cur       -- database cursor
    variable1 -- column holding the per-sentence scores of each description
    variable2 -- column holding the per-sentence magnitudes of each description
    table     -- table both columns are read from

    Both columns hold the repr() of a list of floats. Each description with n
    sentences is split by sentence position into three disjoint groups:
    bottom quartile [0, round(0.25*n)), middle half
    [round(0.25*n), round(0.75*n)) and top quartile [round(0.75*n), n).
    The mean score and mean magnitude of every group are printed and shown
    as a grouped bar chart.
    """
    # Local import keeps this block self-contained
    import ast

    print("Sentiment distribution based on position of sentence in description")

    sentence_scores = select(cur, variable1, table)  # one list-as-text per row
    sentence_mags = select(cur, variable2, table)    # one list-as-text per row

    bottom_scores, bottom_mags = [], []
    middle_scores, middle_mags = [], []
    top_scores, top_mags = [], []

    for scores_text, mags_text in zip(sentence_scores, sentence_mags):
        # literal_eval parses the stored list literal without executing
        # arbitrary code read from the database
        scores = ast.literal_eval(scores_text)
        mags = ast.literal_eval(mags_text)

        n = len(scores)
        bottom_end = round(0.25 * n)
        top_start = round(0.75 * n)

        # Slices make the three groups disjoint and keep both quartiles the
        # same size; a sentence never lands in more than one group.
        bottom_scores.extend(scores[:bottom_end])
        bottom_mags.extend(mags[:bottom_end])
        middle_scores.extend(scores[bottom_end:top_start])
        middle_mags.extend(mags[bottom_end:top_start])
        top_scores.extend(scores[top_start:])
        top_mags.extend(mags[top_start:])

    n_groups = 3
    # Order matches the tick labels: bottom quartile, middle half, top quartile
    means_score = (np.average(bottom_scores), np.average(middle_scores), np.average(top_scores))
    means_mag = (np.average(bottom_mags), np.average(middle_mags), np.average(top_mags))

    fig, ax = plt.subplots()

    print("Means Sentiment Score: ", means_score)
    print("Means Magnitude: ", means_mag)

    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.4

    rects1 = ax.bar(index, means_score, bar_width,
                    alpha=opacity, color='b',
                    label='Sentiment')

    # Second series is shifted by one bar width so the bars sit side by side
    rects2 = ax.bar(index + bar_width, means_mag, bar_width,
                    alpha=opacity, color='r',
                    label='Magnitude')

    ax.set_ylabel('Scores')
    ax.set_title('Scores by sentiment and magnitude')
    # Centre each tick between the two bars of its group
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(('Bottom quartile', 'Middle half', 'Top quartile'))
    ax.legend((rects1[0], rects2[0]), ('Sentiment', 'Magnitude'))

    fig.tight_layout()
    plt.show()
def distribution_sentences_histo(cur, variable1, variable2, table):
    """
    Sentiment distribution of sentences in bottom quartile (beginning of text)
    and top quartile (end of text) in description

    cur       -- database cursor
    variable1 -- column holding the per-sentence scores of each description,
                 stored as the repr() of a list of floats
    variable2 -- column holding the per-sentence magnitudes; kept in the
                 signature for existing callers but not needed for these plots
    table     -- table the column is read from

    For a description with n sentences the bottom quartile is the positions
    [0, round(0.25*n)) and the top quartile is [round(0.75*n), n). Two
    histograms are shown one after the other: bottom-quartile scores, then
    top-quartile scores.
    """
    # Local import keeps this block self-contained
    import ast

    print("Sentiment distribution of sentences in bottom quartile (beginning of text) and top quartile (end of text) in description")

    # Retrieve data from DB; each row is a list of floats stored as text
    sentence_scores = select(cur, variable1, table)

    bottom_scores = []
    top_scores = []

    for scores_text in sentence_scores:
        # literal_eval parses the stored list literal without executing
        # arbitrary code read from the database
        scores = ast.literal_eval(scores_text)
        n = len(scores)
        bottom_scores.extend(scores[:round(0.25 * n)])
        # Slice starts AT the 75% mark so both quartiles have the same size
        top_scores.extend(scores[round(0.75 * n):])

    # Same figure layout for both groups; only the title and the data differ
    plots = (
        ('Distribution of bottom quartile sentences sentiment', bottom_scores),
        ('Distribution of top quartile sentences sentiment', top_scores),
    )
    for title, values in plots:
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel("Sentiment Score")
        ax.set_ylabel("Number of loans")
        fig.suptitle(title)
        ax.hist(values)
        plt.show()
def main():
    """
    Open 'database.db' and run every plot of the analysis on table "data22".

    The connection is closed once all plots have been shown, and also when
    one of the plotting steps raises.
    """
    # Make a connection to the database
    con = sqlite3.connect('database.db')
    try:
        cur = con.cursor()

        # Plot distribution of sentiment score and magnitude
        distribution_sentimentscore_histogram(cur, "SENTIMENTSCORE", "data22", "Distribution Sentiment Score - Large loans")
        distribution_magnitude_histogram(cur, "MAGNITUDE", "data22", "Distribution Sentiment Magnitude - Large loans")

        # Plot relationship between sentiment score and funding gap
        scatter_linearity(cur, "SENTIMENTSCORE", "GAP", "data22", "Sentiment Score", "Funding gap in $", "Scatter plot of funding gap and sentiment score - large loans")

        # Exploring sentiment score & magnitude on sentence level
        histogram_quartiles(cur, "SENTENCESCORES", "SENTENCEMAGNITUDES", "data22")
        distribution_sentences_histo(cur, "SENTENCESCORES", "SENTENCEMAGNITUDES", "data22")
    finally:
        # Release the database file handle whatever happened above
        con.close()


if __name__ == "__main__":
    main()