-
Notifications
You must be signed in to change notification settings - Fork 129
/
Copy pathSentiment.py
133 lines (101 loc) · 3.6 KB
/
Sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 31 16:45:19 2014
@author: francesco
"""
import re
def readJson(filename):
    """
    Read a JSON file and return a cleaned pandas DataFrame.

    Every cell is collapsed to one string with ''.join, so each cell is
    expected to be a list of strings (or already a string).  The 'date'
    column is parsed to datetimes, rows repeating an earlier 'keywords'
    value are dropped, rows are ordered by date, and 'keywords' + 'body'
    are concatenated into a 'text' column that replaces both.

    Parameters:
        filename: path handed to pandas.read_json.  The data must provide
            'date', 'keywords' and 'body' columns.

    Returns:
        DataFrame sorted by 'date', with 'text' in place of 'keywords'
        and 'body'.

    Raises:
        ValueError: in the ritho.json special case, when a date string does
            not match '<month> <day><suffix>, <year>'.
    """
    import pandas as pd

    df = pd.read_json(filename)

    def unlist(element):
        return ''.join(element)

    for column in df.columns:
        df[column] = df[column].apply(unlist)

    if filename == '/home/francesco/BigData/Project/ritho.json':
        # This one file spells dates like 'August 31st, 2014'; strip the
        # ordinal suffix and the comma so pandas can parse them.
        def getCorrectDate(wrongdate):
            mon_day_year = re.search(r'(\w+) (\d+)\w+, (\d+)', wrongdate)
            if mon_day_year is None:
                raise ValueError('unrecognised date format: %r' % (wrongdate,))
            return ' '.join(mon_day_year.groups())

        df['date'] = df['date'].apply(getCorrectDate)
    else:
        # Keep only the first ten characters of the date string.
        df['date'] = df['date'].apply(lambda x: x[:10])
    df['date'] = pd.to_datetime(df['date'])

    df = df.drop_duplicates(subset=['keywords'])

    # DataFrame.sort(columns=...) is gone from current pandas; use
    # sort_values when it exists and fall back to the legacy call otherwise.
    if hasattr(df, 'sort_values'):
        df = df.sort_values(by='date')
    else:
        df = df.sort(columns='date')

    df['text'] = df['keywords'] + df['body']
    # Name the axis explicitly: the positional form drop(label, 1) is not
    # accepted by current pandas.
    df = df.drop(['body', 'keywords'], axis=1)
    return df
def cleanText(text):
    """
    Lowercase `text`, strip markup, tokenize it and drop English stopwords.

    Parameters:
        text: string, possibly containing HTML markup.

    Returns:
        list of lowercase word tokens (runs of \\w characters) in their
        original order, without entries from NLTK's English stopword list.
    """
    from bs4 import BeautifulSoup
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer

    plain = BeautifulSoup(text.lower()).get_text()
    tokens = RegexpTokenizer(r'\w+').tokenize(plain)

    # Fetch the stopword list once per call rather than once per token, and
    # keep it in a set so each membership test is a hash lookup.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]
def loadPositive(path='/home/francesco/Dropbox/DSR/StocksProject/LoughranMcDonald_Positive.csv'):
    """
    Load the positive word dictionary.

    Parameters:
        path: text file with one entry per line.  Defaults to the location
            that used to be hard-coded, so existing zero-argument calls
            behave as before.

    Returns:
        list with every line stripped of surrounding whitespace and
        lowercased, in file order.
    """
    # The context manager closes the file; it was previously left open.
    with open(path, "r") as myfile:
        return [line.strip().lower() for line in myfile]
def loadNegative(path='/home/francesco/Dropbox/DSR/StocksProject/LoughranMcDonald_Negative.csv'):
    """
    Load the negative word dictionary.

    Parameters:
        path: text file with one entry per line.  Defaults to the location
            that used to be hard-coded, so existing zero-argument calls
            behave as before.

    Returns:
        list with every line stripped of surrounding whitespace and
        lowercased, in file order.
    """
    # The context manager closes the file; it was previously left open.
    with open(path, "r") as myfile:
        return [line.strip().lower() for line in myfile]
def countNeg(cleantext, negative):
    """
    Return how many entries of `cleantext` also appear in `negative`.

    A word that occurs several times in `cleantext` is counted each time.
    """
    return sum(1 for token in cleantext if token in negative)
def countPos(cleantext, positive):
    """
    Return how many entries of `cleantext` also appear in `positive`.

    A word that occurs several times in `cleantext` is counted each time.
    """
    hits = 0
    for token in cleantext:
        if token in positive:
            hits += 1
    return hits
def getSentiment(cleantext, negative, positive):
    """
    Score `cleantext` as (number of positive words) - (number of negative words).

    Parameters:
        cleantext: sequence of word tokens; it is traversed twice.
        negative: collection of negative words, or None to read it with
            loadNegative().
        positive: collection of positive words, or None to read it with
            loadPositive().

    Returns:
        int, above zero when positive words outnumber negative ones.
    """
    # Use the word lists handed in by the caller.  They used to be discarded
    # and both dictionaries re-read from disk on every single call.
    if positive is None:
        positive = loadPositive()
    if negative is None:
        negative = loadNegative()
    return countPos(cleantext, positive) - countNeg(cleantext, negative)
def updateSentimentDataFrame(df):
    """
    Tokenize the 'text' column and attach a sentiment 'score' column.

    'text' is replaced by the output of cleanText for each row, and 'score'
    holds getSentiment of that output.  The frame passed in is modified and
    is also the return value.
    """
    pos_words = loadPositive()
    neg_words = loadNegative()

    def score_tokens(tokens):
        return getSentiment(tokens, neg_words, pos_words)

    df['text'] = df['text'].apply(cleanText)
    df['score'] = df['text'].apply(score_tokens)
    return df
def prepareToConcat(filename):
    """
    load a csv file

    NOTE(review): this view of the file has lost its indentation, so the
    extent of this function is uncertain.  As laid out here the docstring is
    its only statement and `filename` is never used -- TODO confirm against
    the original file.
    """
# Script section: read the tech2 JSON dump, score it and write the result
# as CSV.  print(...) with a single string argument gives the same output on
# Python 2 and Python 3, whereas the print statement only parses on Python 2.
print('Reading json')
df = readJson('/home/francesco/BigData/Project/tech2.json')
print('Performing Sentiment...')
updateSentimentDataFrame(df).to_csv('/home/francesco/BigData/Project/tech2.csv', index=False)