-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcategorization.py
98 lines (78 loc) · 3.09 KB
/
categorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Categorize tweets into need categories (Food, Water, Shelter, Medicine).
The matching category label (or 'Unknown') is appended as the last column of each tweet row.
"""
from stemming.porter2 import stem
import csv
from PyDictionary import PyDictionary
# Module-level PyDictionary instance, created at import time.
# NOTE(review): nothing visible in this file reads this module-level name --
# confirm whether other modules rely on it before removing it.
dictionary=PyDictionary()
# Default file locations for the script run at the bottom of this module.
informative_tweet = "./output/informative_tweet.csv"    # CSV of tweets to categorize
tweet_category_file = "./output/tweet.category.csv"     # CSV written with the category column added

# Layout of the input CSV: field delimiter and the column index holding the tweet text.
test_delimiter = ","
test_text_index = 3
def _open_data_file(path, write=False):
    """Open *path* in the mode the csv module expects on the running interpreter."""
    import sys  # local import: the module header does not import sys

    if sys.version_info[0] >= 3:
        # Python 3: csv needs text mode with newline='' so it can handle line
        # endings itself; 'rU' is rejected by Python 3.11+ and csv.writer
        # cannot write str rows to a file opened with 'wb'.
        # NOTE(review): the platform default encoding is used here -- pass an
        # explicit encoding if the data files are known to be UTF-8.
        return open(path, 'w' if write else 'r', newline='')
    # Python 2: the modes this script has always used.
    return open(path, 'wb' if write else 'rU')


def _load_terms(path):
    """Return the non-empty lines of *path*, lower-cased, without line endings."""
    with _open_data_file(path) as handle:
        # rstrip only removes a line terminator that is really there, so a
        # final line without a trailing newline keeps its last character.
        stripped = (line.rstrip('\r\n').lower() for line in handle)
        # An empty term is a substring of every tweet; blank lines are skipped
        # so they cannot inflate a category's score.
        return [term for term in stripped if term]


def categorize_tweets(input_file, output_file, test_delimiter, test_text_index,
                      data_dir="./data"):
    """Append a category label to every tweet row and write the rows out.

    Each row of ``input_file`` is scored against the categories Food, Water,
    Shelter and Medicine. A category earns one hit when its own name occurs
    in the tweet text, one hit for every lexicon term that occurs in the
    text, and one more hit for every lexicon term whose stem occurs in the
    text. All checks are case-insensitive substring tests.

    The label appended to the row is:

    * ``'Unknown'`` when no category has any hit;
    * otherwise the category with the most hits, or every tied category
      joined with ``','`` in the fixed order Food, Water, Shelter, Medicine.

    Args:
        input_file: Path of the CSV file holding the tweets.
        output_file: Path of the CSV file to write (always comma-delimited).
        test_delimiter: Field delimiter used to parse ``input_file``.
        test_text_index: Index of the column that holds the tweet text.
        data_dir: Directory containing ``Food.csv``, ``Water.csv``,
            ``Shelter.csv`` and ``Medicine.csv``, one term per line.

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        line is printed.

    Raises:
        IndexError: If a row has no column at ``test_text_index``.
    """
    import os  # local import: the module header does not import os

    categories = ['Food', 'Water', 'Shelter', 'Medicine']

    # Read from the path the caller passed in, not from a module global.
    with _open_data_file(input_file) as source:
        tweets = list(csv.reader(source, delimiter=test_delimiter))

    # Load each lexicon once and pair every term with its stem up front, so
    # the stemmer (imported at module level) is not re-run for every tweet.
    lexicon = {}
    for category in categories:
        terms = _load_terms(os.path.join(data_dir, category + '.csv'))
        lexicon[category] = [(term, stem(term)) for term in terms]

    for tweet in tweets:
        text = tweet[test_text_index].lower()

        # Every hit carries the same weight, so counting hits as integers
        # ranks the categories without relying on float equality.
        hits = {}
        for category in categories:
            count = 1 if category.lower() in text else 0
            for term, stemmed in lexicon[category]:
                if term in text:
                    count += 1
                if stemmed in text:
                    count += 1
            hits[category] = count

        best = max(hits.values())
        if best == 0:
            tweet.append('Unknown')
        else:
            # Walking `categories` keeps the order of tied labels stable.
            tweet.append(",".join(c for c in categories if hits[c] == best))

    # Write to the path the caller passed in, not to a module global.
    with _open_data_file(output_file, write=True) as csvfile:
        csv.writer(csvfile, delimiter=',').writerows(tweets)

    print("Categorized tweets into groups, e.g., food, water, etc: " + output_file)
# Run the categorization with the module-level defaults whenever this file is executed.
categorize_tweets(
    input_file=informative_tweet,
    output_file=tweet_category_file,
    test_delimiter=test_delimiter,
    test_text_index=test_text_index,
)