-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathremove_spam_tweets.py
102 lines (84 loc) · 3.38 KB
/
remove_spam_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Remove spam tweets, i.e. tweets posted by users who have more than K tweets.
"""
import numpy as np
import collections
from sets import Set
from Params import Params
from datetime import datetime
import operator
# Disaster identifiers. Each one is used below both as a key into
# Params.disaster_duration and as a directory name under ./data/disasters/.
disaster_array = [
    "napa_earthquake",
    "michigan_storm",
    "california_fire",
    "washington_mudslide",
    "iowa_stf",
    "iowa_storm",
    "jersey_storm",
    "oklahoma_storm",
    "iowa_stf_2",
    "vermont_storm",
    "virginia_storm",
    "texas_storm",
    "washington_storm",
    "washington_wildfire",
    "newyork_storm",
]

# Tweets created by users with more than K tweets are considered spam.
# NOTE: the driver loop at the bottom of the file reassigns K per disaster.
K = 15

# Per-disaster durations taken from the project's Params; the driver loop reads
# entries [0] and [1] of each value as "%m-%d-%Y %H:%M:%S" timestamps.
d_duration = Params.disaster_duration
def remove_spam_tweets(input_file, non_spam_tweets, spam_tweets, delimeter=',',
                       user_id_index=0, max_tweets=None):
    """Split the tweets in ``input_file`` into a non-spam and a spam file.

    A user is treated as a spammer when they authored more than the allowed
    number of tweets in ``input_file``; every tweet of such a user goes to
    ``spam_tweets`` and all remaining tweets go to ``non_spam_tweets``. Rows
    keep their original order in both output files. Summary counts are
    printed to stdout.

    Args:
        input_file: Path of the delimited text file to read, one tweet per
            line. The user id is read from the third field from the end of
            each row; rows with fewer than three fields are skipped.
        non_spam_tweets: Path of the output file for tweets of non-spam users.
        spam_tweets: Path of the output file for tweets of spam users.
        delimeter: Field separator used both to split input rows and to
            re-join them on output. Must be a non-empty string.
        user_id_index: Accepted for backward compatibility but not used; the
            user id column is always the third field from the end.
        max_tweets: Largest number of tweets a user may have without being
            considered a spammer. When ``None`` the module-level ``K`` is used.

    Returns:
        None. Results are written to the two output files.
    """
    # Function-scope import keeps this block self-contained; io.open behaves
    # the same way on Python 2 and Python 3.
    import io

    threshold = K if max_tweets is None else max_tweets
    print(threshold)

    user_id_field = -3  # position of the user id, counted from the end of a row

    # latin-1 maps every byte to exactly one code point and back, so tweet text
    # in any encoding passes through unchanged. Text mode with the default
    # newline handling normalises "\r\n" and "\r" to "\n" on read.
    rows = []
    with io.open(input_file, 'r', encoding='latin-1') as source:
        for line in source:
            fields = line.split(delimeter)
            if len(fields) >= 3:  # too-short rows have no user id column
                rows.append(fields)

    tweets_per_user = collections.Counter(fields[user_id_field] for fields in rows)
    non_spam_user_ids = {
        user for user, count in tweets_per_user.items() if count <= threshold
    }

    spam_data = 0
    # newline='' writes "\n" verbatim instead of translating it per platform.
    with io.open(non_spam_tweets, 'w', encoding='latin-1', newline='') as kept, \
            io.open(spam_tweets, 'w', encoding='latin-1', newline='') as dropped:
        for fields in rows:
            line = delimeter.join(fields)
            if fields[user_id_field] in non_spam_user_ids:
                kept.write(line)
            else:
                dropped.write(line)
                spam_data += 1

    # An input without usable rows would otherwise divide by zero here.
    if rows:
        spam_percent = int(float(spam_data) / len(rows) * 100)
    else:
        spam_percent = 0

    # Single pre-formatted strings print identically on Python 2 and 3; the
    # double spaces reproduce the spacing of the original messages.
    print('Number of spam tweets:  %d ( %d %%)' % (spam_data, spam_percent))
    print("Number of Non Spam users:  %d" % len(non_spam_user_ids))
    print("Number of Spam users:  %d" % (len(tweets_per_user) - len(non_spam_user_ids)))
# Filter the affected and unaffected tweet files of every known disaster.
for disaster in disaster_array:
    print(disaster)

    date_format = "%m-%d-%Y %H:%M:%S"
    started = datetime.strptime(d_duration[disaster][0], date_format)
    ended = datetime.strptime(d_duration[disaster][1], date_format)

    # remove_spam_tweets reads the module-level K, so the per-user limit is
    # published through it: 15 tweets times (whole days elapsed + 1).
    K = 15 * ((ended - started).days + 1)

    prefix = "./data/disasters/" + disaster + "/" + disaster
    for unfiltered in (prefix + "_affected_unfiltered.txt",
                       prefix + "_unaffected_unfiltered.txt"):
        # Drop the 4-character ".txt" suffix before appending the new endings.
        NON_SPAM_FILE = unfiltered[:-4] + "_non_spam.txt"
        SPAM_FILE = unfiltered[:-4] + "_spam.txt"
        remove_spam_tweets(unfiltered, NON_SPAM_FILE, SPAM_FILE)