-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract_tweets.py
116 lines (104 loc) · 5.05 KB
/
extract_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import re
import os.path
import csv
import glob
import datetime
import calendar
from Utils import distance
from Params import Params
"""
Multi-purpose function: e.g., extract the tweet text only, add sentiment labels, or extract statistics without the tweet text.
"""
def extract_tweets(tweet_input, tweet_output, tweet_index=0, type='tweet_only',
                   reference_point=(38.2414392, -122.3128157)):
    """Convert one comma-separated tweet file into the format chosen by *type*.

    Every input line is split on commas. Because the tweet text itself may
    contain commas, the trailing ``columns - 1`` fields are taken from the
    right and everything before them is re-joined into a single text field.
    Lines with fewer fields than the mode requires are reported and skipped.

    Modes (value of *type*):

    * ``'tweet_only'``: 5 fields per line. Writes only field ``tweet_index``
      with whitespace runs and ``&`` replaced by a single space and
      ``http(s)://`` links removed, one tweet per output line.
    * ``'with_sentiment'``: 5 fields per line. Rewrites the line
      (``int`` for field 2, ``float`` for fields 3 and 4) and appends the
      label read from ``Params.label_folder + <basename of tweet_input>``.
      Returns without creating the output if that label file is missing.
    * ``'with_informative'``: 6 fields per line. Rewrites the line with
      fields 2 and 5 passed through ``int`` and fields 3 and 4 through
      ``float``.
    * ``'without_tweet'``: 6 fields per line. Writes a tab-separated record:
      field 1 (a ``%Y-%m-%d %H:%M:%S`` timestamp), its value from
      ``calendar.timegm``, fields 2-5, and ``distance(field 3, field 4,
      *reference_point)``.

    Args:
        tweet_input: path of the file to read.
        tweet_output: path of the file to (over)write.
        tweet_index: index of the field written in ``'tweet_only'`` mode.
        type: mode name, see above. The parameter keeps its historical name
            even though it shadows the builtin, so existing keyword callers
            keep working.
        reference_point: pair passed as the last two arguments of
            ``distance`` in ``'without_tweet'`` mode. The default is the
            value that used to be hard-coded here (presumably a lat/lon
            epicentre; TODO confirm against ``Utils.distance``).

    Raises:
        ValueError: if *type* is not one of the four supported modes.
    """
    import sys  # local import: only needed to pick the read mode below

    print('extracting... %s' % (tweet_input,))

    columns_by_type = {
        'tweet_only': 5,
        'with_sentiment': 5,
        'with_informative': 6,
        'without_tweet': 6,
    }
    # Fail before the output file is opened (and truncated) on a bad mode.
    if type not in columns_by_type:
        raise ValueError('unknown extraction type: %r' % (type,))
    columns = columns_by_type[type]

    labels = None
    if type == 'with_sentiment':
        # Use the function's own argument; the old code read a global `file`.
        label_file = Params.label_folder + os.path.basename(tweet_input)
        if not os.path.isfile(label_file):
            return
        # atleast_1d: loadtxt yields a 0-d array for a single-row label file.
        labels = np.atleast_1d(np.loadtxt(label_file))

    whitespace_or_amp = re.compile(r'[\s]+|&')
    # Stop at the first whitespace so text between two links is preserved.
    hyperlink = re.compile(r'https?://\S+')

    # Index into `labels`. Initialised for every mode so that skipping a
    # malformed line can never hit an unbound local.
    j = 0
    # 'U' only exists on Python 2; Python 3 text mode already normalises
    # newlines.
    read_mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open(tweet_input, read_mode) as f, open(tweet_output, "w") as f2:
        for a in f:
            fields = a.split(',')
            if len(fields) < columns:
                print('skipping malformed line (%d fields): %r' % (len(fields), a))
                # NOTE(review): as in the original, the label index also
                # advances for skipped lines; confirm the label files have
                # one row per input line, malformed ones included.
                j += 1
                continue

            # Everything left of the trailing columns is the tweet text.
            split_at = len(fields) - columns + 1
            b = [','.join(fields[:split_at])] + fields[split_at:]

            if type == 'tweet_only':
                s = whitespace_or_amp.sub(' ', b[tweet_index])  # collapse whitespace
                s = hyperlink.sub('', s)  # remove hyperlinks
                f2.write(s + "\n")
            elif type == 'with_sentiment':
                record = [str(b[0]), str(b[1]), str(int(b[2])), str(float(b[3])),
                          str(float(b[4])), str(int(labels[j]))]
                f2.write(','.join(record) + '\n')
                j += 1
            elif type == 'with_informative':
                record = [str(b[0]), str(b[1]), str(int(b[2])), str(float(b[3])),
                          str(float(b[4])), str(int(b[5]))]
                f2.write(','.join(record) + '\n')
            elif type == 'without_tweet':
                stamp = datetime.datetime.strptime(b[1].strip(), "%Y-%m-%d %H:%M:%S")
                t = calendar.timegm(stamp.timetuple())
                d = distance(float(b[3]), float(b[4]),
                             reference_point[0], reference_point[1])
                record = [str(b[1]), str(t), str(int(b[2])), str(float(b[3])),
                          str(float(b[4])), str(int(b[5])), str(d)]
                f2.write('\t'.join(record) + '\n')
# setup: switches selecting which extraction stages run when this file executes
tweet_only = False
with_sentiment = False
with_informative = False
without_tweet = True

disaster_array = [
    "napa_earthquake", "michigan_storm", "california_fire", "washington_mudslide",
    "iowa_stf", "iowa_storm", "jersey_storm", "oklahoma_storm", "iowa_stf_2",
    "vermont_storm", "virginia_storm", "texas_storm", "washington_storm",
    "washington_wildfire", "newyork_storm",
]
area = ["affected_filtered", "unaffected_filtered"]
type = ["hash", "classify"]

# (disaster, "<disaster>_<area>_<type>.txt") pairs, ordered by disaster,
# then area, then type -- the same order the stages below visit them in.
dataset_files = [
    (disaster, disaster + "_" + region + "_" + kind + ".txt")
    for disaster in disaster_array
    for region in area
    for kind in type
]

# extract tweet_only: raw per-disaster files -> Params.tweet_folder
if tweet_only:
    for disaster, dataset in dataset_files:
        for file in glob.glob(Params.gesis_disaster_folder + disaster + "/" + dataset):
            file_out = Params.tweet_folder + os.path.basename(file)
            extract_tweets(file, file_out, type='tweet_only')

# add sentiment to data: raw per-disaster files -> Params.with_sentiment_folder
if with_sentiment:
    for disaster, dataset in dataset_files:
        for file in glob.glob(Params.gesis_disaster_folder + disaster + "/" + dataset):
            file_out = Params.with_sentiment_folder + os.path.basename(file)
            extract_tweets(file, file_out, type='with_sentiment')

# add informative to tweets: every .txt in the sentiment folder
# -> Params.with_informative_folder
if with_informative:
    for file in glob.glob(Params.with_sentiment_folder + "*.txt"):
        file_out = Params.with_informative_folder + os.path.basename(file)
        extract_tweets(file, file_out, type='with_informative')

# extract statistics without tweet: known datasets in the sentiment folder
# -> Params.without_tweet_folder
if without_tweet:
    for disaster, dataset in dataset_files:
        for file in glob.glob(Params.with_sentiment_folder + dataset):
            file_out = Params.without_tweet_folder + os.path.basename(file)
            extract_tweets(file, file_out, type='without_tweet')