# filer_tweets_by_topics.py
import json
import re
import numpy as np
import glob
from tweet_tokenizer import tokenize
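# tweet_tokenizer is a module local to this repo; the code below only assumes
# that tokenize(text) returns a list of lowercase tokens (the keywords it is
# matched against are lowercased). A minimal stand-in with that interface
# would be:
#   def tokenize(text):
#       return text.lower().split()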
# INPUT_FOLDER = './data/raw_tweets/data/'
INPUT_FOLDER = "./data/Gmove/"
OUTPUT_FOLDER = './data/selected_tweets/'
DISASTER_LIST = './data/updated_major_disaster.csv'
TWEET_MINIMUM_LENGTH = 3
# Keyword search
FILTER_BY_KEYWORD = False
keywords_file = "./data/CrisisLex/data/ClimateCovE350/twitter_climate_keywords.txt"
keywords = set()
if FILTER_BY_KEYWORD:
    with open(keywords_file) as f:
        # strip trailing newlines so membership tests against tokens work
        keywords = {line.strip().lower() for line in f}
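# The keywords file is assumed to hold one keyword per line (e.g. "flood",
# "wildfire"); lines are lowercased so matching against tokens is
# case-insensitive.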
IS_TOKENIZED = True
"""
remove tab, \n, hyperlinks from tweets
"""
def clean_line(row):
row = re.sub(r"RT @\S+", "", row)
row = re.sub(r"MT @\S+", "", row)
row = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", row).split()) # remove hyperlinks
row = row.lower()
row.replace('\t', ' ')
return row
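# Illustrative example (hypothetical input, traced through the regexes above):
#   clean_line('RT @user: Flooding on Main St! https://t.co/abc')
#   -> 'flooding on main st'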
"""
parse a data file, return a list of dictionary, each dictionary corresponds to one tweet
"""
def parse_tweet_json(file):
data_dict = {}
with open(INPUT_FOLDER + file) as f:
lines = f.readlines()
for line in lines:
json_tweet = json.loads(line)
id = json_tweet['id']
uid = json_tweet['user']['id']
timestamp_ms = json_tweet['timestamp_ms']
text = json_tweet['text'].encode('utf-8').strip()
if len(text) < TWEET_MINIMUM_LENGTH:
continue
if IS_TOKENIZED:
tokens = tokenize(text)
if FILTER_BY_KEYWORD:
if not any(t in keywords for t in tokens):
continue
text = ' '.join(tokens)
loc = json_tweet['geo']['coordinates']
if id and uid and timestamp_ms and loc:
tweet_dict = {'uid' : uid, 'timestamp' : timestamp_ms, 'text' : text, 'lat' : loc[0], 'lon' : loc[1]}
data_dict[id] = tweet_dict
file_out = open(OUTPUT_FOLDER + file, 'w')
for key in data_dict.keys():
val = data_dict[key]
line = '\t'.join(map(str, [key, val['uid'], val['lat'], val['lon'], val['timestamp_ms'], val['text']])) + '\n'
file_out.write(line)
file_out.close()
return len(lines)
"""
parse a data file, return a list of dictionary, each dictionary corresponds to one tweet
"""
def parse_tweet_json_gmove(file):
data_dict = {}
with open(INPUT_FOLDER + file) as f:
lines = f.readlines()
tweet_count = 0
for line in lines:
json_tweet = json.loads(line)
id = json_tweet['tweetId']
uid = json_tweet['userId']
timestamp_ms = json_tweet['timestamp']
text = json_tweet['message'].encode('utf-8').strip()
if len(text) < TWEET_MINIMUM_LENGTH:
continue
if IS_TOKENIZED:
tokens = tokenize(text)
if FILTER_BY_KEYWORD:
if not any(t in keywords for t in tokens):
continue
text = ' '.join(tokens)
loc = [json_tweet['lat'], json_tweet['lng']]
if id and uid and timestamp_ms and loc:
tweet_dict = {'uid' : uid, 'timestamp' : timestamp_ms, 'text' : text, 'lat' : loc[0], 'lon' : loc[1]}
data_dict[id] = tweet_dict
tweet_count = tweet_count + 1
if tweet_count % 10000 == 0:
print "Processed ", tweet_count, " tweets"
file_out = open(OUTPUT_FOLDER + file, 'w')
for key in data_dict.keys():
val = data_dict[key]
line = '\t'.join(map(str, [key, val['uid'], val['lat'], val['lon'], val['timestamp'], val['text']])) + '\n'
file_out.write(line)
file_out.close()
return len(lines)
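# Illustrative GMove input line (field names taken from the accesses above;
# the values are made up):
#   {"tweetId": 1, "userId": 42, "timestamp": 1472860800000,
#    "message": "flooding downtown", "lat": 34.05, "lng": -118.24}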
# test
parse_tweet_json_gmove("GMove_LA_tweets_processed.txt")
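# A minimal sketch (not part of the original pipeline) for reading one of the
# TSV files written above; the column order matches the '\t'.join(...) calls
# in the parsers.
def load_selected_tweets(path):
    """Read a selected-tweets TSV back into a list of dicts."""
    tweets = []
    with open(path) as f:
        for line in f:
            tweet_id, uid, lat, lon, timestamp, text = line.rstrip('\n').split('\t', 5)
            tweets.append({'id': tweet_id, 'uid': uid, 'lat': float(lat),
                           'lon': float(lon), 'timestamp': timestamp, 'text': text})
    return tweets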
# obtain a list of dates for disaster response
# dates = np.loadtxt(DISASTER_LIST, dtype = 'str', delimiter='\",\"', usecols = ([1]), skiprows=1)
# disasters_date = [date.split('/')[2] + date.split('/')[0] + date.split('/')[1] for date in dates]
# print('Extracting data from the following dates:', dates)
"""
extract data according to specified date
"""
if False:
total_tweets = 0
for file in glob.glob(INPUT_FOLDER + "/*"):
filename = re.findall('[^\\\\/]+', file)[-1]
date = filename.split('.')[0]
# if date in disasters_date:
tweet_count = parse_tweet_json(filename)
print 'Date ' + date + ' has ' + str(tweet_count ) + ' tweets'
total_tweets += tweet_count
print '..... total ' + str(total_tweets) + ' tweets have been extracted...'
print 'Done!'
# min_lat, max_lat, min_lon, max_lon = 33, 39, -101, -93
#
# data = np.loadtxt(OUTPUT_FOLDER + './20160903.txt', dtype= float, delimiter='\t', usecols = (2,3))
# valid_rows = np.all([min_lat <= data[:,0], data[:,0] <= max_lat, min_lon <= data[:,1], data[:,1]<= max_lon], axis=0)
#
# data = np.loadtxt(OUTPUT_FOLDER + './20160903.txt', dtype= 'str', delimiter='\t')
# data = data[valid_rows]
#
# print(type(data), data.shape)
# np.savetxt(OUTPUT_FOLDER + './20160903_filtered.txt', data, fmt='%s', delimiter='\t')
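# A runnable version of the commented-out bounding-box filter above, kept as
# a sketch: the input/output paths and bounds are whatever the caller
# supplies. Columns 2 and 3 of the TSVs written above are lat and lon.
def filter_by_bounding_box(in_path, out_path, min_lat, max_lat, min_lon, max_lon):
    coords = np.loadtxt(in_path, dtype=float, delimiter='\t', usecols=(2, 3))
    valid = np.all([min_lat <= coords[:, 0], coords[:, 0] <= max_lat,
                    min_lon <= coords[:, 1], coords[:, 1] <= max_lon], axis=0)
    rows = np.loadtxt(in_path, dtype=str, delimiter='\t')
    np.savetxt(out_path, rows[valid], fmt='%s', delimiter='\t')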