'''
Gets text content for tweet IDs
http://stackoverflow.com/questions/28384588/twitter-api-get-tweets-with-specifc-id
'''
# standard
from __future__ import print_function
import getopt
import logging
import os
import sys
import glob
import re
import time
# import traceback
# third-party: `pip install tweepy`
import tweepy
from array import *
# global logger level is configured in main()
Logger = None
#TWEET_PATH = "./data/state_id_2014-08-24/trial.txt"
# For Michigan Flood Folder
#TWEET_PATH = "./data/michigian_flood/2014-08-16"
# For New York Flood
#TWEET_PATH = "./data/ny_flood"
# For Washington tweets
TWEET_PATH = "./data/twitter_sandy/data/"
#county_array = [6075, 6055, 6041, 6033, 6097, 6113, 6095, 6011, 6013, 6001, 6081]
# Generate your own at https://apps.twitter.com/app
#CONSUMER_KEY = 'VIlDdi6LKAjGJxUhsxZNc1P1t'
#CONSUMER_SECRET = 'oh6dwllSvDszpxp100hZWs9jLrNnRzFw8ixgtIlzOlrjOGjfG5'
#OAUTH_TOKEN = '573560865-y1IBXNelm6YbmMeS4E6vSbbOgng7cSlNIGfLp1sW'
#OAUTH_TOKEN_SECRET = 'Snw836BTOUDVy56Tg6o3IVtjkZwyUUtfosIhzNEwnHRSx'
#CONSUMER_KEY = '7UvZlPPrZXUl4QVVMc6K0aTV6'
#CONSUMER_SECRET = 'eSrnLcdRhbrs4zHdGKQDZa2lJOC7fildOIKcckktpTaYSp7T06'
#OAUTH_TOKEN = '1633608188-VsB2v5qqOUb8amj4InBTryK6mwmYNifkfiSNjsx'
#OAUTH_TOKEN_SECRET = 'WFQzD23giH073sEhSkN4e2WsdWsT2E9QLydak4nkIqa5d'
#CONSUMER_KEY = '0xlAczBK98VvBvhqf5kGRQ'
#CONSUMER_SECRET = 'Mcxk4lhY8WYihITXg2IQxKzkjwRexUkwFhzRtgkikU'
#OAUTH_TOKEN = '43861519-L2xfj6RsaXw1kt6fr4AA2a711ghSx2NXisETqPar5'
#OAUTH_TOKEN_SECRET = 'H2EmiQbP6MFOzJrv5N2sXflApnIIIQJJ4difHk0sYoI'
CONSUMER_KEY = 'jQck5XQ2zC1RTM05724QpeWmA'
CONSUMER_SECRET = 'CZs95zlrnhAemvsFj760tK3ZD2at7YrRZt7kgU91RzeDdDObmq'
OAUTH_TOKEN = '3028433558-noFZLHJl1KU0lxEm0hjeQKhpFvFldO5sDlTsSv7'
OAUTH_TOKEN_SECRET = '4O0y26ViLqzYWlYlqLece4Rz4EaTeTFi9SE6Zgqk38xJZ'
#CONSUMER_KEY = 'k8uVBodZUOTOJeUH1zIRw'
#CONSUMER_SECRET = 'Lkgdz9MRWdksjx19eeNwojBdPPWGb3ov06fcwDIw'
#OAUTH_TOKEN = '465794227-Cn0pPQE5HYGZvWfZdqevNl3rb8OGrhhwPjLxresu'
#OAUTH_TOKEN_SECRET = 'Wtd2TrGSCiH3cW6MpSJwvEfNVVcQHMFk9uFGiHGaEG4'
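# Hedged aside: since credentials are hard-coded above, they could instead be
# pulled from the environment at run time; a minimal sketch (the variable names
# below are assumptions, not part of the original setup):
#   CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', CONSUMER_KEY)
#   CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', CONSUMER_SECRET)
#   OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', OAUTH_TOKEN)
#   OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', OAUTH_TOKEN_SECRET)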
def get_tweet_id(line):
    '''
    Extracts and returns tweet ID from a line in the input.
    '''
    # (tagid, _timestamp, _sandyflag) = line.split('\t')
    # (_tag, _search, tweet_id) = tagid.split(':')
    # return tweet_id
    return line.strip()

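# Usage sketch (illustrative only, not part of the original pipeline): with the
# plain-ID input assumed above, get_tweet_id("123456789\n") returns "123456789".
# The commented-out split logic corresponds to the tab-separated Sandy records
# that get_tweets_bulk parses inline further down, where the ID is the trailing
# ':'-field of the first column (roughly "tag:search:<tweet_id>\t<timestamp>\t<True|False>").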
def get_tweets_single(twapi, idfilepath):
    '''
    Fetches content for tweet IDs in a file one at a time,
    which means a ton of HTTPS requests, so NOT recommended.
    `twapi`: Initialized, authorized API object from Tweepy
    `idfilepath`: Path to file containing IDs
    '''
    # process IDs from the file
    with open(idfilepath, 'rb') as idfile:
        for line in idfile:
            tweet_id = get_tweet_id(line)
            Logger.debug('Fetching tweet for ID %s', tweet_id)
            try:
                tweet = twapi.get_status(tweet_id)
                print('%s,%s' % (tweet_id, tweet.text.encode('UTF-8')))
            except tweepy.TweepError as te:
                Logger.warn('Failed to get tweet ID %s: %s', tweet_id, te.message)
                # traceback.print_exc(file=sys.stderr)
        # for
    # with

def get_tweet_list(twapi, idlist, file_out):
    '''
    Invokes the bulk lookup method.
    Sleeps for 15 minutes and retries if the rate limit is exceeded.
    '''
    # fetch as little metadata as possible
    while True:
        try:
            tweets = twapi.statuses_lookup(id_=idlist, include_entities=False, trim_user=True)
            for tweet in tweets:
                # output fields: "tweet content","date time","user id",lat,lon
                if tweet.geo is not None:
                    content = re.sub(r"\s\s+", " ", tweet.text.encode('UTF-8'))
                    content = content.replace("\n", " ")
                    line = ','.join(map(str, [content, tweet.created_at, tweet.user.id,
                                              tweet.geo['coordinates'][0], tweet.geo['coordinates'][1]])) + '\n'
                    file_out.write(line)
        except tweepy.TweepError as e:
            time.sleep(60 * 15)
            print("TweepError raised, ignoring and continuing.")
            print(e)
            continue
        except StopIteration:
            break
        break

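# Note on rate limiting (a sketch, assuming a Tweepy 3.x install): instead of the
# manual sleep-and-retry above, the client itself can block until the window resets:
#   api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# (mirroring the construction in parse_one_file below). Either way, each idlist
# passed to statuses_lookup must hold at most 100 IDs, the statuses/lookup cap.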
def get_tweets_bulk(twapi, idfilepath):
    '''
    Fetches content for tweet IDs in a file using bulk request method,
    which vastly reduces number of HTTPS requests compared to above;
    however, it does not warn about IDs that yield no tweet.
    `twapi`: Initialized, authorized API object from Tweepy
    `idfilepath`: Path to file containing IDs
    '''
    # process IDs from the file
    tweet_ids = list()
    #filepath = "./data/state_id_2014-08-25/sent1.txt"
    filepath = "./data/michigian_flood/"
    '''
    parts = idfilepath.split('\\')
    county = parts[1].split('_')
    if int(county[1][1:]) in county_array:
        county_array.remove(int(county[1][1:]))
        #filepath = parts[0][:-3] + '.txt'
        filepath = "./data/state_id_2014-08-25/output/" + parts[1]
        file_out = open(filepath, 'w')
    '''
    '''
    folder_list = []
    print([x[1] for x in os.walk(TWEET_PATH)])
    print(folder_list)
    '''
    os.chdir(TWEET_PATH)
    for ijk in xrange(9, 10):
        if ijk < 10:
            s = "0" + str(ijk)
        else:
            s = str(ijk)
        os.chdir("C:/Sumeet/IMSC/tweet/tweet_mining/data/iowa_stf_2/")
        os.makedirs("out_2014-07-" + str(s))
        os.chdir("C:/Sumeet/IMSC/tweet/tweet_mining/data/iowa_stf_2/2014-07-" + str(s))
        print(os.getcwd())
        for file in glob.glob("*.txt"):
            idfilepath = str(file)
            if int(idfilepath[11:13]) == 19:
                file_out = open("C:/Sumeet/IMSC/tweet/tweet_mining/data/iowa_stf_2/out_2014-07-" + str(s) + "/" + idfilepath, 'w')
                with open(idfilepath, 'rb') as idfile:
                    for line in idfile:
                        tweet_id = get_tweet_id(line)
                        Logger.debug('Fetching tweet for ID %s', tweet_id)
                        # API limits batch size to 100
                        if len(tweet_ids) < 100:
                            tweet_ids.append(tweet_id)
                        else:
                            get_tweet_list(twapi, tweet_ids, file_out)
                            tweet_ids = list()
                # process rump of file
                if len(tweet_ids) > 0:
                    get_tweet_list(twapi, tweet_ids, file_out)
                file_out.close()
    '''
    os.chdir(TWEET_PATH)
    print(os.getcwd())
    for file in glob.glob("*.txt"):
        idfilepath = str(file)
        print(idfilepath)
        file_out = open("C:/Sumeet/IMSC/tweet/tweet_mining/data/twitter_sandy/tweet.txt", 'w')
        with open(idfilepath, 'rb') as idfile:
            for line in idfile:
                #print(line)
                a = [x.strip() for x in line.split('\t')]
                ax = [x.strip() for x in a[0].split(':')]
                #print(a[-1])
                if a[-1] == "False":
                    #print("sd")
                    continue
                #print(ax[-1])
                #break
                tweet_id = get_tweet_id(ax[-1])
                Logger.debug('Fetching tweet for ID %s', tweet_id)
                # API limits batch size to 100
                if len(tweet_ids) < 100:
                    tweet_ids.append(tweet_id)
                else:
                    get_tweet_list(twapi, tweet_ids, file_out)
                    tweet_ids = list()
        # process rump of file
        if len(tweet_ids) > 0:
            get_tweet_list(twapi, tweet_ids, file_out)
        file_out.close()
    '''

def usage():
    print('Usage: get_tweets_by_id.py [options] folder/file')
    print('    -s (single) makes one HTTPS request per tweet ID')
    print('    -v (verbose) enables detailed logging')
    sys.exit()

def main(args):
    logging.basicConfig(level=logging.WARN)
    global Logger
    Logger = logging.getLogger('get_tweets_by_id')
    bulk = True
    try:
        opts, args = getopt.getopt(args, 'sv')
    except getopt.GetoptError:
        usage()
    for opt, _optarg in opts:
        if opt == '-s':
            bulk = False
        elif opt == '-v':
            Logger.setLevel(logging.DEBUG)
            Logger.debug("verbose mode on")
        else:
            usage()
    if len(args) != 1:
        usage()
    idfile = args[0]
    if not os.path.isfile(idfile):
        for file in glob.glob(TWEET_PATH):
            #print("parsing " + str(file))
            parse_one_file(file, bulk)
    else:  # this is a file
        parse_one_file(idfile, bulk)

def parse_one_file(idfile, bulk):
    # connect to twitter
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
    api = tweepy.API(auth)
    # hydrate tweet IDs
    if bulk:
        get_tweets_bulk(api, idfile)
    else:
        get_tweets_single(api, idfile)

if __name__ == '__main__':
    #print(sys.argv[1:])
    main(sys.argv[1:])
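# Example invocations (paths are placeholders, assuming a Python 2 / Tweepy 3.x setup):
#   python get_tweets_by_id.py ids.txt       # bulk lookup, up to 100 IDs per request
#   python get_tweets_by_id.py -s ids.txt    # one HTTPS request per tweet ID
#   python get_tweets_by_id.py -v ids.txt    # verbose (DEBUG) logging
# If the argument is not a regular file, main() falls back to globbing TWEET_PATH.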