Skip to content

Commit

Permalink
Merged from remote master
Browse files Browse the repository at this point in the history
  • Loading branch information
bceskavich committed Nov 10, 2015
2 parents 26cab2f + 9dbcf5d commit 6000e83
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 24 deletions.
6 changes: 4 additions & 2 deletions db.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
Column('in_reply_to_screen_name', String(255)),
Column('text', String(255), nullable=False),
Column('retweet_count', Integer, nullable=False),
Column('created_at', DateTime)
Column('created_at', DateTime),
Column('reply_count', Integer)
)

urls = Table('urls', metadata,
Expand Down Expand Up @@ -98,7 +99,8 @@ def update_tweet_info(tweet, conn):
update = tweets.update().where(tweets.c.id_str == tweet['id_str']).\
values(
retweet_count=tweet['retweet_count'],
created_at=tweet['created_at']
created_at=tweet['created_at'],
reply_count=tweet['reply_count']
)

conn.execute(update)
Expand Down
93 changes: 71 additions & 22 deletions timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,13 @@ def parse_tweet_entities(entities):

return entities_dict

def parse_tweet(status):
def parse_tweet(status, replies):
status = status._json

reply_count = 0
if status['id_str'] in replies:
reply_count = replies[status['id_str']]

parsed_status_dict = {
'tweet': {
'text': status['text'],
Expand All @@ -32,6 +36,7 @@ def parse_tweet(status):
'in_reply_to_user_id_str': status['in_reply_to_user_id_str'],
'in_reply_to_screen_name': status['in_reply_to_screen_name'],
'in_reply_to_status_id_str': status['in_reply_to_status_id_str'],
'reply_count': reply_count,
'created_at': status['created_at'],
'entities': parse_tweet_entities(status['entities'])
},
Expand Down Expand Up @@ -63,31 +68,16 @@ def run_insert(filename):

os.remove(filename)

def timeline(auth, handle):
api_auth = tweepy.OAuthHandler(auth['consumer_key'], auth['consumer_secret'])
api_auth.set_access_token(auth['access_token'], auth['access_token_secret'])
api = tweepy.API(api_auth)

print 'COLLECTING FOR: {}'.format(handle)
print

if api.verify_credentials:
print 'Successfully authenticated with Twitter.'
print 'Collecting...'
else:
print 'Failed to authenticate with Twitter. Please try again.'
sys.exit(1)

filename = '{}-timeline.json'.format(handle)
def timeline(filename, handle, replies, api):
status_count = 0
with open(filename, 'a') as outfile:
timeline = tweepy.Cursor(api.user_timeline, screen_name=handle, count=200).items()
collecting = True
status_count = 1

while collecting:
try:
status = next(timeline)
parsed_status = parse_tweet(status)
parsed_status = parse_tweet(status, replies)
outfile.write(json.dumps(parsed_status).encode('utf-8'))
outfile.write('\n')
status_count += 1
Expand All @@ -99,13 +89,72 @@ def timeline(auth, handle):
except StopIteration as e:
collecting = False

return status_count

def replies(handle, api):
reply_count = 0
reply_counts_dict = {}

search_results = tweepy.Cursor(
api.search,
q='to:{}'.format(handle),
result_type='recent',
count=100
).items()

collecting = True
while collecting:
try:
status = next(search_results)
parsed_status = status._json
reply_id = parsed_status['in_reply_to_status_id_str']

if reply_id is not None:
if reply_id in reply_counts_dict:
reply_counts_dict[reply_id] += 1
else:
reply_counts_dict[reply_id] = 1
reply_count += 1

except TweepError as e:
print 'Received timeout. Sleeping for 15 minutes.'
time.sleep(15 * 60)

except StopIteration as e:
collecting = False

return reply_count, reply_counts_dict

def collect(auth, handle):
api_auth = tweepy.OAuthHandler(auth['consumer_key'], auth['consumer_secret'])
api_auth.set_access_token(auth['access_token'], auth['access_token_secret'])
api = tweepy.API(api_auth)

print 'COLLECTING FOR: {}'.format(handle)
print

if api.verify_credentials:
print 'Successfully authenticated with Twitter.'
else:
print 'Failed to authenticate with Twitter. Please try again.'
sys.exit(1)

filename = '{}-timeline.json'.format(handle)

print 'Collecting replies...'
reply_count, reply_counts_dict = replies(handle, api)
print
print 'TOTAL Replies Collected: {}'.format(reply_count)
print

print 'Collecting {}\'s timeline'.format(handle)
status_count = timeline(filename, handle, reply_counts_dict, api)
print
print 'TOTAL Tweets Collected: {}'.format(status_count)
print
print 'Now inserting...'

print 'Now inserting...'
run_insert(filename)

print 'Insertion completed'
print

Expand Down Expand Up @@ -140,7 +189,7 @@ def run_timeline(auth):

while True:
for handle in CANDIDATES_LIST:
timeline(auth, handle)
collect(auth, handle)

print 'All candidates collected for. Resuming in 20 minutes.'
time.sleep(20 * 60)

0 comments on commit 6000e83

Please sign in to comment.