Skip to content

Commit

Permalink
First version of Twitter Timeline Scraper with a Connection to MongoDB
Browse files Browse the repository at this point in the history
  • Loading branch information
bceskavich committed Mar 9, 2018
1 parent 6000e83 commit f151131
Show file tree
Hide file tree
Showing 12 changed files with 44,929 additions and 162 deletions.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
13 changes: 13 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python

# Placeholder settings for the scraper. Every value is the literal 'xxx' and
# must be replaced with real values before the scraper can authenticate.

# Credentials used to authenticate against MongoDB.
MONGO_ACCOUNT = dict(
    username='xxx',
    password='xxx',
)

# Name of the MongoDB database the scraper writes to.
DB_NAME = 'xxx'

# Twitter API credential set (consumer key pair plus access token pair).
AUTH = dict(
    consumer_key='xxx',
    consumer_secret='xxx',
    access_token='xxx',
    access_token_secret='xxx',
)
161 changes: 0 additions & 161 deletions db.py

This file was deleted.

137 changes: 137 additions & 0 deletions db_dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# -*- coding: UTF-8 -*-
import sys
import json
import sys
import pymongo
import csv
import logging
import config as cfg
from pytz import timezone
from datetime import datetime
from collections import Counter

# Module-level setup, executed once at import time.

# Open a client with pymongo's default connection settings and authenticate
# against the ``admin`` database using the credentials from config.py.
mongoClient = pymongo.MongoClient()
mongoClient.admin.authenticate(
    cfg.MONGO_ACCOUNT['username'],
    cfg.MONGO_ACCOUNT['password'],
)

# Handle to the database that every insert_* function in this module uses.
mongoDB = mongoClient[cfg.DB_NAME]

# Send all log records (DEBUG and above) to a file under ./logs.
# NOTE(review): the ./logs directory presumably has to exist already - confirm.
logging.basicConfig(
    level=logging.DEBUG,
    filename='./logs/twitter-scraper.log',
    format='%(asctime)s %(message)s',
)

def insert_tweet_data(tweet):
    """Insert a tweet into ``TW_cand`` or refresh the counters of a stored one.

    The tweet is looked up by ``tweet['id']``. If no document exists the whole
    (normalised) tweet is inserted; otherwise only ``updated_at``, the
    retweet/reply/favorite counts and the user's follower/listed/friend counts
    are overwritten.

    Args:
        tweet: dict describing one tweet. Must contain ``id``, ``created_at``
            (a string matching ``'%a %b %d %H:%M:%S +%f %Y'``) and ``text``;
            the update branch additionally reads the count fields listed
            above. The dict is modified in place: ``created_at`` becomes a
            UTC-aware datetime, ``updated_at`` is added, and ``text`` is
            replaced by its UTF-8 encoding.

    Returns:
        None. Any exception is caught and written to the log at DEBUG level
        (best-effort insert); nothing is re-raised.
    """
    try:
        doc_id = tweet['id']
        utc = timezone('UTC')

        # ``+%f`` consumes the numeric offset after the '+' as a
        # "microseconds" field (strptime's %f accepts 1-6 digits), which keeps
        # the format usable where ``%z`` is unavailable in strptime.
        # NOTE(review): this is only correct when the offset is +0000 - confirm
        # the API never returns anything else.
        tweet['created_at'] = utc.localize(
            datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +%f %Y'))
        tweet['updated_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M")
        tweet['text'] = tweet['text'].encode('utf-8')

        existing = mongoDB.TW_cand.find_one({'id': doc_id})
        if existing is None:
            mongoDB.TW_cand.insert(tweet)
        else:
            refreshed = {
                'updated_at': tweet['updated_at'],
                'retweet_count': tweet['retweet_count'],
                'reply_count': tweet['reply_count'],
                'favorite_count': tweet['favorite_count'],
                'user.followers_count': tweet['user']['followers_count'],
                'user.listed_count': tweet['user']['listed_count'],
                'user.friends_count': tweet['user']['friends_count'],
            }
            mongoDB.TW_cand.update({'id': doc_id},
                                   {'$set': refreshed},
                                   upsert=True, multi=False)

    except Exception as e:
        template = "In insert_tweet_data(). 1 An exception of type {0} occurred. Arguments:\n{1!r}"
        # Report the exception *type*, as the template promises. The previous
        # ``e.message`` was not the type and does not exist on Python 3, where
        # it made the handler itself raise AttributeError.
        message = template.format(type(e).__name__, e.args)
        logging.debug(message)

def insert_reply_data(tweet):
    """Insert a reply tweet into ``TW_reply`` or refresh a stored one.

    The reply is looked up by ``tweet['id']``. If no document exists the whole
    (normalised) tweet is inserted; otherwise only ``updated_at``, the
    retweet/favorite counts and the user's follower/listed/friend counts are
    overwritten.

    Args:
        tweet: dict describing one reply. Must contain ``id``, ``created_at``
            (a string matching ``'%a %b %d %H:%M:%S +%f %Y'``) and ``text``;
            the update branch additionally reads the count fields listed
            above. The dict is modified in place: ``created_at`` becomes a
            UTC-aware datetime, ``updated_at`` is added, and ``text`` is
            replaced by its UTF-8 encoding.

    Returns:
        None. Any exception is caught and written to the log at DEBUG level
        (best-effort insert); nothing is re-raised.
    """
    try:
        doc_id = tweet['id']
        utc = timezone('UTC')

        # ``+%f`` consumes the numeric offset after the '+' as a
        # "microseconds" field (strptime's %f accepts 1-6 digits).
        # NOTE(review): only correct when the offset is +0000 - confirm.
        tweet['created_at'] = utc.localize(
            datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +%f %Y'))
        tweet['updated_at'] = datetime.utcnow().strftime("%Y-%m-%d %H:%M")
        tweet['text'] = tweet['text'].encode('utf-8')

        existing = mongoDB.TW_reply.find_one({'id': doc_id})
        if existing is None:
            mongoDB.TW_reply.insert(tweet)
        else:
            refreshed = {
                'updated_at': tweet['updated_at'],
                'retweet_count': tweet['retweet_count'],
                'favorite_count': tweet['favorite_count'],
                'user.followers_count': tweet['user']['followers_count'],
                'user.listed_count': tweet['user']['listed_count'],
                'user.friends_count': tweet['user']['friends_count'],
            }
            mongoDB.TW_reply.update({'id': doc_id},
                                    {'$set': refreshed},
                                    upsert=True, multi=False)

    except Exception as e:
        template = "In insert_reply_data(). 2 An exception of type {0} occurred. Arguments:\n{1!r}"
        # Report the exception *type*, as the template promises. The previous
        # ``e.message`` was not the type and does not exist on Python 3, where
        # it made the handler itself raise AttributeError.
        message = template.format(type(e).__name__, e.args)
        logging.debug(message)

def insert_candidate_data(tweet):
    """Insert or refresh the author of ``tweet`` in ``TW_cand_info``.

    A compact record is built from ``tweet['user']`` and looked up by the
    user's ``id``. If no document exists the record is inserted; otherwise
    only ``updated_at`` and the follower/listed/friend counts are overwritten.

    Args:
        tweet: dict with a ``user`` sub-dict providing ``id``,
            ``screen_name``, ``name``, ``followers_count``, ``listed_count``,
            ``friends_count`` and ``created_at`` (a string matching
            ``'%a %b %d %H:%M:%S +%f %Y'``). ``tweet`` itself is not modified.

    Returns:
        None. Any exception is caught and written to the log at DEBUG level
        (best-effort insert); nothing is re-raised.
    """
    try:
        user = tweet['user']
        cand_id = user['id']
        utc = timezone('UTC')

        cand = {
            'id': cand_id,
            'screen_name': user['screen_name'],
            'name': user['name'],
            'followers_count': user['followers_count'],
            'listed_count': user['listed_count'],
            'friends_count': user['friends_count'],
            # ``+%f`` consumes the numeric offset after the '+' as a
            # "microseconds" field (strptime's %f accepts 1-6 digits).
            # NOTE(review): only correct when the offset is +0000 - confirm.
            'created_at': utc.localize(
                datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +%f %Y')),
            'updated_at': datetime.utcnow().strftime("%Y-%m-%d %H:%M"),
        }

        existing = mongoDB.TW_cand_info.find_one({'id': cand_id})
        if existing is None:
            mongoDB.TW_cand_info.insert(cand)
        else:
            refreshed = {
                'updated_at': cand['updated_at'],
                'followers_count': cand['followers_count'],
                'listed_count': cand['listed_count'],
                'friends_count': cand['friends_count'],
            }
            mongoDB.TW_cand_info.update({'id': cand_id},
                                        {'$set': refreshed},
                                        upsert=True, multi=False)

    except Exception as e:
        template = "In insert_candidate_data(). 3 An exception of type {0} occurred. Arguments:\n{1!r}"
        # Report the exception *type*, as the template promises. The previous
        # ``e.message`` was not the type and does not exist on Python 3, where
        # it made the handler itself raise AttributeError.
        message = template.format(type(e).__name__, e.args)
        logging.debug(message)

def insert_tweet_log(tweet):
    """Append a snapshot of a tweet's counters to ``TW_cand_crawl_history``.

    Unlike the other insert_* functions this one never updates: every call
    inserts a new document holding the tweet id, its retweet/reply/favorite
    counts, the user's follower/friend/listed counts and the time of the
    snapshot (``log_created_at``, UTC, formatted ``%Y-%m-%d %H:%M``).

    Args:
        tweet: dict providing ``id``, ``retweet_count``, ``reply_count``,
            ``favorite_count`` and a ``user`` sub-dict with
            ``followers_count``, ``friends_count`` and ``listed_count``.
            ``tweet`` itself is not modified.

    Returns:
        None. Any exception is caught and written to the log at DEBUG level
        (best-effort insert); nothing is re-raised.
    """
    try:
        user = tweet['user'] if 'id' in tweet else None
        log_data = {
            'tweet_id': tweet['id'],
            'retweet_count': tweet['retweet_count'],
            'reply_count': tweet['reply_count'],
            'favorite_count': tweet['favorite_count'],
            'user': {
                'followers_count': tweet['user']['followers_count'],
                'friends_count': tweet['user']['friends_count'],
                'listed_count': tweet['user']['listed_count'],
            },
            'log_created_at': datetime.utcnow().strftime("%Y-%m-%d %H:%M"),
        }

        mongoDB.TW_cand_crawl_history.insert(log_data)

    except Exception as e:
        template = "In insert_tweet_log(). 4 An exception of type {0} occurred. Arguments:\n{1!r}"
        # Report the exception *type*, as the template promises. The previous
        # ``e.message`` was not the type and does not exist on Python 3, where
        # it made the handler itself raise AttributeError.
        message = template.format(type(e).__name__, e.args)
        logging.debug(message)





Loading

0 comments on commit f151131

Please sign in to comment.