forked from kiranvodrahalli/cos521
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_trend_predictor.py
60 lines (45 loc) · 2.35 KB
/
naive_trend_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from collections import defaultdict
import itertools
from dateutil import parser
# Default constant added to the previous-window count in the denominator of
# the novelty score computed by NaiveTrendPredictor.get_most_novel.
TRENDING_THRESHOLD = 25


class NaiveTrendPredictor:
    """Rank hashtags by frequency, or by growth between two adjacent windows.

    Two indexes are built from the input file and kept in step:

    * ``tweets``: lower-cased hashtag -> list of tweet datetimes
    * ``inverted_tweets_idx``: tweet datetime -> list of lower-cased hashtags

    Both lists keep one entry per input line, so duplicates are preserved.
    """

    def __init__(self, data_file):
        """Load ``data_file`` and build both indexes.

        Each line must split on ``','`` into exactly three fields
        (id, timestamp, hashtag). The timestamp is parsed with
        ``dateutil.parser.parse``. Any line that fails to split or parse is
        reported on stdout and skipped, so one bad row never aborts the load.
        """
        self.tweets = defaultdict(list)
        self.inverted_tweets_idx = defaultdict(list)
        with open(data_file) as f:
            for line in f:
                try:
                    _, timestamp, hashtag = line.strip().split(',')
                    tweet_dt = parser.parse(timestamp)
                except Exception as e:
                    # Deliberately broad: loading is best-effort per line.
                    print(e)
                    continue
                hashtag = hashtag.lower()
                self.tweets[hashtag].append(tweet_dt)
                self.inverted_tweets_idx[tweet_dt].append(hashtag)

    def get_hashtags(self, start_dt, end_dt):
        """Return a lazy iterator over hashtags tweeted in the window.

        The window is inclusive at both ends. A hashtag is yielded once per
        matching tweet, so callers that need distinct values should wrap the
        result in ``set()``.
        """
        in_window = (tags
                     for tweet_dt, tags in self.inverted_tweets_idx.items()
                     if start_dt <= tweet_dt <= end_dt)
        return itertools.chain.from_iterable(in_window)

    def get_hashtag_freq(self, hashtag, start_dt, end_dt):
        """Count tweets of ``hashtag`` inside ``[start_dt, end_dt]``.

        The lookup is case-insensitive because hashtags are lower-cased when
        they are indexed. Unknown hashtags return 0 and are NOT inserted into
        ``self.tweets`` (plain indexing on the defaultdict would add them).
        """
        timestamps = self.tweets.get(hashtag.lower(), ())
        return sum(1 for tweet_dt in timestamps
                   if start_dt <= tweet_dt <= end_dt)

    def get_most_popular(self, n, start_dt, end_dt):
        """Return up to ``n`` ``(count, hashtag)`` tuples, highest first.

        Only hashtags seen inside ``[start_dt, end_dt]`` are considered.
        Equal counts are ordered by hashtag, descending, through ordinary
        tuple comparison. ``n`` is passed to ``itertools.islice``, so
        ``None`` returns every entry.
        """
        hashtags = set(self.get_hashtags(start_dt, end_dt))
        ranked = sorted(
            ((self.get_hashtag_freq(hashtag, start_dt, end_dt), hashtag)
             for hashtag in hashtags),
            reverse=True)
        return list(itertools.islice(ranked, n))

    def get_most_novel(self, n, start_dt, end_dt, threshold=None):
        """Return up to ``n`` ``(score, hashtag)`` tuples, highest first.

        The score compares the current window with the window of equal
        length that immediately precedes it::

            score = (current_count + 1.0) / (previous_count + threshold)

        ``threshold`` defaults to the module-level ``TRENDING_THRESHOLD``,
        read at call time. Only hashtags present in the current window are
        scored. A ``threshold`` of 0 raises ``ZeroDivisionError`` for any
        hashtag that has no tweets in the previous window.

        NOTE(review): both windows are inclusive, so a tweet timestamped
        exactly ``start_dt`` is counted in the previous AND the current
        window. This is kept as-is to preserve existing scores.
        """
        if threshold is None:
            threshold = TRENDING_THRESHOLD
        prev_start_dt = start_dt - (end_dt - start_dt)
        cur_hashtags = set(self.get_hashtags(start_dt, end_dt))
        scored = []
        for hashtag in cur_hashtags:
            cur_count = self.get_hashtag_freq(hashtag, start_dt, end_dt)
            # Previous counts are needed only for hashtags being scored.
            prev_count = self.get_hashtag_freq(hashtag, prev_start_dt, start_dt)
            scored.append(((cur_count + 1.0) / (prev_count + threshold),
                           hashtag))
        scored.sort(reverse=True)
        return list(itertools.islice(scored, n))