-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgather.py
72 lines (60 loc) · 3.07 KB
/
gather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
# encoding: utf-8
import argparse
import json
from datetime import datetime
from dateutil import parser
import re
import os
import random
def parse_arguments():
    """Build the command-line interface and return the parsed options.

    The only option is ``-max``, an integer stored on the result as ``max``
    (``None`` when the flag is not given).
    """
    # Named ``cli`` so the module-level ``parser`` import is not shadowed.
    cli = argparse.ArgumentParser(description='Normalize a json.')
    cli.add_argument(
        '-max',
        dest='max',
        action='store',
        type=int,
        help='approx. maximum to gather',
    )
    options = cli.parse_args()
    return options
# Directory scanned for per-account "<account>_tweets.json" files.
_DATASETS_DIR = 'data/datasets'
# Directory that receives the merged output file.
_GATHERED_DIR = 'data/gathered'
# File-name suffix that marks a per-account tweet file.
_TWEETS_SUFFIX = '_tweets.json'
# A hashtag is kept only when it consists solely of these characters.
# Compiled once here instead of being re-specified for every hashtag.
_HASHTAG_PATTERN = re.compile("^[a-zA-Z0-9àèìòùÀÈÌÒÙáéíóúýÁÉÍÓÚÝâêîôûÂÊÎÔÛãñõÃÑÕäëïöüÿÄËÏÖÜŸçÇßÆæœ_-]*$")


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, rounded to 2 digits.

    Returns 0.0 when ``whole`` is 0 so an empty run can still be summarised.
    """
    if not whole:
        return 0.0
    return round(part * 100 / whole, 2)


def _build_record(account_name, tweet, hashtags):
    """Flatten one raw tweet dict into the record stored in the output file."""
    moment = parser.parse(tweet['date'])
    followers = tweet['followers_count']
    return {
        'user': account_name,
        'weekday': int(moment.strftime('%w')),
        'hour': moment.strftime('%H:%M'),
        'hashtag': hashtags,
        'score': tweet['rt'] * 2 + tweet['fav'],
        # Callers only pass tweets whose followers_count is non-zero.
        'score2': round(tweet['rt'] * 100 / followers, 4),
        'text': tweet['text'],
        'followers_count': followers,
        'friends_count': tweet['friends_count'],
        'listed_count': tweet['listed_count'],
        'statuses_count': tweet['statuses_count'],
    }


def main(run_args):
    """Merge the per-account tweet files into one gathered JSON file.

    Every ``*_tweets.json`` file found in ``data/datasets`` is read, in random
    order. Tweets whose ``followers_count`` is 0 are skipped. For the others,
    the hashtags matching ``_HASHTAG_PATTERN`` are kept and one record per
    tweet is appended to the result. The result is written to
    ``data/gathered/gathering_<tweets>_<hashtags>.json`` and a summary is
    printed.

    Args:
        run_args: object with a ``max`` attribute. When ``max`` is truthy,
            no further file is read once the number of kept hashtags exceeds
            it; the check happens after each file, so the limit is only
            approximate. ``None`` or 0 means no limit.
    """
    total_hashtags = 0
    without_hashtags = 0
    tweets = []

    elements = os.listdir(_DATASETS_DIR)
    random.shuffle(elements)
    for element in elements:
        if not element.endswith(_TWEETS_SUFFIX):
            continue

        # The context manager closes the file as soon as it has been decoded.
        with open(os.path.join(_DATASETS_DIR, element)) as infile:
            user_tweets = json.load(infile)
        # Strip only the trailing suffix, not occurrences inside the name.
        account_name = element[:-len(_TWEETS_SUFFIX)]

        temp_without_hashtags = 0
        temp_total_hashtags = 0
        print("### Get hashtags from : "+account_name+" ###")
        for tweet in user_tweets:
            # Checked before the date is parsed, so no work is spent on
            # tweets that are dropped.
            if tweet['followers_count'] == 0:
                continue

            raw_hashtags = tweet['hashtags']
            if raw_hashtags:
                hashtags = [h for h in raw_hashtags if _HASHTAG_PATTERN.match(h)]
                temp_total_hashtags += len(hashtags)
            else:
                # NOTE(review): only tweets with an empty/missing hashtag list
                # are counted here; a tweet whose hashtags are all rejected by
                # the pattern is not. Confirm this is the intended statistic.
                hashtags = []
                temp_without_hashtags += 1

            tweets.append(_build_record(account_name, tweet, hashtags))

        print("Number of Tweets : "+ str(len(user_tweets)))
        print("Number of tweets without hashtags : "+ str(temp_without_hashtags))
        print("Number of hashtags : "+ str(temp_total_hashtags))

        # Update final counter
        total_hashtags += temp_total_hashtags
        without_hashtags += temp_without_hashtags
        if run_args.max and total_hashtags > run_args.max:
            break

    # Create the output directory on first use instead of failing on open().
    if not os.path.isdir(_GATHERED_DIR):
        os.makedirs(_GATHERED_DIR)
    filename = 'data/gathered/gathering_' +str(len(tweets))+ '_' +str(total_hashtags)+ '.json'
    with open(filename, 'w') as outfile:
        json.dump(tweets, outfile)

    tweet_count = len(tweets)
    with_hashtags = tweet_count - without_hashtags
    print("\n=======================")
    print("Number of tweets : " + str(tweet_count))
    print("With hashtags : "+ str(with_hashtags) + " ("+str(_percentage(with_hashtags, tweet_count))+"%)")
    print("Without hashtags : "+ str(without_hashtags) + " ("+str(_percentage(without_hashtags, tweet_count))+"%)")
    print("Total hashtag : "+ str(total_hashtags)+ " (mean value : "+str(_percentage(total_hashtags, tweet_count))+"%)")
# Script entry point: parse the command-line options and run the gathering.
if __name__ == "__main__":
    main(parse_arguments())