-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatagen-es.py
84 lines (65 loc) · 2.79 KB
/
datagen-es.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import configparser
import TwitterAPI
import sys
import pandas as pd
import numpy as np
def get_census_names(male_path="male_es.txt", female_path="female_es.txt"):
    """Load the male and female first-name lists.

    Each file is read with ``pandas.read_csv`` as a header-less CSV whose
    single column is labelled ``Name``.

    Args:
        male_path: File holding the male names. Defaults to ``male_es.txt``,
            the path this script has always read.
        female_path: File holding the female names. Defaults to
            ``female_es.txt``.

    Returns:
        A ``(male_names, female_names)`` tuple of NumPy arrays containing the
        ``Name`` column of each file, in file order.
    """
    def _load_names(path):
        # The files carry no header row, so the column name is supplied here.
        frame = pd.read_csv(path, header=None, names=["Name"])
        return np.array(frame["Name"])

    return _load_names(male_path), _load_names(female_path)
# Load both name lists once at import time; the rest of the script uses
# these module-level arrays.
male_names, female_names = get_census_names()

# Sanity report: how many names were loaded for each list ...
print('Number of Male Names:', male_names.shape[0])
print('Number of Female Names:', female_names.shape[0])
# ... followed by the first five entries of each list.
print('male name sample:', [*male_names[:5]])
print('female name sample:', [*female_names[:5]])
def get_twitter(config_file):
    """Build a TwitterAPI client from the credentials in *config_file*.

    Args:
        config_file: Path of an INI-style file readable by ``configparser``.

    Returns:
        A ``TwitterAPI.TwitterAPI`` instance constructed from the four
        credential strings.

    NOTE(review): the section name ``twitter`` and the four option names
    below are assumed, since the previous code never read the parsed config.
    Confirm them against the real ``twitter.cfg`` layout.
    """
    config = configparser.ConfigParser()
    config.read(config_file)
    credential_options = (
        'consumer_key',
        'consumer_secret',
        'access_token',
        'access_token_secret',
    )
    # raw=True stops '%' characters in a secret from being treated as
    # interpolation syntax. fallback='' means a missing file, section or
    # option yields an empty credential instead of raising.
    tokens = [
        config.get('twitter', option, raw=True, fallback='')
        for option in credential_options
    ]
    return TwitterAPI.TwitterAPI(*tokens)
# Module-level API client, built once from the local credentials file.
twitter = get_twitter(config_file='twitter.cfg')
def get_first_name(tweet):
    """Return the lower-cased first word of the tweet author's display name.

    Looks at ``tweet['user']['name']`` and splits it on whitespace. Returns
    ``None`` when the tweet has no ``user`` entry, the user has no ``name``
    entry, or the name contains no non-whitespace characters.
    """
    if 'user' not in tweet or 'name' not in tweet['user']:
        return None
    words = tweet['user']['name'].split()
    return words[0].lower() if words else None
def get_first_uid(tweet):
    """Return the first whitespace-separated token of the author's id string.

    Reads ``tweet['user']['id_str']``. Returns ``None`` when the tweet has no
    ``user`` entry, the user has no ``id_str`` entry, or the id string is
    empty or whitespace only.
    """
    if 'user' not in tweet or 'id_str' not in tweet['user']:
        return None
    pieces = tweet['user']['id_str'].split()
    # next() with a default yields None for an empty token list.
    return next(iter(pieces), None)
def get_first_text(tweet):
    """Return the value stored under ``'text'`` in *tweet*, or ``None`` if absent."""
    return tweet['text'] if 'text' in tweet else None
def _flatten_for_tsv(text):
    """Replace tabs and newlines with spaces and drop carriage returns."""
    return text.replace('\t', ' ').replace('\n', ' ').replace('\r', '')


def sample_tweets(twitter, limit, male_names, female_names):
    """Stream geo-filtered tweets and log those whose author's first name is listed.

    For every streamed message that has a ``user`` entry, the author's first
    name is looked up in *male_names* and *female_names*. Each match appends
    one ``<user id>\\t<flattened text>\\t<label>`` line to ``tweets_es.txt``.
    A name present in both lists produces two lines, one per label. Every
    streamed message is also printed, and the loop sleeps one second per
    message.

    Args:
        twitter: Client object exposing ``request(resource, params)`` that
            returns an iterable of tweet dictionaries.
        limit: Stop once at least this many lines have been recorded.
        male_names: Iterable of lower-case male first names.
        female_names: Iterable of lower-case female first names.

    Returns:
        List of user id strings, one entry per line written to the file.
    """
    import time

    # Build the lookup sets once so each membership test is O(1) rather than
    # a scan of the whole name array for every tweet.
    male_lookup = set(male_names)
    female_lookup = set(female_names)

    ids = []
    while len(ids) < limit:
        try:
            # 'locations' is a bounding box given as SW lon/lat then NE
            # lon/lat. These coordinates cover roughly the Iberian Peninsula,
            # not the U.S.
            stream = twitter.request(
                'statuses/filter',
                {'locations': '-9.868,35.759,2.907,43.018'},
            )
            for response in stream:
                time.sleep(1)
                print(response)
                if 'user' not in response:
                    continue

                name = get_first_name(response)
                uid = get_first_uid(response)
                text = get_first_text(response)
                # Skip messages missing any field the output line needs,
                # instead of raising while the line is being built.
                if name is None or uid is None or text is None:
                    continue

                # NOTE(review): the labels are inconsistent ('men' is plural,
                # 'woman' is singular) but are kept byte-for-byte because
                # existing data files and downstream readers may rely on them.
                labels = []
                if name in male_lookup:
                    labels.append('men')
                if name in female_lookup:
                    labels.append('woman')
                if not labels:
                    continue

                flat_text = _flatten_for_tsv(text)
                # Explicit UTF-8 so accented characters and emoji are written
                # the same way regardless of the platform's default encoding.
                with open("tweets_es.txt", "a", encoding="utf-8") as out:
                    for label in labels:
                        out.write(uid + '\t' + flat_text + '\t' + label + '\n')
                        ids.append(uid)

                if len(ids) >= limit:
                    return ids
        except Exception:
            # Catch Exception rather than everything so Ctrl-C and SystemExit
            # still stop the script; any other failure is reported and the
            # stream is reopened by the enclosing loop.
            print("Unexpected error:", sys.exc_info())
    return ids
# Stream and record tweets from authors whose first name is in either list.
tweets = sample_tweets(twitter, 100000, male_names, female_names)
# Guard against a None result so the summary line cannot fail with a
# TypeError inside len().
print('Number of tweets of users in census list:', len(tweets) if tweets is not None else 0)