'''This script reads in the URLs of user profile pages and extracts user information from each page.
'''
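# NOTE: this script targets Python 2 (urllib2, print statements) and needs
# BeautifulSoup4 installed. It reads its frontier from user_hyperlinks.txt,
# appends one record per user to users_stackoverflow.txt, and logs errors to
# log_data_question_text.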
import time
import random
import urllib2
from bs4 import BeautifulSoup

log_file = open("log_data_question_text", "w")
users_stackoverflow = open("users_stackoverflow.txt", 'a')

def read_links():
    '''Return the list of relative user URLs stored in user_hyperlinks.txt.'''
    link_file = open("user_hyperlinks.txt", "r")
    lines = link_file.readlines()
    link_file.close()
    user_hyperlinks = []
    for line in lines:
        # Each line is "#"-delimited; field index 2 holds the relative URL.
        user_hyperlinks.append(line.strip().split("#")[2])
    return user_hyperlinks
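
# A hypothetical input line for illustration (only field index 2, the relative
# profile URL, is actually used; the first two fields are not specified here):
#   <field0>#<field1>#/users/12345/example-user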

def send_request(current_url):
    '''Fetch current_url with a browser User-Agent; on failure, log the
    exception, back off for a few seconds, and re-raise.'''
    try:
        print "=============================================================================="
        req = urllib2.Request(current_url)
        req.add_header('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0')
        page_content = urllib2.urlopen(req)
    except Exception as e:
        print "Exception caught at time ", time.asctime()
        print "Exception tuple ", e
        sleep_time = random.randint(5, 10)
        print "Sleeping for ", sleep_time, "seconds"
        time.sleep(sleep_time)
        # Record the failure in the log file as well.
        print >>log_file, "Exception caught at time ", time.asctime()
        print >>log_file, "Exception tuple ", e
        print >>log_file, "Sleeping for ", sleep_time, "seconds"
        # Re-raise so the caller can skip this URL; falling through would
        # return an unassigned page_content and raise a NameError.
        raise
    return page_content

def extract(user_hyperlinks):
    '''Crawl each user profile in the frontier and append one "#"-delimited
    record per user to users_stackoverflow.txt.'''
    while len(user_hyperlinks) != 0:
        print "Frontier remaining", len(user_hyperlinks)
        relative_url = user_hyperlinks.pop(0)
        # Skip non-user links; the size check appears to resume a partially
        # finished crawl by discarding entries beyond a fixed checkpoint.
        if "/users/" not in relative_url or len(user_hyperlinks) > 17287:
            continue
        user_id = relative_url.split("/")[2]
        current_url = "http://www.stackoverflow.com" + relative_url
        try:
            page_content = send_request(current_url)
        except BaseException as bee:
            print "Exception", bee
            continue
        # Parse the profile page with bs4.
        soup = BeautifulSoup(page_content, "html.parser")
        # Extract user information: username, location, questions answered,
        # reputation, and the user's top tags.
        username_tag = soup.findAll('h2', attrs={'class': 'user-card-name'})
        try:
            location_tag = soup.findAll('ul', attrs={'class': 'list-unstyled'})[0]
        except Exception as e:
            print "Exception caught ", e
            print >>log_file, e
            continue
        question_answered_tag = soup.find('span', attrs={'class': 'number'})
        reputation_tag = soup.find('div', attrs={'class': 'reputation'})
        top_tags_tag = soup.findAll('a', attrs={'class': 'post-tag'})
        # Pull plain-text values out of the tags.
        username = str(username_tag[0].text.encode("UTF-8")).strip().split("\n")[0]
        location = str(location_tag.text.split("\n")[3].encode("UTF-8")).strip()
        q_answered = str(question_answered_tag.text.encode("UTF-8"))
        reputation = str(reputation_tag.text.encode("UTF-8")).strip().split()[0]
        # Collect the list of tags connected to the user.
        buff = ""
        for tag in top_tags_tag:
            buff = buff + str(tag.text.encode("UTF-8")) + "#"
        # Write the record to file and echo it to stdout.
        record = user_id + "#" + username + "#" + location + "#" + q_answered + "#" + reputation + "#" + buff
        print >>users_stackoverflow, record
        print record
    users_stackoverflow.close()

if __name__ == '__main__':
    user_hyperlinks = read_links()
    try:
        extract(user_hyperlinks)
    except BaseException as be:
        print >>log_file, be
    finally:
        users_stackoverflow.close()
        log_file.close()
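
# A minimal way to run this (assuming a Python 2 interpreter and a
# user_hyperlinks.txt file in the working directory):
#
#   $ python2 user_information.py
#
# Records are appended to users_stackoverflow.txt; errors are written to
# log_data_question_text.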