# user_info.py
'''
This code collects user page hyperlinks of those users who have answered questions
on stackoverflow.com.
Date : 21 November, 2015
Author : SiddharthaAnand
'''
import sys
import time
import random
import urllib2
from bs4 import BeautifulSoup
# Module-level log that extract_user_hyperlinks() writes exception details to.
# Opened with mode "w", so an existing log is truncated every time this module loads.
# open() is used instead of the file() constructor, as the Python docs recommend.
log_file = open("log_data_user_links", "w")

# Input file read by read_q_hyperlinks(): question hyperlinks, one per line.
filename = "q_hyperlink.txt"
def read_q_hyperlinks(path=None, start_line=2000):
    '''Return the question hyperlinks stored one per line in a text file.

    path       -- file to read; when None the module-level ``filename`` is used.
    start_line -- 1-based number of the first line to keep; every earlier line
                  is skipped. The default (2000) is the offset that used to be
                  hard-coded here.

    Each kept line is returned with surrounding whitespace stripped. If the
    file has fewer than ``start_line`` lines the result is an empty list.
    '''
    if path is None:
        path = filename
    hyperlink_list = []
    # "with" guarantees the handle is closed even if reading raises.
    with open(path, "r") as f:
        for line_no, line in enumerate(f, 1):
            if line_no >= start_line:
                hyperlink_list.append(line.strip())
    return hyperlink_list
def extract_user_hyperlinks(question_hyperlinks):
'''This method takes as input list of hyperlinks of questions on stackoverflow.com
and returns the list of userpage hyperlinks.
'''
f = open("user_hyperlinks.txt", "a+")
user_hyperlink_list = []
upvote_list = []
request_count = 0
ques_no = 0
while( len(question_hyperlinks) != 0 ):
try:
#Create request using urllib2 request
print "=============================================================================="
request_count += 1
ques_no += 1
relative_url = question_hyperlinks.pop(0)
current_url = "http://www.stackoverflow.com" + relative_url
print "Request no ", request_count
print "frontier length ", len(question_hyperlinks)
print "Sending Request to ", current_url, " at time ", time.asctime()
req = urllib2.Request(current_url)
req.add_header('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0')
page_content = urllib2.urlopen(req)
#page_content = urllib2.urlopen(current_url, timeout=timeout)
print "Received response from ", current_url, " at time ", time.asctime()
soup = BeautifulSoup(page_content)
#Collect all answers
answers_tag = soup.findAll('div', attrs={'id' : 'answers'})
#extract different parts from the answer html
a_upvote_tag = answers_tag[0].findAll('span', attrs={'itemprop' : 'upvoteCount'})
for upvote in a_upvote_tag:
print "Upvote loop"
upvote_list.append(str(upvote.text.encode("UTF-8")))
print upvote
user_details_tag = answers_tag[0].findAll('div', attrs={'class' : 'user-details'})
for index in range(len(user_details_tag)):
if user_details_tag[index].a == None:
continue
user_hyperlink_list.append(user_details_tag[index].a['href'])
print user_details_tag[index].a['href']
#write to file
for vote, link in zip(upvote_list, user_hyperlink_list):
print str(ques_no) + "#" + str(vote) + "#" + str(link)
print >>f, str(ques_no) + "#" + str(vote) + "#" + str(link)
user_hyperlink_list = []
upvote = []
except KeyboardInterrupt as ki:
f.close()
log_file.close()
print "All files closed"
sys.exit(-1)
except BaseException as e:
print "Exception caught at time ", time.asctime()
print "Exception tuple ", e
sleep_time = random.randint(5, 10)
print "Sleeping for ", sleep_time, "seconds"
time.sleep(sleep_time)
#printing in log
print >>log_file, "Exception caught at time ", time.asctime()
print >>log_file, "Exception tuple ", e
print >>log_file, "Sleeping for ", sleep_time, "seconds"
if __name__ == '__main__':
    # Script entry point: load the stored question links, then crawl each
    # question page for the profile links of the users who answered it.
    extract_user_hyperlinks(read_q_hyperlinks())