# parts of this code are adapted from https://github.com/windcode/zhihu-crawler-people
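#
# Overview: a small breadth-first crawler over Zhihu's follower graph.
# User url_tokens live in Redis sets: `waiting_set` is the crawl frontier,
# `seeds_all` records every token ever seen (for de-duplication), and
# `info_set` stores fetched user profiles. Each pass pops one seed, fetches
# its profile, then fans out over the follower list pages with a
# multiprocessing Pool and pushes unseen followers back onto the frontier.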
import json
import time
from multiprocessing import Pool
from bs4 import BeautifulSoup as BS
from utils import get_redis_conn
from examples.zhihu.crawler import Crawler
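
# How to run (an assumption inferred from the imports above, not stated in the
# source): start a Redis instance reachable via utils.get_redis_conn, then run
# this file from the haipproxy project root, e.g.
#   python -m examples.zhihu.zhihu_spider
# so that the `examples.zhihu.crawler` import resolves.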

per_page = 20
info_max_process_num = 50
list_max_process_num = 10

host = 'https://www.zhihu.com'
# Redis keys: the crawl frontier, every token ever seen, and stored profiles
waiting_set = 'zhihu:seeds:to_crawl'
seeds_all = 'zhihu:seeds:all'
info_set = 'zhihu:info:user'

# a single shared Crawler instance; concurrent access to it is not handled
common_crawler = Crawler()


def init_db():
    redis_client = get_redis_conn(db=1)
    return redis_client


def get_info(url_token):
    """Fetch and parse the profile of the user identified by url_token."""
    url = '%s/people/%s/answers' % (host, url_token)
    html = common_crawler.get(url)
    print("parsing page's HTML……")
    if not html:
        return None
    s = BS(html, 'html.parser')
    try:
        # the page embeds its state as JSON in the 'data-state' attribute
        data = s.find('div', attrs={'id': 'data'})['data-state']
        data = json.loads(data)
        data = data['entities']['users'][url_token]
    except Exception:
        return None
    # keep only real user accounts (filter by userType)
    if data['userType'] != 'people':
        return None
    return data
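
# Note: the div#data 'data-state' attribute is expected to hold a JSON blob
# shaped roughly like the following (inferred from the lookups in get_info and
# get_per_followers, not an official schema):
#   {"entities": {"users": {"<url_token>": {"userType": "people",
#                                           "followerCount": 123, ...}}}}
#   {"people": {"followersByUser": {"<url_token>": {"ids": ["token1", ...]}}}}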


def get_per_followers(url_token, page, sum_page):
    """Crawl one page of the user's follower list."""
    print('crawling page %d/%d ……' % (page, sum_page))
    followers = list()
    url = '%s/people/%s/followers?page=%d' % (host, url_token, page)
    html = common_crawler.get(url)
    if not html:
        return followers
    s = BS(html, 'html.parser')
    try:
        data = s.find('div', attrs={'id': 'data'})['data-state']
        data = json.loads(data)
        items = data['people']['followersByUser'][url_token]['ids']
    except (AttributeError, TypeError, KeyError):
        return list()
    for item in items:
        # skip placeholders: anonymous entries show up as booleans/None or
        # the literal string '知乎用户' ("Zhihu user")
        if isinstance(item, str) and item != '知乎用户':
            print(item)
            followers.append(item)
    return followers


def get_followers(url_token, follower_count):
    """Get all followers of the specified url_token; return [] if there are none."""
    if follower_count == 0:
        return []
    sum_page = (follower_count - 1) // per_page + 1
    pool = Pool(processes=list_max_process_num)
    results = []
    for page in range(1, sum_page + 1):
        results.append(pool.apply_async(get_per_followers, (url_token, page, sum_page)))
    pool.close()
    pool.join()
    follower_list = []
    for result in results:
        follower_list += result.get()
    return follower_list


def start():
    redis_client = init_db()
    while not redis_client.scard(waiting_set):
        # block while there is no seed in waiting_set
        time.sleep(0.1)
    # fetch a seed from waiting_set
    url_token = redis_client.spop(waiting_set).decode()
    print("crawling %s's user info……" % url_token)
    user = get_info(url_token)
    if not user:
        return
    # store the profile as a JSON string: a redis set member must be a
    # string/bytes value, not a dict
    redis_client.sadd(info_set, json.dumps(user))
    print("crawling %s's followers list……" % url_token)
    try:
        follower_list = get_followers(url_token, user['followerCount'])
    except (TypeError, AttributeError, KeyError):
        return
    push_success_num = 0
    for follower in follower_list:
        if not redis_client.sismember(seeds_all, follower):
            redis_client.sadd(waiting_set, follower)
            redis_client.sadd(seeds_all, follower)
            push_success_num += 1
    print('pushed %d new seeds into %s' % (push_success_num, waiting_set))


if __name__ == '__main__':
    init_seeds = ['excited-vczh', 'resolvewang']
    redis_conn = init_db()
    redis_conn.sadd(waiting_set, *init_seeds)
    redis_conn.sadd(seeds_all, *init_seeds)
    while True:
        start()