weibo_search.py
# coding: utf-8
import re
import json

import requests

# Scrape a small amount of data from m.weibo.cn; no login or cookies needed.
# The containerid parameter is URL-encoded: it decodes to
# "containerid=100103type=2&q=<keyword>".
url_template = "https://m.weibo.cn/api/container/getIndex?type=wb&queryVal={}&containerid=100103type=2%26q%3D{}&page={}"


def clean_text(text):
    """Strip HTML tags, #hashtags# and @mentions from a post's text."""
    text = re.sub(r'<[^>]+>', '', text)   # HTML tags such as <a ...> and <br/>
    text = re.sub(r'#[^#]+#', '', text)   # hashtags, e.g. #topic#
    text = re.sub(r'@[^ ]+ ', '', text)   # @mentions followed by a space
    return text.strip()
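# A quick sanity check for clean_text (hypothetical input; kept in a comment
# so the module has no import-time side effects):
#   clean_text('<a href="/n/u">@user</a> #topic# hello <br/>world')
#   -> 'hello world'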
def fetch_data(query_val, page_id):
    """Fetch one page of search results for a keyword."""
    resp = requests.get(url_template.format(query_val, query_val, page_id))
    card_group = json.loads(resp.text)['data']['cards'][0]['card_group']
    print('url:', resp.url, ' --- count:', len(card_group))

    mblogs = []  # processed posts
    for card in card_group:
        mblog = card['mblog']
        blog = {'mid': mblog['id'],                          # weibo id
                'text': clean_text(mblog['text']),           # post text
                'userid': str(mblog['user']['id']),          # user id
                'username': mblog['user']['screen_name'],    # user name
                'reposts_count': mblog['reposts_count'],     # reposts
                'comments_count': mblog['comments_count'],   # comments
                'attitudes_count': mblog['attitudes_count']  # likes
                }
        mblogs.append(blog)
    return mblogs
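# Note: the requests.get() call above has no timeout, so a stalled connection
# blocks forever. A common hardening (optional, not in the original) is:
#   resp = requests.get(url_template.format(query_val, query_val, page_id),
#                       timeout=10)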
def remove_duplication(mblogs):
    """De-duplicate posts by their weibo id (mid)."""
    mid_set = set()
    new_blogs = []
    for blog in mblogs:
        if blog['mid'] not in mid_set:
            new_blogs.append(blog)
            mid_set.add(blog['mid'])
    return new_blogs
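# Equivalent one-liner: Python 3.7+ dicts keep insertion order, so building a
# dict keyed by mid and taking its values de-duplicates while preserving order
# (an alternative sketch; same observable result here, since duplicate mids
# carry identical posts):
#   new_blogs = list({blog['mid']: blog for blog in mblogs}.values())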
def fetch_pages(query_val, page_num):
    """Fetch multiple pages of search results for a keyword."""
    mblogs = []
    for page_id in range(1, page_num + 1):
        try:
            mblogs.extend(fetch_data(query_val, page_id))
        except Exception as e:
            print(e)

    print("Before dedup:", len(mblogs))
    mblogs = remove_duplication(mblogs)
    print("After dedup:", len(mblogs))

    # Save the results to result_<keyword>.json.
    with open('result_{}.json'.format(query_val), 'w', encoding='utf-8') as fp:
        json.dump(mblogs, fp, ensure_ascii=False, indent=4)
    print("Saved to result_{}.json".format(query_val))


if __name__ == '__main__':
    fetch_pages('谷歌', 50)
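# Reading the saved results back afterwards (a minimal sketch; the file name
# follows the result_<keyword>.json pattern used above):
#   with open('result_谷歌.json', encoding='utf-8') as fp:
#       blogs = json.load(fp)
#   print(len(blogs), blogs[0]['text'])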