# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import time
import re


def getHTMLText(url):
    """
    Fetch the page at url and return its HTML text.
    :param url: page URL to fetch
    :return: HTML text, or an empty string on failure
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Mobile Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def fillUnivList_comments(html):
    """
    Scrape the post listings on one page. Each post is stored as a dict with
    its time, URL, title, and content; all dicts are collected in the list
    comment_result, which is returned.
    :param html: page HTML
    :return: list of post dicts
    """
    url_list = 'http://guba.eastmoney.com'
    comment_result = []
    soup = BeautifulSoup(html, "html.parser")
    for div in soup.find_all('div', attrs={'class': 'articleh normal_post'}):
        try:
            time_ = div.find('span', attrs={'class': 'l5 a5'}).string
            comment_url = url_list + div.find('a')['href']
            title = div.find('a')['title']
            comment = div.find('span', attrs={'class': 'l3 a3'}).string
            comment_result.append({
                '时间': time_,        # time
                '地址': comment_url,  # URL
                '标题': title,        # title
                '内容': comment       # content
            })
        except (AttributeError, KeyError, TypeError):
            # Skip rows missing any of the expected tags or attributes.
            continue
    return comment_result
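

# --- Illustrative sketch (not part of the original script) ---
# fillUnivList_comments() assumes each post row looks roughly like the snippet
# below. The class names are the ones the function queries; the surrounding
# structure is a simplified guess for illustration, and _demo_comments_parse
# is a hypothetical helper never called by the spider itself.
def _demo_comments_parse():
    sample = ('<div class="articleh normal_post">'
              '<span class="l3 a3">'
              '<a href="/news,002415,123456.html" title="Sample post title">Sample post title</a>'
              '</span>'
              '<span class="l5 a5">02-01 10:30</span>'
              '</div>')
    print(fillUnivList_comments(sample))
    # Expected: one dict with the 时间/地址/标题/内容 keys filled from the snippet.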


def fillUnivList_news(html):
    """
    Scrape the news listings on one page. Each item is stored as a dict with
    its time, URL, and title; all dicts are collected in the list news_result,
    which is returned.
    :param html: page HTML
    :return: list of news dicts
    """
    news_result = []
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find('div', attrs={'class': 'datelist'})
    if div is None:
        # No news list on this page (or the fetch upstream failed).
        return news_result
    for a in div.find_all('a'):
        try:
            href = a['href']
            title = a.string
            # News URLs embed the publication date as YYYY-MM-DD.
            match = re.search(r'\d{4}-\d{2}-\d{2}', href)
            news_result.append({
                "时间": match.group(0) if match else ' ',  # time
                "地址": href,                               # URL
                "标题": title                               # title
            })
        except (KeyError, TypeError):
            continue
    return news_result
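

# --- Illustrative sketch (not part of the original script) ---
# The regex above pulls a YYYY-MM-DD date out of each news URL. A quick check
# with a made-up link (the path shape is an assumption; only the embedded date
# format matters):
def _demo_news_date_regex():
    href = 'https://finance.sina.com.cn/stock/s/2020-01-15/doc-xxxxxx.shtml'
    match = re.search(r'\d{4}-\d{2}-\d{2}', href)
    print(match.group(0) if match else 'no date')  # -> 2020-01-15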


def comments_spider(pages_num):
    """
    Scrape the first pages_num pages of posts, write them to
    comments_result.json, and return the number of posts collected.
    :param pages_num: number of pages to scrape
    :return: number of posts collected
    """
    url = 'http://guba.eastmoney.com/list,002415,f.html'
    url_fir = 'http://guba.eastmoney.com/list,002415,f_'
    html = getHTMLText(url)
    comment_result = fillUnivList_comments(html)
    for i in range(2, pages_num + 1):
        if i % 5 == 0:
            print("Scraped comments from " + str(i) + " pages..")
        url_new = url_fir + str(i) + '.html'
        html_new = getHTMLText(url_new)
        comment_result += fillUnivList_comments(html_new)
    result = json.dumps(comment_result, ensure_ascii=False)
    # Write next to the working directory so the script runs on any machine.
    with open('comments_result.json', 'w', encoding='utf-8') as f:
        f.write(result)
    return len(comment_result)
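

# --- Illustrative sketch (not part of the original script) ---
# comments_spider() fires its requests back to back; over ~1000 pages the site
# may throttle or block the client. A gentler variant could pause between
# pages; fetch_pages is a hypothetical helper, and the `pause` value is an
# assumption rather than something the original script uses.
def fetch_pages(url_fir, pages_num, pause=0.5):
    """Yield the HTML of pages 2..pages_num, sleeping `pause` seconds between fetches."""
    for i in range(2, pages_num + 1):
        yield getHTMLText(url_fir + str(i) + '.html')
        time.sleep(pause)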


def news_spider(pages_num):
    """
    Scrape the first pages_num pages of news, write them to news_result.json,
    and return the number of news items collected.
    :param pages_num: number of pages to scrape
    :return: number of news items collected
    """
    news_result = []
    url = 'https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sz002415&Page='
    for i in range(1, pages_num + 1):
        if i % 5 == 0:
            print("Scraped news from " + str(i) + " pages....")
        url_new = url + str(i)
        html = getHTMLText(url_new)
        news_result += fillUnivList_news(html)
    result = json.dumps(news_result, ensure_ascii=False)
    with open('news_result.json', 'w', encoding='utf-8') as f:
        f.write(result)
    return len(news_result)


if __name__ == '__main__':
    print("---------- Program start ----------")
    print()
    start_time = time.time()
    len1 = comments_spider(1000)
    len2 = news_spider(40)
    print("Scraped " + str(len1) + " posts in total.")
    print("Scraped " + str(len2) + " news items in total.")
    end_time = time.time()
    times = end_time - start_time
    print("Elapsed time:", times, "seconds")
    print()
    print("---------- Program end ----------")