-
Notifications
You must be signed in to change notification settings - Fork 0
/
zd.py
81 lines (68 loc) · 2.51 KB
/
zd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import requests
from bs4 import BeautifulSoup
from exts import db
from models import Article
from sqlalchemy import or_, func
# Listing page where the crawl starts; also used as the base when building
# "/page/<n>" URLs for subsequent listing pages.
dw_url = "http://www.zdfans.com/apk"
def dw_page(url, timeout=10):
    """Download the resource at *url* and return its raw body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    # Send a desktop-browser User-Agent so the request looks like a browser.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
def parse_html(html):
    """Collect not-yet-stored article links from one listing page.

    Links are read in page order from the ``<ul class="excerpt">`` list. As
    soon as a link is found that already has an ``Article`` row whose ``src``
    equals it, collection stops and no next page is reported.

    Args:
        html: Markup of a listing page.

    Returns:
        A ``(links, next_page)`` tuple. ``links`` is the list of new article
        URLs gathered from this page. ``next_page`` is the URL of the
        following listing page, or ``None`` when the crawl should stop (a
        stored article was reached, the listing or pagination marker is
        missing, or the current page marker has no following sibling).
    """
    soup = BeautifulSoup(html, 'lxml')
    software_href_list = []

    software_list_soup = soup.find('ul', attrs={'class': 'excerpt'})
    # find() yields None when the element is absent; treat that as an empty
    # listing instead of failing with AttributeError.
    if software_list_soup is None:
        return software_href_list, None

    # Walk every entry of the listing.
    for software_li in software_list_soup.find_all('li'):
        detail = software_li.find('a')
        # Skip list items that carry no usable link.
        if detail is None or not detail.get('href'):
            continue
        software_href = detail['href']
        print(software_href)
        aid = db.session.query(func.count(Article.aId)).filter(Article.src == software_href).scalar()
        print(aid)
        if aid:
            # This article is already stored: stop here and do not continue
            # to further pages.
            return software_href_list, None
        software_href_list.append(software_href)

    current = soup.find('span', attrs={'class': 'current'})
    # No pagination marker, or nothing after the current page marker, means
    # there is no next page.
    if current is None or not current.next_sibling:
        return software_href_list, None

    page = int(current.getText()) + 1
    return software_href_list, dw_url + "/page/" + str(page)
def nextparse(href):
    """Pull the body markup, title and date out of an article page.

    Args:
        href: Markup of the article page to parse (despite the name, the
            value is handed straight to BeautifulSoup as a document).

    Returns:
        A ``(content, title, date)`` tuple: the ``<div class="entry">``
        element rendered as a string, the text of the link inside
        ``<h1 class="meta-tit">``, and characters 1 through 10 of the
        ``<p class="meta-info">`` text.
    """
    page = BeautifulSoup(href, 'lxml')

    entry_markup = str(page.find('div', attrs={'class': 'entry'}))

    heading = page.find('h1', attrs={'class': 'meta-tit'})
    title_text = heading.find('a').getText()

    meta_text = page.find('p', attrs={'class': 'meta-info'}).getText()
    # Skip the first character and keep the next ten
    # (presumably a YYYY-MM-DD date -- confirm against the site markup).
    published = meta_text[1:11]
    print(published)

    return entry_markup, title_text, published
def content_main():
    """Crawl the listing pages starting at ``dw_url`` and gather articles.

    Each listing page is downloaded and parsed for new article links; every
    link is then downloaded and parsed in turn. Crawling continues while
    ``parse_html`` reports a next page.

    Returns:
        A list of dicts with the keys ``content``, ``title``, ``src``,
        ``date`` and ``mid`` (always the string ``'8'``), one per article.
    """
    records = []
    listing_url = dw_url
    while listing_url:
        links, listing_url = parse_html(dw_page(listing_url))
        for link in links:
            body, heading, published = nextparse(dw_page(link))
            records.append({
                'content': body,
                'title': heading,
                'src': link,
                'date': published,
                'mid': '8',
            })
    return records
# Run a single crawl when executed as a script; the returned list is not
# used here.
if __name__ == "__main__":
    content_main()