-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnet.py
241 lines (209 loc) · 8.72 KB
/
net.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
from lxml import etree
import json
import requests
import time
import re
import datetime
from flask import Flask, render_template
import os
# Flask application object; templates are looked up in the "templates" folder.
app = Flask(__name__, template_folder="templates")

# Notion settings are read from the environment variables "database" and "api".
# When a variable is missing the fallback is the literal string "None"
# (not the None object), so the values are always strings.
notion_database_id = os.getenv("database", "None")
notion_api_token = os.getenv("api", "None")
@app.route("/")
def index():
    """Render and return the ``22.html`` template for the site root."""
    # decodedText = request.form.get('decodedText')
    landing_page = render_template("22.html")
    return landing_page
@app.route("/isbn/<isbn>")
def get_book_info(isbn):
    """Import the book identified by *isbn* and report the outcome to the client.

    Args:
        isbn: ISBN taken from the URL path. Hyphens and surrounding whitespace
            are ignored; the remainder must be an ISBN-10 (nine digits plus a
            digit or ``X``) or an ISBN-13 (thirteen digits).

    Returns:
        A Flask response: plain text on success, or a ``(text, status)`` tuple
        with status 400 for a malformed ISBN and 502 when the import reports
        failure.
    """
    normalized = isbn.replace("-", "").strip().upper()
    # The value ends up inside an outgoing URL, so accept only real ISBN shapes.
    if not re.fullmatch(r"\d{9}[\dX]|\d{13}", normalized):
        return "无效的 ISBN", 400

    outcome = getResqutes("https://douban.com/isbn/" + normalized)
    # Only an explicit False counts as failure; any other result (including
    # None) is treated as "processed" so the view always returns a response.
    if outcome is False:
        return "导入失败,请查看服务端日志", 502
    return "已处理 ISBN " + normalized
def _xpath_text(html, expression):
    """Return the text nodes matched by *expression*, concatenated and stripped."""
    return "".join(html.xpath(expression)).strip()


def _info_value(html, label, position=1):
    """Return the *position*-th text node following the ``<span>`` whose text is *label*."""
    expression = '//span[./text()="{0}"]/following::text()[{1}]'.format(
        label, position
    )
    return _xpath_text(html, expression)


def _parse_price(text):
    """Return the first decimal number found in *text* as a float, or 0.0."""
    match = re.search(r"\d+\.?\d*", text)
    return float(match.group(0)) if match else 0.0


def _parse_pages(text):
    """Return the first integer found in *text*, or 0 when there is none."""
    match = re.search(r"\d+", text)
    return int(match.group(0)) if match else 0


def _parse_score(text):
    """Return *text* as a float, or None when it is not a number (e.g. no rating)."""
    try:
        return float(text)
    except ValueError:
        return None


def _format_publish_date(text):
    """Turn a free-form publication date into ``YYYY-MM-DD``.

    Year and month give the first day of that month, a lone year gives
    January 1st of that year, and no digits at all give the fixed default
    ``2023-01-01``.
    """
    parts = re.findall(r"\d+", text)
    if len(parts) >= 2:
        return "{0}-{1}-01".format(parts[0], parts[1].zfill(2))
    if len(parts) == 1:
        return parts[0] + "-01-01"
    return "2023-01-01"


def _extract_tags(page_text, limit):
    """Return up to *limit* tags from the page's ``criteria = '...'`` assignment.

    The criteria string is split on ``|``; only entries prefixed with ``7:``
    are tags. Returns an empty list when the assignment is absent.
    """
    match = re.search("criteria = '(.+)'", page_text)
    if match is None:
        return []
    prefix = "7:"
    tags = [
        item[len(prefix):]
        for item in match.group(1).split("|")
        if item.startswith(prefix)
    ]
    # Drop empty names and cap the list instead of indexing fixed positions.
    return [tag for tag in tags if tag][:limit]


def _rich_text(content):
    """Build a Notion ``rich_text`` property value holding *content*."""
    return {"rich_text": [{"type": "text", "text": {"content": content}}]}


def getResqutes(url, max_tags=5, timeout=10):
    """Scrape a Douban book page and create a page for it in the Notion database.

    Args:
        url: Address of the Douban book page (redirects are followed by
            ``requests``; the final URL is stored in the "豆瓣" property).
        max_tags: Maximum number of Douban tags copied into the "标签"
            multi-select. The default of 5 matches the five entries the
            payload previously hard-coded.
        timeout: Timeout in seconds applied to both HTTP requests.

    Returns:
        True when the Notion API answered with HTTP 200, otherwise False
        (fetch failure, page without a book title, or Notion rejecting the
        payload). Progress and errors are printed to stdout.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
    }
    try:
        data = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print("获取豆瓣页面失败:", error)
        return False
    if data.status_code != 200:
        print("获取豆瓣页面失败,HTTP 状态码:", data.status_code)
        return False

    html = etree.HTML(data.content)
    book_name = (
        _xpath_text(html, "//*[@id='mainpic']/a/@title") if html is not None else ""
    )
    if not book_name:
        # Without a title this is not a book page; do not push an empty row.
        print("未在豆瓣页面中找到图书信息:", url)
        return False

    book_img = _xpath_text(html, "//*[@id='mainpic']/a/img/@src")

    # Authors: primary location first, then the labelled fallback.
    authors = html.xpath("//*[@id='info']/span[1]/a/text()")
    if not "".join(authors):
        fallback = html.xpath('//span[text()="作者:"]/../a[1]/text()')
        # Keep the cleaned value as a one-element list so the join below
        # separates authors, not the individual characters of one name.
        authors = [re.sub(r"\s+", "", "".join(fallback))]
    author_name = " ".join(authors).strip()

    translator = " ".join(
        html.xpath('//*[@id="info"]/span[contains(.,"译者")]/descendant::a/text()')
    ).strip()

    # Publisher: value is usually the second following text node (inside a
    # link); fall back to the first one when that is blank.
    press = _info_value(html, "出版社:", 2) or _info_value(html, "出版社:", 1)
    substitle = _info_value(html, "副标题:")
    press_year = _info_value(html, "出版年:")
    ISBN = _info_value(html, "ISBN:")
    design = _info_value(html, "装帧:")
    brand = _info_value(html, "出品方:", 2)
    series = _info_value(html, "丛书:", 2)

    price = _parse_price(_info_value(html, "定价:"))
    pages = _parse_pages(_info_value(html, "页数:"))
    score = _parse_score(
        _xpath_text(html, "//*[@id='interest_sectl']/div/div[2]/strong/text()")
    )
    tags = _extract_tags(data.text, max_tags)

    # Producer, series and binding default to "无" when the page omits them.
    brand = brand or "无"
    series = series or "无"
    design = design or "无"

    pubilishDate = _format_publish_date(press_year)
    # Purchase date defaults to today (local time).
    purchase_date = time.strftime("%Y-%m-%d", time.localtime())

    cover_files = []
    if book_img:
        cover_files.append(
            {
                "name": "testname",
                "type": "external",
                "external": {"url": book_img},
            }
        )

    properties = {
        "书名": {"title": [{"type": "text", "text": {"content": book_name}}]},
        "副标题": _rich_text(substitle),
        "出版社": {"select": {"name": press}},
        "作者": _rich_text(author_name),
        "译者": _rich_text(translator),
        "ISBN": _rich_text(ISBN),
        "丛书": _rich_text(series),
        "装帧": _rich_text(design),
        "出品方": {"select": {"name": brand}},
        "册数": {"number": 1},
        "定价": {"number": price},
        "页数": {"number": pages},
        "豆瓣": {"url": data.url},
        "出版日": {"date": {"start": pubilishDate, "end": None}},
        "时间": {"date": {"start": purchase_date, "end": None}},
        # The dict literal used to list "封面" twice; only this "files" form
        # ever took effect, so it is the one kept.
        "封面": {"files": cover_files},
        "标签": {"multi_select": [{"name": tag} for tag in tags]},
    }
    if score is not None:
        # Unrated books have no score; leave the property unset for them.
        properties["豆瓣评分"] = {"number": score}

    body = {
        "parent": {"type": "database_id", "database_id": notion_database_id},
        "properties": properties,
    }

    # Send the new page to the Notion API.
    try:
        NotionData = requests.post(
            "https://api.notion.com/v1/pages",
            json=body,
            headers={
                "Authorization": "Bearer " + notion_api_token,
                "Notion-Version": "2021-05-13",
            },
            timeout=timeout,
        )
    except requests.RequestException as error:
        print("请求 Notion API 失败:", error)
        return False

    if NotionData.status_code == 200:
        print("导入信息成功,图书信息为:")
        print(
            book_name,
            author_name,
            press,
            pubilishDate,
            pages,
            price,
            brand,
            series,
            design,
            ISBN,
            sep=" | ",
        )
        print(
            "-------------------------------------------------------------------------------------------------"
        )
        return True

    print("导入失败,换本书试试,或检查脚本内Body内容与Notion书库字段:")
    print(NotionData.text)
    return False
if __name__ == "__main__":
    # Run the Flask server on all interfaces, port 516, when executed as a script.
    server_options = {"host": "0.0.0.0", "port": 516}
    app.run(**server_options)
    # Disabled alternative kept for reference: a pywsgi WSGIServer with TLS files.
    # server = pywsgi.WSGIServer(('0.0.0.0',8080),app,keyfile='./static/cert/localhost+3.pem',certfile='./static/cert/localhost+3-key.pem')
    # server.serve_forever()