Skip to content

Commit

Permalink
javdb: 抓取数据时检测FC2封面图是否真的存在
Browse files Browse the repository at this point in the history
  • Loading branch information
Yuukiy committed Apr 1, 2024
1 parent 2e80f87 commit 1ca6b8c
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 73 deletions.
26 changes: 26 additions & 0 deletions unittest/data/FC2-2735981 (javdb).json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"dvdid": "FC2-2735981",
"cid": null,
"url": "https://javdb365.com/v/d25M0",
"plot": null,
"cover": null,
"big_cover": null,
"genre": null,
"genre_id": null,
"genre_norm": null,
"score": "7.26",
"title": "高額寄せ集め2",
"ori_title": null,
"magnet": null,
"serial": null,
"actress": null,
"actress_pics": null,
"director": null,
"duration": null,
"producer": null,
"publisher": null,
"uncensored": null,
"publish_date": "2022-03-19",
"preview_pics": null,
"preview_video": null
}
35 changes: 10 additions & 25 deletions unittest/data/FC2-3189680 (javdb).json
Original file line number Diff line number Diff line change
@@ -1,41 +1,26 @@
{
"dvdid": "FC2-3189680",
"cid": null,
"url": "https://javdb.com/v/rmVapJ",
"url": "https://javdb365.com/v/rmVapJ",
"plot": null,
"cover": "https://c0.jdbstatic.com/covers/rm/rmVapJ.jpg",
"big_cover": null,
"genre": [
"私人攝影",
"制服",
"內射"
],
"genre_id": [
"fc2?c1=11",
"fc2?c1=19",
"fc2?c1=18"
],
"genre": null,
"genre_id": null,
"genre_norm": null,
"score": "8.04",
"score": "7.98",
"title": "【体育館倉庫】某ハーフ子役モデルを高額援助。計2回のゴムなし大量中出し。※4K特典(1時間越え)",
"ori_title": null,
"magnet": [
"magnet:?xt=urn:btih:6d4fed9103648ab2e4a22697f363bd243f7feffa&dn=FC2-3189680",
"magnet:?xt=urn:btih:4dea0950176f3c5af0f14ad96528f79d7bdc48fa&dn=FC2PPV 3189680"
],
"magnet": null,
"serial": null,
"actress": [
"永瀬ゆい"
],
"actress": null,
"actress_pics": null,
"director": null,
"duration": "49",
"producer": "体育館倉庫",
"duration": null,
"producer": null,
"publisher": null,
"uncensored": null,
"publish_date": "2023-02-20",
"preview_pics": [
"https://c0.jdbstatic.com/samples/rm/rmVapJ_l_0.jpg"
],
"preview_video": "https://javdb.com/v/rmVapJ"
"preview_pics": null,
"preview_video": null
}
57 changes: 10 additions & 47 deletions unittest/data/FC2-985469 (javdb).json
Original file line number Diff line number Diff line change
@@ -1,63 +1,26 @@
{
"dvdid": "FC2-985469",
"cid": null,
"url": "https://javdb.com/v/nzA44",
"url": "https://javdb365.com/v/nzA44",
"plot": null,
"cover": "https://c0.jdbstatic.com/covers/nz/nzA44.jpg",
"big_cover": null,
"genre": [
"私人攝影",
"素人",
"內射",
"原作",
"無碼",
"角色扮演",
"戀物癖",
"可愛"
],
"genre_id": [
"fc2?c1=11",
"fc2?c1=26",
"fc2?c1=18",
"fc2?c1=7",
"fc2?c1=24",
"fc2?c1=9",
"fc2?c1=13",
"fc2?c1=20"
],
"genre": null,
"genre_id": null,
"genre_norm": null,
"score": "8.82",
"score": "8.80",
"title": "【個人撮影・無】JD2回生ちゃんに中出し!エロマンガ先生のパジャマコスで中出しえっちさせててもらいました♪",
"ori_title": null,
"magnet": [
"magnet:?xt=urn:btih:903ecbf73fd1a466e11e9454388c77c854f2463f&dn=FC2PPV-985469",
"magnet:?xt=urn:btih:851c21dab8d9a4883e8940240107aa94e5e0905d&dn=FC2-PPV-985469",
"magnet:?xt=urn:btih:649d36f0fd470f5950a1edc6fbb5922a1a82a39e&dn=[7sht.me]FC2PPV-985469",
"magnet:?xt=urn:btih:b2a57edade565821bd611b89dfca1f8d0e63e891&dn=fc2-985469",
"magnet:?xt=urn:btih:6be7ae12c4be3a0f7e27d1720b4e23e1134dcfd1&dn=FC2-985469",
"magnet:?xt=urn:btih:e2eaaaa085d14a010ff014991f3ef8ac48954920&dn=FC2-PPV-985469-HD",
"magnet:?xt=urn:btih:fc56fea5e868c3c28f2e3cd1731adc347c55267c&dn=FC2-PPV-983579-985469-纱雾"
],
"magnet": null,
"serial": null,
"actress": [],
"actress": null,
"actress_pics": null,
"director": null,
"duration": "113",
"producer": "COS☆ぱこ",
"duration": null,
"producer": null,
"publisher": null,
"uncensored": null,
"publish_date": "2018-11-23",
"preview_pics": [
"https://c0.jdbstatic.com/samples/nz/nzA44_l_0.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_1.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_2.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_3.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_4.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_5.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_6.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_7.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_8.jpg",
"https://c0.jdbstatic.com/samples/nz/nzA44_l_9.jpg"
],
"preview_video": "https://javdb.com/v/nzA44"
"preview_pics": null,
"preview_video": null
}
12 changes: 12 additions & 0 deletions web/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ def __init__(self, use_scraper=False) -> None:
self.scraper = None
self.__get = requests.get
self.__post = requests.post
self.__head = requests.head
else:
self.scraper = cloudscraper.create_scraper()
self.__get = self._scraper_monitor(self.scraper.get)
self.__post = self._scraper_monitor(self.scraper.post)
self.__head = self._scraper_monitor(self.scraper.head)

def _scraper_monitor(self, func):
"""监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求"""
Expand Down Expand Up @@ -82,6 +84,16 @@ def post(self, url, data, delay_raise=False):
r.raise_for_status()
return r

def head(self, url, delay_raise=True):
    """Send a HEAD request to ``url`` using this object's request settings.

    The request is issued with the instance's ``headers``, ``proxies``,
    ``cookies`` and ``timeout``.

    When ``delay_raise`` is true (the default), the response is returned
    without any status check, leaving it to the caller to inspect
    ``status_code``. When it is false, ``raise_for_status()`` is called on
    the response before it is returned.
    """
    send_head = self.__head
    options = {
        'headers': self.headers,
        'proxies': self.proxies,
        'cookies': self.cookies,
        'timeout': self.timeout,
    }
    response = send_head(url, **options)
    if delay_raise:
        # Caller asked to handle the status itself.
        return response
    response.raise_for_status()
    return response

def get_html(self, url):
r = self.get(url)
html = resp2html(r)
Expand Down
7 changes: 6 additions & 1 deletion web/javdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ def parse_clean_data(movie: MovieInfo):
"""解析指定番号的影片数据并进行清洗"""
try:
parse_data(movie)
# 检查封面URL是否真的存在对应图片
if movie.cover is not None:
r = request.head(movie.cover)
if r.status_code != 200:
movie.cover = None
except SiteBlocked:
raise
logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
Expand Down Expand Up @@ -326,7 +331,7 @@ def collect_actress_alias(type=0, use_original=True):
logger.root.handlers[1].level = logging.DEBUG

# collect_actress_alias()
movie = MovieInfo('JUQ-471')
movie = MovieInfo('FC2-2735981')
try:
parse_clean_data(movie)
print(movie)
Expand Down

0 comments on commit 1ca6b8c

Please sign in to comment.