From 3f5b726a2601ef21f35cb166575a97e0cc8b2d1a Mon Sep 17 00:00:00 2001 From: Yuukiy <76897913+Yuukiy@users.noreply.github.com> Date: Sun, 14 Jan 2024 15:35:04 +0800 Subject: [PATCH] =?UTF-8?q?fanza:=20=E6=94=AF=E6=8C=81=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=20=E6=88=90=E4=BA=BA=E6=98=A0=E7=94=BB=E3=80=81=E5=90=8C?= =?UTF-8?q?=E4=BA=BA=E4=BD=9C=E5=93=81=E3=80=81=E5=8D=95=E5=93=81=E7=A7=9F?= =?UTF-8?q?=E8=B5=81=20=E7=9A=84=E6=95=B0=E6=8D=AE=20(Fix=20#192)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- unittest/data/145dmn000007 (fanza).json | 65 ++++++++++++ unittest/data/1stars931r (fanza).json | 54 ++++++++++ unittest/data/62knbm009 (fanza).json | 2 +- unittest/data/d_aisoft3356 (fanza).json | 47 +++++++++ unittest/data/parathd03639 (fanza).json | 62 ++++++++++++ web/fanza.py | 128 +++++++++++++++++------- 6 files changed, 322 insertions(+), 36 deletions(-) create mode 100644 unittest/data/145dmn000007 (fanza).json create mode 100644 unittest/data/1stars931r (fanza).json create mode 100644 unittest/data/d_aisoft3356 (fanza).json create mode 100644 unittest/data/parathd03639 (fanza).json diff --git a/unittest/data/145dmn000007 (fanza).json b/unittest/data/145dmn000007 (fanza).json new file mode 100644 index 000000000..a4e78e70a --- /dev/null +++ b/unittest/data/145dmn000007 (fanza).json @@ -0,0 +1,65 @@ +{ + "dvdid": null, + "cid": "145dmn000007", + "url": "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=145dmn000007/", + "plot": "どんな厳格な男でも、性欲に我を失ってしまう時がある。ましてやそれが若くセクシーな女性達に触れる機会の多い職業ならば…。テレビ番組のアシスタント・ディレクター、予約が殺到する超人気のカリスマ美容師、そして聖職とも言える女子校教師。責任感に抑圧される彼らのストレスが限界を越えた時、おぞましきセックス犯罪が誘発される…!!", + "cover": "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007pl.jpg", + "big_cover": null, + "genre": [ + "ドラマ", + "Vシネマ", + "縛り・緊縛", + "ハメ撮り" + ], + "genre_id": [ + "4114", + "4110", + "5021", + "6002" + ], + "genre_norm": null, + "score": "4.00", + "title": "実録性犯罪ファイル 職業別ストレス症候群", + "ori_title": null, + "magnet": null, + "serial": "実録 性犯罪ファイル", + "actress": [ + "河愛純", + "相沢リリ", + "野田めぐみ", + "入江浩治", + "麻央はじめ", + "神戸顕一", + "すわしんじ" + ], + "actress_pics": null, + "director": "笠原唯央", + "duration": "59", + "producer": "TMC", + "publisher": null, + "uncensored": false, + "publish_date": "2022-08-12", + "preview_pics": [ + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-1.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-2.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-3.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-4.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-5.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-6.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-7.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-8.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-9.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-10.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-11.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-12.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-13.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-14.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-15.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-16.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-17.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-18.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-19.jpg", + "https://pics.dmm.co.jp/digital/video/145dmn000007/145dmn000007-20.jpg" + ], + "preview_video": "" +} \ No newline at end of file diff --git a/unittest/data/1stars931r (fanza).json b/unittest/data/1stars931r (fanza).json new file mode 100644 index 000000000..dbb1a884e --- /dev/null +++ b/unittest/data/1stars931r (fanza).json @@ -0,0 +1,54 @@ +{ + "作为此类影片的代表": "https://www.dmm.co.jp/rental/ppr/-/detail/=/cid=1stars931r/", + "dvdid": null, + "cid": "1stars931r", + "url": "https://www.dmm.co.jp/rental/ppr/-/detail/=/cid=1stars931r/", + "plot": "", + "cover": "https://pics.dmm.co.jp/mono/movie/1stars931r/1stars931rpl.jpg", + "big_cover": null, + "genre": [ + "巨乳", + "ドキュメンタリー", + "単体作品", + "アイドル・芸能人", + "デビュー作品", + "サンプル動画", + "軟体" + ], + "genre_id": [ + "2001", + "4023", + "4025", + "4118", + "6006", + "6102", + "6935" + ], + "genre_norm": null, + "score": 45, + "title": "芸能界引退後、即AVデビュー 渚恋生", + "ori_title": null, + "magnet": null, + "serial": "AV DEBUT(STAR)", + "actress": [ + "渚恋生" + ], + "actress_pics": null, + "director": "星シュート", + "duration": "210", + "producer": "SODクリエイト", + "publisher": null, + "uncensored": false, + "publish_date": null, + "preview_pics": [ + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-1.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-2.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-3.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-4.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-5.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-6.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-7.jpg", + "https://pics.dmm.co.jp/digital/video/1stars00931/1stars00931-8.jpg" + ], + "preview_video": "https://cc3001.dmm.co.jp/litevideo/freepv/1/1st/1stars931/1stars931mhb.mp4" +} \ No newline at end of file diff --git a/unittest/data/62knbm009 (fanza).json b/unittest/data/62knbm009 (fanza).json index 32bd8948d..fd5e265e6 100644 --- a/unittest/data/62knbm009 (fanza).json +++ b/unittest/data/62knbm009 (fanza).json @@ -20,7 +20,7 @@ "6123" ], "genre_norm": null, - "score": "2.00", + "score": "4.00", "title": "同居する粘液 第1話日常の中の非日常", "ori_title": null, "magnet": null, diff --git a/unittest/data/d_aisoft3356 (fanza).json b/unittest/data/d_aisoft3356 (fanza).json new file mode 100644 index 000000000..673362704 --- /dev/null +++ b/unittest/data/d_aisoft3356 (fanza).json @@ -0,0 +1,47 @@ +{ + "dvdid": null, + "cid": "d_aisoft3356", + "url": "https://www.dmm.co.jp/mono/doujin/-/detail/=/cid=d_aisoft3356/", + "plot": "夏の田舎、蝉の音に混じり聞こえてくる元気な掛け声。\n暇を持て余し木陰で昼寝をしていたあなたは、姪のわかばとふたばにかくれんぼに誘われました。\nそして汗だくになって遊ぶ中、わかばと一緒に今は使われていない古民家へやって来ます。\n\nそこは隠れるには絶好の秘密の場所で……\n\nこれは背伸びしたい年頃のおませな双子姉妹と、\n少し歳の離れたおじさんとのちょっぴりエッチな夏の一齣です。\n\n・基本動画50本+差分+テキスト\n・動画解像度 960×600\n\n対応OS:WindowsVista / Windows7\n製品仕様:DVD-ROM1枚 DVD型トールケース", + "cover": "https://pics.dmm.co.jp/mono/doujin/d_aisoft3356/d_aisoft3356pl.jpg", + "big_cover": null, + "genre": [ + "3DCG", + "動画・アニメーション", + "音声付き", + "オリジナル", + "貧乳・微乳", + "ミニ系", + "サンプル動画" + ], + "genre_id": [ + "12", + "13", + "15", + "20", + "2005", + "2008", + "6102" + ], + "genre_norm": null, + "score": "10.00", + "title": "夏のひめごと。", + "ori_title": null, + "magnet": null, + "serial": null, + "actress": null, + "actress_pics": null, + "director": null, + "duration": null, + "producer": null, + "publisher": null, + "uncensored": false, + "publish_date": "2014-11-07", + "preview_pics": [ + "https://pics.dmm.co.jp/mono/doujin/d_aisoft3356/d_aisoft3356js-001.jpg", + "https://pics.dmm.co.jp/mono/doujin/d_aisoft3356/d_aisoft3356js-002.jpg", + "https://pics.dmm.co.jp/mono/doujin/d_aisoft3356/d_aisoft3356js-003.jpg", + "https://pics.dmm.co.jp/mono/doujin/d_aisoft3356/d_aisoft3356js-004.jpg" + ], + "preview_video": null +} \ No newline at end of file diff --git a/unittest/data/parathd03639 (fanza).json b/unittest/data/parathd03639 (fanza).json new file mode 100644 index 000000000..7595a09c1 --- /dev/null +++ b/unittest/data/parathd03639 (fanza).json @@ -0,0 +1,62 @@ +{ + "作为此类影片的代表": "https://www.dmm.co.jp/monthly/paradisetv/-/detail/=/cid=parathd03639/", + "dvdid": null, + "cid": "parathd03639", + "url": "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=parathd03639/", + "plot": "★推定Gカップの巨乳の奥さん、推定Hカップの長女、推定Hカップの次女が働いている家族経営のコンビニでバイトする俺が3人ともモノにする物語!3人の巨乳とSEXだ!◆推定Hカップの長女さくら(25)。胸の盛り上がる服に思わず手が出てしまった!バイト終わりのバックヤードで制服姿の長女とおっぱいを揺らしてSEX!◆推定Hカップの次女ねる(20)。バイト終わりに制服から私服へ着替えていた。我慢できず自分の住むアパートに誘って次女とヤる!Hカップ爆乳にパイズリしてもらい昇天寸前!◆推定Gカップの奥さん(45)。夫である店長が夜勤で不在。自宅を訪ねて奥さんともSEX!", + "cover": "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639pl.jpg", + "big_cover": null, + "genre": [ + "中出し", + "ドラマ", + "巨乳", + "職業色々", + "ハイビジョン", + "パラダイスTV" + ], + "genre_id": [ + "5001", + "4114", + "2001", + "1026", + "6533", + "6008" + ], + "genre_norm": null, + "score": "10.00", + "title": "【連続スケベ小説 総集編】コンビニで働く巨乳母娘3人と中●しSEXしちゃった俺", + "ori_title": null, + "magnet": null, + "serial": "連続スケベ小説", + "actress": [], + "actress_pics": null, + "director": null, + "duration": "115", + "producer": "パラダイステレビ", + "publisher": null, + "uncensored": false, + "publish_date": "2023-01-05", + "preview_pics": [ + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-1.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-2.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-3.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-4.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-5.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-6.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-7.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-8.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-9.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-10.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-11.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-12.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-13.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-14.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-15.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-16.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-17.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-18.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-19.jpg", + "https://pics.dmm.co.jp/digital/video/parathd03639/parathd03639-20.jpg" + ], + "preview_video": "https://cc3001.dmm.co.jp/litevideo/freepv/p/par/parathd03639/parathd03639_mhb_w.mp4" +} \ No newline at end of file diff --git a/web/fanza.py b/web/fanza.py index b9cab8e09..4eb788f5c 100644 --- a/web/fanza.py +++ b/web/fanza.py @@ -4,6 +4,7 @@ import sys import json import logging +from typing import Dict, List, Tuple sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) @@ -21,40 +22,88 @@ request.headers['Accept-Language'] = 'ja,en-US;q=0.9' -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - try_urls = { - 'videoa': f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/', # AV - 'anime': f'{base_url}/mono/anime/-/detail/=/cid={movie.cid}/', # 里番 - } - for type_, url in try_urls.items(): - r = request.get(url, delay_raise=True) - # 404错误表明没有这部影片的数据 - if r.status_code == 404: - continue - - r.raise_for_status() - html = resp2html(r) - break - else: - raise MovieNotFoundError(__name__, movie.cid) +_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} +_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} +def sort_search_result(result: List[Dict]): + """排序搜索结果""" + scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} + sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) + return sorted_result + + +def get_urls_of_cid(cid: str) -> Tuple[str, str]: + """搜索cid可能的影片URL""" + r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") + if r.status_code == 404: + raise MovieNotFoundError(__name__, cid) + r.raise_for_status() + html = resp2html(r) + result = html.xpath("//ul[@id='list']/li/div/p/a/@href") + parsed_result = {} + for url in result: + items = url.split('/') + type_, cid = None, None + for i, part in enumerate(items): + if part == '-': + product, type_ = items[i-2], items[i-1] + elif part.startswith('cid='): + cid = part[4:] + new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' + parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) + break + if cid not in parsed_result: + if len(result) > 0: + logger.debug(f"Unknown URL in search result: " + ', '.join(result)) + raise MovieNotFoundError(__name__, cid) + sorted_result = sort_search_result(parsed_result[cid]) + return sorted_result + + +def resp2html_wrapper(resp): + html = resp2html(resp) if 'not available in your region' in html.text_content(): raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') + return html + - movie.url = url - parse_func = globals()[f'parse_{type_}_page'] - parse_func(movie, html) +def parse_data(movie: MovieInfo): + """解析指定番号的影片数据""" + default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/' + r0 = request.get(default_url, delay_raise=True) + if r0.status_code == 404: + urls = get_urls_of_cid(movie.cid) + for d in urls: + func_name = f"parse_{d['type']}_page" + if func_name in globals(): + parse_func = globals()[func_name] + else: + logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") + continue + r = request.get(d['url']) + html = resp2html_wrapper(r) + try: + parse_func(movie, html) + movie.url = d['url'] + break + except: + if id(d) == id(urls[-1]): + raise + else: + html = resp2html_wrapper(r0) + parse_videoa_page(movie, html) + movie.url = default_url def parse_videoa_page(movie: MovieInfo, html): """解析AV影片的页面布局""" - title = html.xpath("//h1[@id='title']/text()")[0] + title = html.xpath("//div[@class='hreview']/h1/text()")[0] # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来 container = html.xpath("//table[@class='mg-b12']/tr/td")[0] cover = container.xpath("//div[@id='sample-video']/a/@href")[0] # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 - date_str = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip() - publish_date = date_str.replace('/', '-') + date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") + if date_tag: + movie.publish_date = date_tag[0].strip().replace('/', '-') duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() match = re.search(r'\d+', duration_str) if match: @@ -75,19 +124,23 @@ def parse_videoa_page(movie: MovieInfo, html): # if label_tag: # label = label_tag[0].strip() # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=')]") + genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") genre, genre_id = [], [] for tag in genre_tags: genre.append(tag.text.strip()) genre_id.append(tag.get('href').split('=')[-1].strip('/')) cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() - plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip() + plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() preview_pics = container.xpath("//a[@name='sample-image']/img/@src") - score_str = container.xpath("//p[@class='d-review__average']/strong/text()")[0].strip() - match = re.search(r'\d+', score_str) - if match: - score = float(match.group()) * 2 - movie.score = f'{score:.2f}' + score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") + if score_tag: + match = re.search(r'\d+', score_tag[0].strip()) + if match: + score = float(match.group()) * 2 + movie.score = f'{score:.2f}' + else: + score_img = container.xpath("//td[@class='dcd-review__anchor_content']/img/@src")[0] + movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 if cfg.Crawler.hardworking_mode: # 预览视频是动态加载的,不在静态网页中 @@ -109,7 +162,6 @@ def parse_videoa_page(movie: MovieInfo, html): movie.cid = cid movie.title = title movie.cover = cover - movie.publish_date = publish_date movie.actress = actress movie.genre = genre movie.genre_id = genre_id @@ -125,8 +177,9 @@ def parse_anime_page(movie: MovieInfo, html): cover = container.xpath("//a[@name='package-image']/@href")[0] date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() publish_date = date_str.replace('/', '-') - duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() - duration = duration_str.replace('分', '') + duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") + if duration_tag: + movie.duration = duration_tag[0].strip().replace('分', '') serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") if serial_tag: movie.serial = serial_tag[0].strip() @@ -148,7 +201,6 @@ def parse_anime_page(movie: MovieInfo, html): movie.title = title movie.cover = cover movie.publish_date = publish_date - movie.duration = duration movie.genre = genre movie.genre_id = genre_id movie.plot = plot @@ -157,12 +209,18 @@ def parse_anime_page(movie: MovieInfo, html): movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 +# parse_dvd_page = parse_videoa_page # 118wtktabf067 +parse_ppr_page = parse_videoa_page +parse_nikkatsu_page = parse_videoa_page +parse_doujin_page = parse_anime_page + + if __name__ == "__main__": import pretty_errors pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo(cid='62knbm009') + movie = MovieInfo(cid='1stars931r') try: parse_data(movie) print(movie)