diff --git a/.github/workflows/pyinstaller.yml b/.github/workflows/pyinstaller.yml index 6bdb1ce2d..4135ab675 100644 --- a/.github/workflows/pyinstaller.yml +++ b/.github/workflows/pyinstaller.yml @@ -48,16 +48,16 @@ jobs: - name: Build with PyInstaller for windows run: cmd.exe /c 'make.bat' - - name: Install pytest - run: | - python -m pip install pytest + # - name: Install pytest + # run: | + # python -m pip install pytest - - name: Switch code page - run: | - chcp 65001 + # - name: Switch code page + # run: | + # chcp 65001 - - name: Test JavSP.exe - run: pytest unittest/test_exe.py + # - name: Test JavSP.exe + # run: pytest unittest/test_exe.py - name: Set VERSION variable for windows run: | diff --git a/.github/workflows/test-web-funcs.yml b/.github/workflows/test-web-funcs.yml index 3768c1422..7e76febc6 100644 --- a/.github/workflows/test-web-funcs.yml +++ b/.github/workflows/test-web-funcs.yml @@ -40,9 +40,6 @@ jobs: - name: Switch code page run: | chcp 65001 - - name: Test proxyfree.py - run: | - pytest unittest/test_proxyfree.py - name: Test web crawlers run: | pytest unittest/test_crawlers.py diff --git a/JavSP.py b/JavSP.py index 727b787d3..cf59123f1 100644 --- a/JavSP.py +++ b/JavSP.py @@ -21,6 +21,7 @@ from core.print import TqdmOut +from core.baidu_aip import aip_crop_poster # 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作 @@ -440,21 +441,12 @@ def reviewMovieID(all_movies, root): def crop_poster_wrapper(fanart_file, poster_file, method='normal', hard_sub=False, uncensored=False): """包装各种海报裁剪方法,提供统一的调用""" if method == 'baidu': - from core.ai_crop.baidu_aip import aip_crop_poster try: aip_crop_poster(fanart_file, poster_file) except Exception as e: logger.debug('人脸识别失败,回退到常规裁剪方法') logger.debug(e, exc_info=True) crop_poster(fanart_file, poster_file) - elif method == 'retina': - from core.ai_crop.retina import ai_crop_poster - try: - ai_crop_poster(fanart_file, poster_file) - except Exception as e: - logger.debug('人脸识别失败,回退到常规裁剪方法') - logger.debug(e, exc_info=True) - crop_poster(fanart_file, poster_file) else: crop_poster(fanart_file, poster_file) if cfg.Picture.add_label_to_cover: diff --git a/core/ai_crop/retina.py b/core/ai_crop/retina.py deleted file mode 100644 index 44257d0ac..000000000 --- a/core/ai_crop/retina.py +++ /dev/null @@ -1,25 +0,0 @@ -from retinaface import RetinaFace - -from PIL import Image, ImageOps -def ai_crop_poster(fanart, poster='', hw_ratio=1.42): - im = ImageOps.exif_transpose(Image.open(fanart)) - fanart_w, fanart_h = im.size - poster_h = fanart_h - poster_w = fanart_h / hw_ratio - - resp = RetinaFace.detect_faces(fanart) - - if not 'face_1' in resp: - raise Exception("Retina can't detect face") - - [x1, y1, x2, y2] = resp['face_1']['facial_area'] - center_x = (x1 + x2) / 2 - center_y = (y1 + y2) / 2 - poster_left = max(center_x - poster_w / 2, 0) - poster_left = min(poster_left, fanart_w - poster_w) - poster_left = int(poster_left) - im_poster = im.crop((poster_left, 0, int(poster_left + poster_w), poster_h)) - if im_poster.mode != 'RGB': - im_poster = im_poster.convert('RGB') - im_poster.save(poster, quality=95) - diff --git a/core/ai_crop/baidu_aip.py b/core/baidu_aip.py similarity index 100% rename from core/ai_crop/baidu_aip.py rename to core/baidu_aip.py diff --git a/core/config.ini b/core/config.ini index d5b8565ab..c2c0f4e6d 100644 --- a/core/config.ini +++ b/core/config.ini @@ -30,10 +30,10 @@ retry = 3 timeout = 10 # 要使用的爬虫列表(汇总数据时从前到后进行) -# airav avsox avwiki fanza fc2 fc2fan javbus javdb javlib javmenu jav321 msin mgstage prestige +# airav avsox avwiki fanza fc2 fc2fan javbus javdb javlib javmenu jav321 mgstage prestige [CrawlerSelect] normal = airav,avsox,javbus,javdb,javlib,jav321,mgstage,prestige -fc2 = fc2,msin,avsox,javdb,javmenu +fc2 = fc2,avsox,javdb,javmenu cid = fanza getchu = dl_getchu gyutto = gyutto @@ -104,7 +104,6 @@ use_ai_crop = no # 要使用图像识别来裁剪的番号系列($label), \d表示纯数字番号(FC2和识别到的无码影片会自动使用图像识别裁剪) use_ai_crop_labels = \d,ARA,SIRO,GANA,MIUM # 要使用的图像识别引擎,详细配置见文档 https://github.com/Yuukiy/JavSP/wiki/AI-%7C-%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB -# Can be either 'baidu' or 'retina' ai_engine = # 百度人体分析应用的AppID(仅在图像识别引擎为baidu时需要) aip_appid = diff --git a/core/config.py b/core/config.py index d278d0354..c6fe39eef 100644 --- a/core/config.py +++ b/core/config.py @@ -343,8 +343,6 @@ def validate_ai_config(cfg: Config): empty_keys = [i for i in required_keys if not piccfg[i]] if empty_keys: logger.error('使用百度人体分析时,相关设置不能为空: ' + ', '.join(empty_keys)) - elif piccfg.ai_engine.lower() == 'retina': - piccfg.ai_engine = 'retina' else: logger.error('不支持的图像识别引擎: ' + piccfg.ai_engine) diff --git a/requirements-linux.txt b/requirements-linux.txt index d35208715..475119397 100644 --- a/requirements-linux.txt +++ b/requirements-linux.txt @@ -1,6 +1,6 @@ altgraph==0.17 baidu-aip==2.2.18.0 -certifi==2023.7.22 +certifi==2024.7.4 chardet==4.0.0 cloudscraper==1.2.71 colorama==0.4.4 @@ -14,7 +14,7 @@ pretty-errors==1.2.19 pycryptodome==3.19.1 PySocks==1.7.1 requests==2.31.0 -tqdm==4.59.0 +tqdm==4.66.3 urllib3==1.25.11 cryptography==42.0.4 retina-face==0.0.14 diff --git a/requirements.txt b/requirements.txt index 307e7dff6..903b608cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ altgraph==0.17 baidu-aip==2.2.18.0 -certifi==2023.7.22 +certifi==2024.7.4 chardet==4.0.0 cloudscraper==1.2.71 colorama==0.4.4 @@ -14,9 +14,8 @@ pretty-errors==1.2.19 pycryptodome==3.19.1 PySocks==1.7.1 requests==2.31.0 -tqdm==4.59.0 +tqdm==4.66.3 urllib3==1.25.11 pywin32==303 pywin32-ctypes==0.2.0 -retina-face==0.0.14 -keras==2.15.0 +cryptography==42.0.4 diff --git a/unittest/data/62knbm009 (fanza).json b/unittest/data/62knbm009 (fanza).json index fd5e265e6..51e6911d4 100644 --- a/unittest/data/62knbm009 (fanza).json +++ b/unittest/data/62knbm009 (fanza).json @@ -20,7 +20,7 @@ "6123" ], "genre_norm": null, - "score": "4.00", + "score": "5.00", "title": "同居する粘液 第1話日常の中の非日常", "ori_title": null, "magnet": null, diff --git a/unittest/data/FC2-1879420 (msin).json b/unittest/data/FC2-1879420 (msin).json deleted file mode 100644 index 3dd015d68..000000000 --- a/unittest/data/FC2-1879420 (msin).json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "dvdid": "FC2-1879420", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=1675568", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-1879420.jpg", - "big_cover": null, - "genre": [ - "素人", - "個人撮影", - "ロリ", - "美少女", - "デルデルシリーズ", - "リーガル" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【極上】ロリ美少女リーガルちゃんとガチセックスしてみた", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "リーガルちゃん(合法ちゃん)" - ], - "actress_pics": { - "リーガルちゃん(合法ちゃん)": "https://img.msin.info/images/actress/30787.jpg" - }, - "director": null, - "duration": "51", - "producer": "deruderuking", - "publisher": null, - "uncensored": null, - "publish_date": "2021-06-20", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-238629 (msin).json b/unittest/data/FC2-238629 (msin).json deleted file mode 100644 index 8b5b09f75..000000000 --- a/unittest/data/FC2-238629 (msin).json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "dvdid": "FC2-238629", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=2124358", - "plot": null, - "cover": null, - "big_cover": null, - "genre": [], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "佐々波綾无码流出", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [], - "actress_pics": {}, - "director": null, - "duration": null, - "producer": null, - "publisher": null, - "uncensored": null, - "publish_date": null, - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-626157 (msin).json b/unittest/data/FC2-626157 (msin).json deleted file mode 100644 index bf2a8aa27..000000000 --- a/unittest/data/FC2-626157 (msin).json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "dvdid": "FC2-626157", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=2233456", - "plot": null, - "cover": null, - "big_cover": null, - "genre": [], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "ちひろ24歳♪超S級昼顔妻♪【2時間36分】《素人ハメ撮り》《個人撮影》《156》《ちゅぱ王》", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "ちひろ24歳" - ], - "actress_pics": { - "ちひろ24歳": "https://db.msin.jp/.svg/Noimage.png" - }, - "director": null, - "duration": null, - "producer": "《ちゅぱ王》素人ハメ撮り", - "publisher": null, - "uncensored": null, - "publish_date": "2017-08-03", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-718323 (msin).json b/unittest/data/FC2-718323 (msin).json deleted file mode 100644 index be78dc9fd..000000000 --- a/unittest/data/FC2-718323 (msin).json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "dvdid": "FC2-718323", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=6593", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-718323.jpg", - "big_cover": null, - "genre": [ - "人妻", - "ハメ撮り", - "中出し", - "個人撮影", - "オリジナル", - "無修正", - "寝取られ", - "美人", - "可愛い", - "生ハメ" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【個人撮影】破壊力抜群のデカケツ美人妻れいなさんと再戦そして大量中出し【背徳の制服編】", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "れいな29歳(桜瀬奈)" - ], - "actress_pics": { - "れいな29歳(桜瀬奈)": "https://img.msin.info/images/actress/230.jpg" - }, - "director": null, - "duration": "78", - "producer": "EX-STANDARD", - "publisher": null, - "uncensored": null, - "publish_date": "2017-11-30", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-985469 (msin).json b/unittest/data/FC2-985469 (msin).json deleted file mode 100644 index 806108366..000000000 --- a/unittest/data/FC2-985469 (msin).json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "dvdid": "FC2-985469", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=4208", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-985469.jpg", - "big_cover": null, - "genre": [ - "ハメ撮り", - "素人", - "中出し", - "個人撮影", - "オリジナル", - "無修正", - "コスプレ", - "フェチ", - "可愛い", - "JD" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【個人撮影】JD2回生ちゃんに中出し!エロマンガ先生のパジャマコスで中出しえっちさせててもらいました♪ LAXD-PPV-985469", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "JD2回ちゃん(やみこ)" - ], - "actress_pics": { - "JD2回ちゃん(やみこ)": "https://img.msin.info/images/actress/29037.jpg" - }, - "director": null, - "duration": null, - "producer": "COS☆ぱこ", - "publisher": null, - "uncensored": null, - "publish_date": "2018-11-23", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/KQBD-089 (msin).json b/unittest/data/KQBD-089 (msin).json deleted file mode 100644 index 178322820..000000000 --- a/unittest/data/KQBD-089 (msin).json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "dvdid": "KQBD-089", - "cid": "244kqbd00089", - "url": "https://db.msin.jp/jp.page/movie?id=1206328", - "plot": null, - "cover": "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089pl.jpg", - "big_cover": null, - "genre": [ - "4K", - "ハイビジョン", - "単体作品", - "フェラ", - "電マ", - "女子校生", - "拘束", - "セーラー服" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【4Kリマスター版】制服美少女と性交 心花ゆら", - "ori_title": null, - "magnet": null, - "serial": "制服美少女と性交", - "actress": [ - "心花ゆら" - ], - "actress_pics": { - "心花ゆら": "https://img.msin.info/jp.images/actress/1036363.jpg" - }, - "director": null, - "duration": "124", - "producer": "ドリームチケット", - "publisher": null, - "uncensored": false, - "publish_date": "2023-11-30", - "preview_pics": [ - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-1.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-2.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-3.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-4.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-5.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-6.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-7.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-8.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-9.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-10.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-11.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-12.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-13.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-14.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-15.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-16.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-17.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-18.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-19.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-20.jpg" - ], - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/hjmo00214 (msin).json b/unittest/data/hjmo00214 (msin).json deleted file mode 100644 index 88a26b859..000000000 --- a/unittest/data/hjmo00214 (msin).json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "dvdid": "HJMO-214", - "cid": "hjmo00214", - "url": "https://db.msin.jp/jp.page/movie?id=226579", - "plot": null, - "cover": "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214pl.jpg", - "big_cover": null, - "genre": [ - "羞恥", - "独占配信", - "デジモ", - "企画", - "素人", - "寝取り", - "寝取られ", - "NTR", - "ハイビジョン" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "彼女なら!彼氏のち○ぽ当ててみろ!! 11", - "ori_title": null, - "magnet": null, - "serial": "彼女なら!彼氏のち○ぽ当ててみろ!!", - "actress": [ - "りか(菊川里菜)", - "めい", - "なみ", - "まい(倉木みお)", - "ゆり" - ], - "actress_pics": { - "りか(菊川里菜)": "https://img.msin.info/jp.images/actress/1011262.jpg", - "めい": "https://db.msin.jp/.svg/Noimage.png", - "なみ": "https://db.msin.jp/.svg/Noimage.png", - "まい(倉木みお)": "https://img.msin.info/jp.images/actress/1005866.jpg", - "ゆり": "https://db.msin.jp/.svg/Noimage.png" - }, - "director": "はじめ", - "duration": "235", - "producer": "はじめ企画", - "publisher": null, - "uncensored": false, - "publish_date": "2011-12-08", - "preview_pics": [ - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-1.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-2.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-3.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-4.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-5.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-6.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-7.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-8.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-9.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-10.jpg" - ], - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/parathd03639 (fanza).json b/unittest/data/parathd03639 (fanza).json index 7595a09c1..5aa5f8370 100644 --- a/unittest/data/parathd03639 (fanza).json +++ b/unittest/data/parathd03639 (fanza).json @@ -11,16 +11,14 @@ "ドラマ", "巨乳", "職業色々", - "ハイビジョン", - "パラダイスTV" + "ハイビジョン" ], "genre_id": [ "5001", "4114", "2001", "1026", - "6533", - "6008" + "6533" ], "genre_norm": null, "score": "10.00", diff --git a/web/fanza.py b/web/fanza.py index 0f28d4240..b1dc5e5d2 100644 --- a/web/fanza.py +++ b/web/fanza.py @@ -63,6 +63,8 @@ def resp2html_wrapper(resp): html = resp2html(resp) if 'not available in your region' in html.text_content(): raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') + elif '無料会員登録はこちら' in html.text_content(): + raise CredentialError('此数据需要注册FANZA才能访问,或者尝试更换为日本IP') return html @@ -176,7 +178,7 @@ def parse_anime_page(movie: MovieInfo, html): """解析动画影片的页面布局""" title = html.xpath("//h1[@id='title']/text()")[0] container = html.xpath("//table[@class='mg-b12']/tr/td")[0] - cover = container.xpath("//a[@name='package-image']/@href")[0] + cover = container.xpath("//img[@name='package-image']/@src")[0] date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() publish_date = date_str.replace('/', '-') duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") @@ -195,7 +197,7 @@ def parse_anime_page(movie: MovieInfo, html): genre_id.append(tag.get('href').split('=')[-1].strip('/')) cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() - preview_pics = container.xpath("//a[@name='sample-image']/img/@src") + preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 @@ -222,7 +224,7 @@ def parse_anime_page(movie: MovieInfo, html): pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo(cid='145tb017') + movie = MovieInfo(cid='d_aisoft3356') try: parse_data(movie) print(movie) diff --git a/web/javmenu.py b/web/javmenu.py index fe0202ca6..2fd5d25d9 100644 --- a/web/javmenu.py +++ b/web/javmenu.py @@ -33,9 +33,9 @@ def parse_data(movie: MovieInfo): # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 title = title.replace(' | JAV目錄大全 | 每日更新', '') title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') - cover_tag = container.xpath("//div[@class='single-video']")[0] - video_tag = cover_tag.find('video') - if video_tag is not None: + cover_tag = container.xpath("//div[@class='single-video']") + if len(cover_tag) > 0: + video_tag = cover_tag[0].find('video') # URL首尾竟然也有空格…… movie.cover = video_tag.get('data-poster').strip() # 预览影片改为blob了,无法获取 @@ -83,7 +83,7 @@ def parse_data(movie: MovieInfo): pretty_errors.configure(display_link=True) logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('IPX-177') + movie = MovieInfo('FC2-718323') try: parse_data(movie) print(movie) diff --git a/web/msin.py b/web/msin.py deleted file mode 100644 index 6a2eb113c..000000000 --- a/web/msin.py +++ /dev/null @@ -1,150 +0,0 @@ -"""从db.msin.jp抓取数据""" -import os -import sys -import logging -import requests - - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from web.base import * -from web.exceptions import * -from core.config import cfg -from core.lib import strftime_to_minutes -from core.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://db.msin.jp' -cookies = {'age': 'off'} - - -def normal_parser(movie: MovieInfo, html): - container = html.xpath("//div[@id='center_main']")[0] - info = container.xpath("div/div/div/div[@class='movie_info_ditail']")[0] - avid = info.xpath("div[@class='mv_pn']/text()")[0] - cid = info.xpath("div[@class='mv_fileName']/text()")[0] - title = info.xpath("div[contains(@class, 'mv_title')]/text()")[0] - cover_tag = container.xpath("//div[@class='movie_top']/img/@src") - if cover_tag: - movie.cover = cover_tag[0] - genre = info.xpath("div[@class='mv_genre']/label/text()") - actress, actress_pics = [], {} - actress_tags = info.xpath("div[contains(text(),'出演者:')]/following-sibling::div[1]/div[@class='performer_box']") - for tag in actress_tags: - name = tag.xpath("div[@class='performer_text']/a/text()")[0] - name = name.replace('(FC2動画)', '') - pic_url = tag.xpath("div[@class='performer_image']/a/img/@src")[0] - actress.append(name) - actress_pics[name] = pic_url - duration_tag = info.xpath("div[@class='mv_duration']/text()") - if duration_tag: - movie.duration = str(strftime_to_minutes(duration_tag[0])) - publish_date_tag = info.xpath("a[@class='mv_createDate']/text()") - if publish_date_tag: - movie.publish_date = publish_date_tag[0] - director_tag = info.xpath("//a[contains(@href, '/jp.page/director?')]/text()") - if director_tag: - movie.director = director_tag[0] - serial_tag = info.xpath("a[@class='mv_series']/text()") - if serial_tag: - movie.serial = serial_tag[0] - producer_tag = info.xpath("//a[@class='mv_mfr']/text()") - if producer_tag: - movie.producer = producer_tag[0] - preview_pics = info.xpath("div[contains(@class, 'mv_com1')]/div/text()") - if preview_pics: - movie.preview_pics = [i for i in preview_pics if i.startswith('https://')] - - if cfg.Crawler.hardworking_mode and False: # iframe嵌套太多了,目前用不到预览视频就先不解析了 - play_tag = container.xpath("//a[@class='playbutton popup']/@href") - if play_tag: - play_url = play_tag[0] - r2 = request_get(play_url) - TARGET_TXT = 'iframe.contentDocument.location.replace("' - begin = r2.text.find(TARGET_TXT) + len(TARGET_TXT) - end = r2.text.find('"', begin) - iframe_url = r2.text[begin:end] - iframe = get_html(iframe_url) - - movie.cid = cid - movie.dvdid = avid - movie.title = title.replace(avid, '').strip() - movie.genre = [i.strip() for i in genre] - movie.actress = actress - movie.actress_pics = actress_pics - movie.uncensored = False - - -def fc2_parser(movie: MovieInfo, html): - container = html.xpath("//div[@id='top_content']")[0] - info = container.xpath("div/div/div[@id='movie_info_ditail']")[0] - avid = info.xpath("div[@class='mv_fileName']/text()")[0].upper() - title = info.xpath("div[contains(@class, 'mv_title')]/text()")[0] - # 部分影片有预览图,但是是跳转到FC2进行预览的,且预览地址是通过js脚本解析的(带有key) - cover_tag = container.xpath("//div[@class='movie_top']/img/@src") - if cover_tag: - movie.cover = cover_tag[0] - genre = info.xpath("div[@class='mv_tag']/label/text()") - actress, actress_pics = [], {} - actress_tags = info.xpath("div[contains(text(),'出演者:')]/following-sibling::div[1]/div[@class='performer_box']") - for tag in actress_tags: - name = tag.xpath("div[@class='performer_text']/a/text()")[0] - name = name.replace('(FC2動画)', '') - pic_url = tag.xpath("div[@class='performer_image']/a/img/@src")[0] - actress.append(name) - actress_pics[name] = pic_url - duration_tag = info.xpath("div[@class='mv_duration']/text()") - if duration_tag: - movie.duration = str(strftime_to_minutes(duration_tag[0])) - - publish_date = info.xpath("a[@class='mv_createDate']/text()") - if publish_date: - movie.publish_date = publish_date[0] - producer = info.xpath("a[@class='mv_writer']/text()") - if producer: - movie.producer = producer[0] - - movie.title = title.replace(avid, '').strip() - movie.genre = [i.strip() for i in genre] - movie.actress = actress - movie.actress_pics = actress_pics - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - full_id = movie.dvdid - if full_id.startswith('FC2-'): - full_id = full_id.lower().replace('fc2-', 'fc2-ppv-') - url = f"{base_url}/search/movie?str={full_id}" # 海外:品番検索 - else: - url = f"{base_url}/branch/search?sort=jp.movie&str={full_id}" # 国内:品番検索 - r = request_get(url, cookies=cookies, delay_raise=True) - # 404说明曾经有这部影片但是下架了,如果是200但网页内容是No Results说明是完全找不到影片 - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r) - error_string = html.xpath("//div[@class='error_string']/text()") - if error_string: - if error_string[0].strip() == 'No Results': - raise MovieNotFoundError(__name__, movie.dvdid) - - movie.url = r.url - if full_id.startswith('fc2-'): - fc2_parser(movie, html) - else: - normal_parser(movie, html) - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - - movie = MovieInfo('hjmo00214') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/web/translate.py b/web/translate.py index 919d1fb81..11281c813 100644 --- a/web/translate.py +++ b/web/translate.py @@ -157,7 +157,7 @@ def google_trans(texts, to='zh_CN'): # API: https://www.jianshu.com/p/ce35d89c25c3 # client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017 global _google_trans_wait - url = f"http://translate.google.com/translate_a/single?client=at&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q=" + texts + url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}" r = requests.get(url, proxies=cfg.Network.proxy) while r.status_code == 429: logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") @@ -169,6 +169,7 @@ def google_trans(texts, to='zh_CN'): result = r.json() else: result = {'error_code': r.status_code, 'error_msg': r.reason} + time.sleep(4) # Google翻译的API有QPS限制,因此需要等待一段时间 return result