From c2c72b6c890db11d4f2331b1bc4f1598e2476df4 Mon Sep 17 00:00:00 2001 From: Yuukiy <76897913+Yuukiy@users.noreply.github.com> Date: Sat, 17 Aug 2024 16:56:45 +0800 Subject: [PATCH] =?UTF-8?q?msin:=20=E7=AB=99=E7=82=B9=E5=85=B3=E9=97=AD?= =?UTF-8?q?=EF=BC=8C=E7=A7=BB=E9=99=A4=E7=9B=B8=E5=BA=94=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E5=8F=8A=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/config.ini | 4 +- unittest/data/FC2-1879420 (msin).json | 37 ------- unittest/data/FC2-238629 (msin).json | 26 ----- unittest/data/FC2-626157 (msin).json | 30 ------ unittest/data/FC2-718323 (msin).json | 41 ------- unittest/data/FC2-985469 (msin).json | 41 ------- unittest/data/KQBD-089 (msin).json | 60 ----------- unittest/data/hjmo00214 (msin).json | 59 ---------- web/msin.py | 150 -------------------------- 9 files changed, 2 insertions(+), 446 deletions(-) delete mode 100644 unittest/data/FC2-1879420 (msin).json delete mode 100644 unittest/data/FC2-238629 (msin).json delete mode 100644 unittest/data/FC2-626157 (msin).json delete mode 100644 unittest/data/FC2-718323 (msin).json delete mode 100644 unittest/data/FC2-985469 (msin).json delete mode 100644 unittest/data/KQBD-089 (msin).json delete mode 100644 unittest/data/hjmo00214 (msin).json delete mode 100644 web/msin.py diff --git a/core/config.ini b/core/config.ini index eb1605bd4..c2c0f4e6d 100644 --- a/core/config.ini +++ b/core/config.ini @@ -30,10 +30,10 @@ retry = 3 timeout = 10 # 要使用的爬虫列表(汇总数据时从前到后进行) -# airav avsox avwiki fanza fc2 fc2fan javbus javdb javlib javmenu jav321 msin mgstage prestige +# airav avsox avwiki fanza fc2 fc2fan javbus javdb javlib javmenu jav321 mgstage prestige [CrawlerSelect] normal = airav,avsox,javbus,javdb,javlib,jav321,mgstage,prestige -fc2 = fc2,msin,avsox,javdb,javmenu +fc2 = fc2,avsox,javdb,javmenu cid = fanza getchu = dl_getchu gyutto = gyutto diff --git a/unittest/data/FC2-1879420 (msin).json b/unittest/data/FC2-1879420 (msin).json deleted file mode 100644 index 3dd015d68..000000000 --- a/unittest/data/FC2-1879420 (msin).json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "dvdid": "FC2-1879420", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=1675568", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-1879420.jpg", - "big_cover": null, - "genre": [ - "素人", - "個人撮影", - "ロリ", - "美少女", - "デルデルシリーズ", - "リーガル" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【極上】ロリ美少女リーガルちゃんとガチセックスしてみた", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "リーガルちゃん(合法ちゃん)" - ], - "actress_pics": { - "リーガルちゃん(合法ちゃん)": "https://img.msin.info/images/actress/30787.jpg" - }, - "director": null, - "duration": "51", - "producer": "deruderuking", - "publisher": null, - "uncensored": null, - "publish_date": "2021-06-20", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-238629 (msin).json b/unittest/data/FC2-238629 (msin).json deleted file mode 100644 index 8b5b09f75..000000000 --- a/unittest/data/FC2-238629 (msin).json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "dvdid": "FC2-238629", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=2124358", - "plot": null, - "cover": null, - "big_cover": null, - "genre": [], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "佐々波綾无码流出", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [], - "actress_pics": {}, - "director": null, - "duration": null, - "producer": null, - "publisher": null, - "uncensored": null, - "publish_date": null, - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-626157 (msin).json b/unittest/data/FC2-626157 (msin).json deleted file mode 100644 index bf2a8aa27..000000000 --- a/unittest/data/FC2-626157 (msin).json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "dvdid": "FC2-626157", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=2233456", - "plot": null, - "cover": null, - "big_cover": null, - "genre": [], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "ちひろ24歳♪超S級昼顔妻♪【2時間36分】《素人ハメ撮り》《個人撮影》《156》《ちゅぱ王》", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "ちひろ24歳" - ], - "actress_pics": { - "ちひろ24歳": "https://db.msin.jp/.svg/Noimage.png" - }, - "director": null, - "duration": null, - "producer": "《ちゅぱ王》素人ハメ撮り", - "publisher": null, - "uncensored": null, - "publish_date": "2017-08-03", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-718323 (msin).json b/unittest/data/FC2-718323 (msin).json deleted file mode 100644 index be78dc9fd..000000000 --- a/unittest/data/FC2-718323 (msin).json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "dvdid": "FC2-718323", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=6593", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-718323.jpg", - "big_cover": null, - "genre": [ - "人妻", - "ハメ撮り", - "中出し", - "個人撮影", - "オリジナル", - "無修正", - "寝取られ", - "美人", - "可愛い", - "生ハメ" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【個人撮影】破壊力抜群のデカケツ美人妻れいなさんと再戦そして大量中出し【背徳の制服編】", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "れいな29歳(桜瀬奈)" - ], - "actress_pics": { - "れいな29歳(桜瀬奈)": "https://img.msin.info/images/actress/230.jpg" - }, - "director": null, - "duration": "78", - "producer": "EX-STANDARD", - "publisher": null, - "uncensored": null, - "publish_date": "2017-11-30", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/FC2-985469 (msin).json b/unittest/data/FC2-985469 (msin).json deleted file mode 100644 index 806108366..000000000 --- a/unittest/data/FC2-985469 (msin).json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "dvdid": "FC2-985469", - "cid": null, - "url": "https://db.msin.jp/page/movie?id=4208", - "plot": null, - "cover": "https://img.msin.info/images/cover/fc2/fc2-ppv-985469.jpg", - "big_cover": null, - "genre": [ - "ハメ撮り", - "素人", - "中出し", - "個人撮影", - "オリジナル", - "無修正", - "コスプレ", - "フェチ", - "可愛い", - "JD" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【個人撮影】JD2回生ちゃんに中出し!エロマンガ先生のパジャマコスで中出しえっちさせててもらいました♪ LAXD-PPV-985469", - "ori_title": null, - "magnet": null, - "serial": null, - "actress": [ - "JD2回ちゃん(やみこ)" - ], - "actress_pics": { - "JD2回ちゃん(やみこ)": "https://img.msin.info/images/actress/29037.jpg" - }, - "director": null, - "duration": null, - "producer": "COS☆ぱこ", - "publisher": null, - "uncensored": null, - "publish_date": "2018-11-23", - "preview_pics": null, - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/KQBD-089 (msin).json b/unittest/data/KQBD-089 (msin).json deleted file mode 100644 index 178322820..000000000 --- a/unittest/data/KQBD-089 (msin).json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "dvdid": "KQBD-089", - "cid": "244kqbd00089", - "url": "https://db.msin.jp/jp.page/movie?id=1206328", - "plot": null, - "cover": "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089pl.jpg", - "big_cover": null, - "genre": [ - "4K", - "ハイビジョン", - "単体作品", - "フェラ", - "電マ", - "女子校生", - "拘束", - "セーラー服" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "【4Kリマスター版】制服美少女と性交 心花ゆら", - "ori_title": null, - "magnet": null, - "serial": "制服美少女と性交", - "actress": [ - "心花ゆら" - ], - "actress_pics": { - "心花ゆら": "https://img.msin.info/jp.images/actress/1036363.jpg" - }, - "director": null, - "duration": "124", - "producer": "ドリームチケット", - "publisher": null, - "uncensored": false, - "publish_date": "2023-11-30", - "preview_pics": [ - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-1.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-2.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-3.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-4.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-5.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-6.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-7.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-8.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-9.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-10.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-11.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-12.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-13.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-14.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-15.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-16.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-17.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-18.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-19.jpg", - "https://pics.dmm.co.jp/digital/video/244kqbd00089/244kqbd00089jp-20.jpg" - ], - "preview_video": null -} \ No newline at end of file diff --git a/unittest/data/hjmo00214 (msin).json b/unittest/data/hjmo00214 (msin).json deleted file mode 100644 index 88a26b859..000000000 --- a/unittest/data/hjmo00214 (msin).json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "dvdid": "HJMO-214", - "cid": "hjmo00214", - "url": "https://db.msin.jp/jp.page/movie?id=226579", - "plot": null, - "cover": "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214pl.jpg", - "big_cover": null, - "genre": [ - "羞恥", - "独占配信", - "デジモ", - "企画", - "素人", - "寝取り", - "寝取られ", - "NTR", - "ハイビジョン" - ], - "genre_id": null, - "genre_norm": null, - "score": null, - "title": "彼女なら!彼氏のち○ぽ当ててみろ!! 11", - "ori_title": null, - "magnet": null, - "serial": "彼女なら!彼氏のち○ぽ当ててみろ!!", - "actress": [ - "りか(菊川里菜)", - "めい", - "なみ", - "まい(倉木みお)", - "ゆり" - ], - "actress_pics": { - "りか(菊川里菜)": "https://img.msin.info/jp.images/actress/1011262.jpg", - "めい": "https://db.msin.jp/.svg/Noimage.png", - "なみ": "https://db.msin.jp/.svg/Noimage.png", - "まい(倉木みお)": "https://img.msin.info/jp.images/actress/1005866.jpg", - "ゆり": "https://db.msin.jp/.svg/Noimage.png" - }, - "director": "はじめ", - "duration": "235", - "producer": "はじめ企画", - "publisher": null, - "uncensored": false, - "publish_date": "2011-12-08", - "preview_pics": [ - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-1.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-2.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-3.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-4.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-5.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-6.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-7.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-8.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-9.jpg", - "https://pics.dmm.co.jp/digital/video/hjmo00214/hjmo00214jp-10.jpg" - ], - "preview_video": null -} \ No newline at end of file diff --git a/web/msin.py b/web/msin.py deleted file mode 100644 index 6a2eb113c..000000000 --- a/web/msin.py +++ /dev/null @@ -1,150 +0,0 @@ -"""从db.msin.jp抓取数据""" -import os -import sys -import logging -import requests - - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from web.base import * -from web.exceptions import * -from core.config import cfg -from core.lib import strftime_to_minutes -from core.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://db.msin.jp' -cookies = {'age': 'off'} - - -def normal_parser(movie: MovieInfo, html): - container = html.xpath("//div[@id='center_main']")[0] - info = container.xpath("div/div/div/div[@class='movie_info_ditail']")[0] - avid = info.xpath("div[@class='mv_pn']/text()")[0] - cid = info.xpath("div[@class='mv_fileName']/text()")[0] - title = info.xpath("div[contains(@class, 'mv_title')]/text()")[0] - cover_tag = container.xpath("//div[@class='movie_top']/img/@src") - if cover_tag: - movie.cover = cover_tag[0] - genre = info.xpath("div[@class='mv_genre']/label/text()") - actress, actress_pics = [], {} - actress_tags = info.xpath("div[contains(text(),'出演者:')]/following-sibling::div[1]/div[@class='performer_box']") - for tag in actress_tags: - name = tag.xpath("div[@class='performer_text']/a/text()")[0] - name = name.replace('(FC2動画)', '') - pic_url = tag.xpath("div[@class='performer_image']/a/img/@src")[0] - actress.append(name) - actress_pics[name] = pic_url - duration_tag = info.xpath("div[@class='mv_duration']/text()") - if duration_tag: - movie.duration = str(strftime_to_minutes(duration_tag[0])) - publish_date_tag = info.xpath("a[@class='mv_createDate']/text()") - if publish_date_tag: - movie.publish_date = publish_date_tag[0] - director_tag = info.xpath("//a[contains(@href, '/jp.page/director?')]/text()") - if director_tag: - movie.director = director_tag[0] - serial_tag = info.xpath("a[@class='mv_series']/text()") - if serial_tag: - movie.serial = serial_tag[0] - producer_tag = info.xpath("//a[@class='mv_mfr']/text()") - if producer_tag: - movie.producer = producer_tag[0] - preview_pics = info.xpath("div[contains(@class, 'mv_com1')]/div/text()") - if preview_pics: - movie.preview_pics = [i for i in preview_pics if i.startswith('https://')] - - if cfg.Crawler.hardworking_mode and False: # iframe嵌套太多了,目前用不到预览视频就先不解析了 - play_tag = container.xpath("//a[@class='playbutton popup']/@href") - if play_tag: - play_url = play_tag[0] - r2 = request_get(play_url) - TARGET_TXT = 'iframe.contentDocument.location.replace("' - begin = r2.text.find(TARGET_TXT) + len(TARGET_TXT) - end = r2.text.find('"', begin) - iframe_url = r2.text[begin:end] - iframe = get_html(iframe_url) - - movie.cid = cid - movie.dvdid = avid - movie.title = title.replace(avid, '').strip() - movie.genre = [i.strip() for i in genre] - movie.actress = actress - movie.actress_pics = actress_pics - movie.uncensored = False - - -def fc2_parser(movie: MovieInfo, html): - container = html.xpath("//div[@id='top_content']")[0] - info = container.xpath("div/div/div[@id='movie_info_ditail']")[0] - avid = info.xpath("div[@class='mv_fileName']/text()")[0].upper() - title = info.xpath("div[contains(@class, 'mv_title')]/text()")[0] - # 部分影片有预览图,但是是跳转到FC2进行预览的,且预览地址是通过js脚本解析的(带有key) - cover_tag = container.xpath("//div[@class='movie_top']/img/@src") - if cover_tag: - movie.cover = cover_tag[0] - genre = info.xpath("div[@class='mv_tag']/label/text()") - actress, actress_pics = [], {} - actress_tags = info.xpath("div[contains(text(),'出演者:')]/following-sibling::div[1]/div[@class='performer_box']") - for tag in actress_tags: - name = tag.xpath("div[@class='performer_text']/a/text()")[0] - name = name.replace('(FC2動画)', '') - pic_url = tag.xpath("div[@class='performer_image']/a/img/@src")[0] - actress.append(name) - actress_pics[name] = pic_url - duration_tag = info.xpath("div[@class='mv_duration']/text()") - if duration_tag: - movie.duration = str(strftime_to_minutes(duration_tag[0])) - - publish_date = info.xpath("a[@class='mv_createDate']/text()") - if publish_date: - movie.publish_date = publish_date[0] - producer = info.xpath("a[@class='mv_writer']/text()") - if producer: - movie.producer = producer[0] - - movie.title = title.replace(avid, '').strip() - movie.genre = [i.strip() for i in genre] - movie.actress = actress - movie.actress_pics = actress_pics - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - full_id = movie.dvdid - if full_id.startswith('FC2-'): - full_id = full_id.lower().replace('fc2-', 'fc2-ppv-') - url = f"{base_url}/search/movie?str={full_id}" # 海外:品番検索 - else: - url = f"{base_url}/branch/search?sort=jp.movie&str={full_id}" # 国内:品番検索 - r = request_get(url, cookies=cookies, delay_raise=True) - # 404说明曾经有这部影片但是下架了,如果是200但网页内容是No Results说明是完全找不到影片 - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r) - error_string = html.xpath("//div[@class='error_string']/text()") - if error_string: - if error_string[0].strip() == 'No Results': - raise MovieNotFoundError(__name__, movie.dvdid) - - movie.url = r.url - if full_id.startswith('fc2-'): - fc2_parser(movie, html) - else: - normal_parser(movie, html) - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - - movie = MovieInfo('hjmo00214') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1)