From cc46ff552db2917a0b58f89ab3b3a1cee33d1ec1 Mon Sep 17 00:00:00 2001
From: poorguy404 <2289129267@qq.com>
Date: Mon, 10 Feb 2025 01:41:17 +0800
Subject: [PATCH] feat: support fantia.jp/product: scrape pages like
 https://fantia.jp/products/123456; requires the video ID to carry the
 prefix fantia-product-, e.g. fantia-product-123456
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.yml                  |   9 ++-
 javsp/avid.py               |   7 ++
 javsp/config.py             |   9 +++
 javsp/web/fantia_product.py | 127 ++++++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 javsp/web/fantia_product.py

diff --git a/config.yml b/config.yml
index 17547fbff..50bde5abe 100644
--- a/config.yml
+++ b/config.yml
@@ -47,6 +47,7 @@ crawler:
     cid: [fanza]
     getchu: [dl_getchu]
     gyutto: [gyutto]
+    fantia_product: [fantia_product]
   # Which fields must a crawler obtain at minimum for a scrape to count as successful?
   required_keys: [cover, title]
   # Work harder to fetch more accurate and richer info (slightly increases crawl time on some sites)
@@ -129,6 +130,7 @@ summarizer:
     - '^GANA'
     - '^MIUM'
     - '^HHL'
+    - '^FANTIA'
   # Image recognition engine to use; see https://github.com/Yuukiy/JavSP/wiki/AI-%7C-%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB for details
   # NOTE: this cannot be mapped over directly; fill it in manually following the comments
   engine: null # null disables image cropping
@@ -187,7 +189,12 @@ translator:
     title: true
     # Whether to translate the plot summary
     plot: true
-
+
+# Cookies for each site. On Windows 11 the get_browsers_cookies() method is still unusable: Could not decode to UTF-8 column 'encrypted_value' with text
+cookie:
+  # fantia.jp
+  fantia: ''
+
 ################################
 other:
   # Whether to interact via stdin/stdout
diff --git a/javsp/avid.py b/javsp/avid.py
index 3ed4a9297..3c584a219 100644
--- a/javsp/avid.py
+++ b/javsp/avid.py
@@ -32,6 +32,10 @@ def get_id(filepath_str: str) -> str:
         match = re.search(r'GYUTTO-(\d+)', norm, re.I)
         if match:
             return 'GYUTTO-' + match.group(1)
+    elif 'FANTIA-PRODUCT' in norm:
+        match = re.search(r'FANTIA-PRODUCT[-_]*(\d+)', norm, re.I)
+        if match:
+            return 'FANTIA-PRODUCT-' + match.group(1)
     elif '259LUXU' in norm:  # special case having form of '259luxu'
         match = re.search(r'259LUXU-(\d+)', norm, re.I)
         if match:
@@ -141,6 +145,9 @@ def guess_av_type(avid: str) -> str:
     match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
     if match:
         return 'gyutto'
+    match = re.match(r'^fantia-product-(\d+)',avid,re.I)
+    if match:
+        return 'fantia_product'
     # If the given avid fully matches the cid pattern, classify the movie as cid
     cid = get_cid(avid)
     if cid == avid:
diff --git a/javsp/config.py b/javsp/config.py
index c491e6621..42ff4b622 100644
--- a/javsp/config.py
+++ b/javsp/config.py
@@ -38,6 +38,7 @@ class CrawlerID(str, Enum):
     prestige = 'prestige'
     arzon = 'arzon'
     arzon_iv = 'arzon_iv'
+    fantia_product = 'fantia_product'
 
 class Network(BaseConfig):
     proxy_server: Url | None
@@ -53,6 +54,7 @@ def items(self) -> List[tuple[str, list[CrawlerID]]]:
             ('cid', self.cid),
             ('getchu', self.getchu),
             ('gyutto', self.gyutto),
+            ('fantia_product', self.fantia_product),
         ]
 
     def __getitem__(self, index) -> list[CrawlerID]:
@@ -67,13 +69,16 @@ def __getitem__(self, index) -> list[CrawlerID]:
                 return self.getchu
             case 'gyutto':
                 return self.gyutto
+            case 'fantia_product':
+                return self.fantia_product
         raise Exception("Unknown crawler type")
 
     normal: list[CrawlerID]
     fc2: list[CrawlerID]
     cid: list[CrawlerID]
     getchu: list[CrawlerID]
     gyutto: list[CrawlerID]
+    fantia_product: list[CrawlerID]
 
 class MovieInfoField(str, Enum):
     dvdid = 'dvdid'
@@ -210,6 +215,9 @@ class Translator(BaseConfig):
     engine: TranslateEngine = Field(..., discriminator='name')
     fields: TranslateField
 
+class Cookie(BaseConfig):
+    fantia: str
+
 class Other(BaseConfig):
     interactive: bool
     check_update: bool
@@ -233,5 +241,6 @@ class Cfg(BaseConfig):
     crawler: Crawler
     summarizer: Summarizer
     translator: Translator
+    cookie: Cookie
     other: Other
 CONFIG_SOURCES=get_config_source()
diff --git a/javsp/web/fantia_product.py b/javsp/web/fantia_product.py
new file mode 100644
index 000000000..fae35bbf1
--- /dev/null
+++ b/javsp/web/fantia_product.py
@@ -0,0 +1,127 @@
+"""Scrape data from fantia.jp product pages"""
+import logging
+
+import lxml.html
+
+from javsp.web.base import Request, resp2html
+from javsp.web.exceptions import *
+from javsp.config import Cfg
+from javsp.datatype import MovieInfo
+
+# Initialize the Request instance (fantia.jp is fetched directly, without the Cloudflare scraper)
+request = Request(use_scraper=False)
+
+cookie = Cfg().cookie.fantia
+request.headers['Cookie'] = cookie
+
+logger = logging.getLogger(__name__)
+base_url = 'https://fantia.jp/products'
+
+
+def get_html_wrapper(url):
+    """Wrap the outgoing request and convert the response into an XPath-capable HTML element, handling problems such as an invalid cookie"""
+    if len(cookie) == 0:
+        raise ValueError('Found a fantia-product- movie, but the fantia cookie is empty')
+    r = request.get(url, delay_raise=True)
+    if r.status_code == 200:
+        return resp2html(r)
+    else:
+        raise WebsiteError(f'fantia product: unexpected status code {r.status_code}: {url}')
+
+
+def parse_data(movie: MovieInfo):
+    """Fetch and parse the data for the given movie ID
+    Args:
+        movie (MovieInfo): the movie to parse; parsed fields are written back into this object
+    """
+    id = movie.dvdid.lower()
+    prefix = 'fantia-product-'
+    if not id.startswith(prefix):
+        raise ValueError('Invalid Fantia product number: ' + movie.dvdid)
+    fantia_num = id.removeprefix(prefix)
+    url = f'{base_url}/{fantia_num}'
+
+    try:
+        html: lxml.html.HtmlElement = get_html_wrapper(url)
+    except (SitePermissionError, CredentialError):
+        return
+
+    # title
+    title = html.xpath("//div[@class='product-header']/h1")
+    if len(title) > 0:
+        title = title[0].text_content()
+
+    # plot
+    plot = ''
+    for line in html.xpath("//div[@class='product-description']/div/p"):
+        plot += line.text_content()
+
+    # cover
+    cover = html.xpath("//picture/img[@class='img-fluid ']")
+    if len(cover) > 0:
+        cover = str(cover[0].get('src')).strip()
+
+    # actress
+    actress = html.xpath("//h1[@class='fanclub-name']/a")
+    actress_str = actress[0].text_content().strip() if actress else None
+    actress = [actress_str] if actress_str else []
+
+    # actress_pic
+    # actress_alias.json needs an avatar for each actress; fortunately fantia always provides one
+    actress_pics = {}
+    actress_pic = html.xpath("//div[@class='fanclub-header']/a/picture/img")
+    if len(actress_pic) > 0:
+        actress_pic = str(actress_pic[0].get('data-src')).strip()
+        actress_pics[actress_str] = actress_pic
+
+    # genre
+    genres = []
+    tags_1 = html.xpath("//div[@class='product-header']/div/div/a")
+    for genre in tags_1:
+        genre = genre.text_content().removeprefix('#').strip()
+        if len(genre) > 0:
+            genres.append(genre)
+    tags_2 = html.xpath("//div[@class='product-header']/div/a")
+    for genre in tags_2:
+        genre = genre.text_content().removeprefix('#').strip()
+        if len(genre) > 0:
+            genres.append(genre)
+
+    # preview_pics
+
+    movie.title = title
+    movie.dvdid = id
+    movie.url = url
+    movie.cover = cover
+    movie.plot = plot
+    movie.actress = actress
+    movie.genre = genres
+    movie.actress_pics = actress_pics
+    movie.uncensored = False  # uncensored content has not been seen on fantia
+
+
+def parse_clean_data(movie: MovieInfo):
+    """Parse the data for the given movie ID and clean it up"""
+    try:
+        parse_data(movie)
+        # Check that the cover URL actually points to an existing image
+        if movie.cover is not None:
+            r = request.head(movie.cover)
+            if r.status_code != 200:
+                movie.cover = None
+    except SiteBlocked:
+        raise
+
+
+if __name__ == "__main__":
+    import pretty_errors
+
+    pretty_errors.configure(display_link=True)
+    logger.root.handlers[1].level = logging.DEBUG
+
+    movie = MovieInfo('fantia-product-648810')
+    try:
+        parse_clean_data(movie)
+        print(movie)
+    except CrawlerError as e:
+        logger.error(e, exc_info=1)
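
Note: a quick end-to-end sanity check of the new ID handling (an illustrative
sketch, not part of the patch; it assumes a JavSP checkout with this patch
applied and a valid fantia cookie filled into config.yml, and the file path is
made up for the example):

    from javsp.avid import get_id, guess_av_type
    from javsp.datatype import MovieInfo
    from javsp.web.fantia_product import parse_clean_data

    # Filename recognition: the fantia-product- prefix maps to the new crawler
    avid = get_id('/downloads/fantia-product-648810.mp4')
    assert avid == 'FANTIA-PRODUCT-648810'
    assert guess_av_type(avid) == 'fantia_product'

    # Scrape one product page (network access and the fantia cookie required)
    movie = MovieInfo('fantia-product-648810')
    parse_clean_data(movie)
    print(movie.title, movie.cover)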