Skip to content

Commit

Permalink
feat: support fantia.jp/product
Browse files Browse the repository at this point in the history
支持https://fantia.jp/products/123456这类网站的刮削
要求视频番号前缀为 fantia-product-
如fantia-product-123456
  • Loading branch information
poorguy404 committed Feb 9, 2025
1 parent c4cfe61 commit cc46ff5
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 1 deletion.
9 changes: 8 additions & 1 deletion config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ crawler:
cid: [fanza]
getchu: [dl_getchu]
gyutto: [gyutto]
fantia_product: [fantia_product]
# 爬虫至少要获取到哪些字段才可以视为抓取成功?
required_keys: [cover, title]
# 努力爬取更准确更丰富的信息(会略微增加部分站点的爬取耗时)
Expand Down Expand Up @@ -129,6 +130,7 @@ summarizer:
- '^GANA'
- '^MIUM'
- '^HHL'
- '^FANTIA'
# 要使用的图像识别引擎,详细配置见文档 https://github.com/Yuukiy/JavSP/wiki/AI-%7C-%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB
# NOTE: 此处无法直接对应,请参照注释手动填入
engine: null # null表示禁用图像剪裁
Expand Down Expand Up @@ -187,7 +189,12 @@ translator:
title: true
# 是否翻译剧情简介
plot: true


# Per-site cookies. Note: on Windows 11 the get_browsers_cookies() helper is still broken ("Could not decode to UTF-8 column 'encrypted_value' with text"), so cookies must be filled in manually here.
cookie:
# fantia.jp
fantia: ''

################################
other:
# 是否在stdin/stdout进行交互
Expand Down
7 changes: 7 additions & 0 deletions javsp/avid.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ def get_id(filepath_str: str) -> str:
match = re.search(r'GYUTTO-(\d+)', norm, re.I)
if match:
return 'GYUTTO-' + match.group(1)
elif 'FANTIA-PRODUCT' in norm:
match = re.search(r'FANTIA-PRODUCT[-_]*(\d+)', norm, re.I)
if match:
return 'FANTIA-PRODUCT-' + match.group(1)
elif '259LUXU' in norm: # special case having form of '259luxu'
match = re.search(r'259LUXU-(\d+)', norm, re.I)
if match:
Expand Down Expand Up @@ -141,6 +145,9 @@ def guess_av_type(avid: str) -> str:
match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
if match:
return 'gyutto'
match = re.match(r'^fantia-product-(\d+)',avid,re.I)
if match:
return 'fantia_product'
# 如果传入的avid完全匹配cid的模式,则将影片归类为cid
cid = get_cid(avid)
if cid == avid:
Expand Down
9 changes: 9 additions & 0 deletions javsp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class CrawlerID(str, Enum):
prestige = 'prestige'
arzon = 'arzon'
arzon_iv = 'arzon_iv'
fantia_product = 'fantia_product'

class Network(BaseConfig):
proxy_server: Url | None
Expand All @@ -53,6 +54,7 @@ def items(self) -> List[tuple[str, list[CrawlerID]]]:
('cid', self.cid),
('getchu', self.getchu),
('gyutto', self.gyutto),
('fantia_product',self.fantia_product)
]

def __getitem__(self, index) -> list[CrawlerID]:
Expand All @@ -67,13 +69,16 @@ def __getitem__(self, index) -> list[CrawlerID]:
return self.getchu
case 'gyutto':
return self.gyutto
case 'fantia_product':
return self.fantia_product
raise Exception("Unknown crawler type")

normal: list[CrawlerID]
fc2: list[CrawlerID]
cid: list[CrawlerID]
getchu: list[CrawlerID]
gyutto: list[CrawlerID]
fantia_product: list[CrawlerID]

class MovieInfoField(str, Enum):
dvdid = 'dvdid'
Expand Down Expand Up @@ -210,6 +215,9 @@ class Translator(BaseConfig):
engine: TranslateEngine = Field(..., discriminator='name')
fields: TranslateField

class Cookie(BaseConfig):
fantia: str

class Other(BaseConfig):
interactive: bool
check_update: bool
Expand All @@ -233,5 +241,6 @@ class Cfg(BaseConfig):
crawler: Crawler
summarizer: Summarizer
translator: Translator
cookie: Cookie
other: Other
CONFIG_SOURCES=get_config_source()
136 changes: 136 additions & 0 deletions javsp/web/fantia_product.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""从JavDB抓取数据"""
import os
import re
import logging

import lxml.html

from javsp.web.base import Request, resp2html
from javsp.web.exceptions import *
from javsp.config import Cfg
from javsp.datatype import MovieInfo, GenreMap

# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析
request = Request(use_scraper=False)

cookie = Cfg().cookie.fantia
request.headers['Cookie'] = cookie

logger = logging.getLogger(__name__)
base_url = 'https://fantia.jp/products'


def get_html_wrapper(url):
    """Fetch *url* and return it as an xpath-capable lxml document.

    Args:
        url: absolute product-page URL to request.

    Returns:
        The parsed HTML document (via ``resp2html``).

    Raises:
        ValueError: the configured fantia cookie is empty (login required).
        WebsiteError: the server answered with a non-200 status code.
    """
    # NOTE(review): the original declared `global request, cookies_pool`, but
    # `cookies_pool` does not exist anywhere in this module and nothing here is
    # reassigned, so the dead `global` statement has been removed.
    if not cookie:
        raise ValueError('检测到fantia-product-的影片, 但fantia cookie为空')
    r = request.get(url, delay_raise=True)
    if r.status_code == 200:
        return resp2html(r)
    raise WebsiteError(f'fantia product: {r.status_code} 非预期状态码: {url}')


def parse_data(movie: MovieInfo):
    """Scrape and parse the data for the given product number.

    Args:
        movie (MovieInfo): movie to fill in; parsed fields are written
            directly back into this object.

    Raises:
        ValueError: the dvdid does not start with 'fantia-product-'.
    """
    norm_id = movie.dvdid.lower()  # renamed: `id` shadowed the builtin
    prefix = "fantia-product-"
    if not norm_id.startswith(prefix):
        raise ValueError("Invalid Fantia Product number: " + movie.dvdid)
    fantia_num = norm_id.replace(prefix, '')
    url = f'{base_url}/{fantia_num}'

    try:
        # FIX: annotation was lxml.html.HtmlComment; resp2html yields an element
        html: lxml.html.HtmlElement = get_html_wrapper(url)
    except (SitePermissionError, CredentialError):
        return

    # title — None (not an empty list, as before) when the node is missing
    title_tags = html.xpath("//div[@class='product-header']/h1")
    title = title_tags[0].text_content() if title_tags else None

    # plot — some <p> nodes carry no text (None); the old `+= line.text`
    # raised TypeError in that case
    plot = ''.join(p.text or ''
                   for p in html.xpath("//div[@class='product-description']/div/p"))

    # cover — None when missing; the old code left an empty list behind,
    # which later crashed request.head() in parse_clean_data
    cover_tags = html.xpath("//picture/img[@class='img-fluid ']")
    cover = str(cover_tags[0].get('src')).strip() if cover_tags else None

    # actress (the fanclub owner)
    actress_tags = html.xpath("//h1[@class='fanclub-name']/a")
    actress_str = actress_tags[0].text.strip() if actress_tags else None
    actress = [actress_str] if actress_str else []

    # actress_pic — an avatar is needed so actress_alias.json can match;
    # fantia pages reliably provide one. Guarded so we never key on None.
    # (A leftover debug print() was removed here.)
    actress_pics = {}
    pic_tags = html.xpath("//div[@class='fanclub-header']/a/picture/img")
    if pic_tags and actress_str:
        actress_pics[actress_str] = str(pic_tags[0].get('data-src')).strip()

    # genre — hash-tags in the product header; two possible layouts
    genres = []
    tag_nodes = (html.xpath("//div[@class='product-header']/div/div/a")
                 + html.xpath("//div[@class='product-header']/div/a"))
    for node in tag_nodes:
        genre = str(node.text).removeprefix('#').strip()
        if genre:
            genres.append(genre)

    # preview_pics: not available on fantia product pages

    movie.title = title
    movie.dvdid = norm_id
    movie.url = url
    movie.cover = cover
    movie.plot = plot
    movie.actress = actress
    movie.genre = genres
    movie.actress_pics = actress_pics
    movie.uncensored = False  # no uncensored products observed on fantia


def parse_clean_data(movie: MovieInfo):
    """Parse the data for the given movie number and clean it up.

    SiteBlocked is re-raised for the caller; all other exceptions propagate
    unchanged.
    """
    try:
        parse_data(movie)
        # verify the cover URL actually resolves to an image
        if movie.cover is not None:
            r = request.head(movie.cover)
            if r.status_code != 200:
                movie.cover = None
    except SiteBlocked:
        raise
    # BUG FIX: the original had an unconditional logger.error('unexpected
    # error') here, after the try/except, so it fired on every SUCCESSFUL
    # run; the stray call has been removed.




if __name__ == "__main__":
import pretty_errors

pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG

movie = MovieInfo('fantia-product-648810')
try:
parse_clean_data(movie)
print(movie)
except CrawlerError as e:
logger.error(e, exc_info=1)

0 comments on commit cc46ff5

Please sign in to comment.