Skip to content

Commit

Permalink
feat: support fantia.jp/product
Browse files Browse the repository at this point in the history
支持https://fantia.jp/products/123456这类网站的刮削
要求视频番号前缀为 fantia-product-
如fantia-product-123456
  • Loading branch information
poorguy404 committed Feb 9, 2025
1 parent c4cfe61 commit cc46ff5
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 1 deletion.
9 changes: 8 additions & 1 deletion config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ crawler:
cid: [fanza]
getchu: [dl_getchu]
gyutto: [gyutto]
fantia_product: [fantia_product]
# 爬虫至少要获取到哪些字段才可以视为抓取成功?
required_keys: [cover, title]
# 努力爬取更准确更丰富的信息(会略微增加部分站点的爬取耗时)
Expand Down Expand Up @@ -129,6 +130,7 @@ summarizer:
- '^GANA'
- '^MIUM'
- '^HHL'
- '^FANTIA'
# 要使用的图像识别引擎,详细配置见文档 https://github.com/Yuukiy/JavSP/wiki/AI-%7C-%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB
# NOTE: 此处无法直接对应,请参照注释手动填入
engine: null # null表示禁用图像剪裁
Expand Down Expand Up @@ -187,7 +189,12 @@ translator:
title: true
# 是否翻译剧情简介
plot: true


# Per-site cookies. Note: on Windows 11 the get_browsers_cookies() helper is still broken ("Could not decode to UTF-8 column 'encrypted_value' with text"), so cookies must be filled in manually here.
cookie:
# fantia.jp
fantia: ''

################################
other:
# 是否在stdin/stdout进行交互
Expand Down
7 changes: 7 additions & 0 deletions javsp/avid.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ def get_id(filepath_str: str) -> str:
match = re.search(r'GYUTTO-(\d+)', norm, re.I)
if match:
return 'GYUTTO-' + match.group(1)
elif 'FANTIA-PRODUCT' in norm:
match = re.search(r'FANTIA-PRODUCT[-_]*(\d+)', norm, re.I)
if match:
return 'FANTIA-PRODUCT-' + match.group(1)
elif '259LUXU' in norm: # special case having form of '259luxu'
match = re.search(r'259LUXU-(\d+)', norm, re.I)
if match:
Expand Down Expand Up @@ -141,6 +145,9 @@ def guess_av_type(avid: str) -> str:
match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
if match:
return 'gyutto'
match = re.match(r'^fantia-product-(\d+)',avid,re.I)
if match:
return 'fantia_product'
# 如果传入的avid完全匹配cid的模式,则将影片归类为cid
cid = get_cid(avid)
if cid == avid:
Expand Down
9 changes: 9 additions & 0 deletions javsp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class CrawlerID(str, Enum):
prestige = 'prestige'
arzon = 'arzon'
arzon_iv = 'arzon_iv'
fantia_product = 'fantia_product'

class Network(BaseConfig):
proxy_server: Url | None
Expand All @@ -53,6 +54,7 @@ def items(self) -> List[tuple[str, list[CrawlerID]]]:
('cid', self.cid),
('getchu', self.getchu),
('gyutto', self.gyutto),
('fantia_product',self.fantia_product)
]

def __getitem__(self, index) -> list[CrawlerID]:
Expand All @@ -67,13 +69,16 @@ def __getitem__(self, index) -> list[CrawlerID]:
return self.getchu
case 'gyutto':
return self.gyutto
case 'fantia_product':
return self.fantia_product
raise Exception("Unknown crawler type")

normal: list[CrawlerID]
fc2: list[CrawlerID]
cid: list[CrawlerID]
getchu: list[CrawlerID]
gyutto: list[CrawlerID]
fantia_product: list[CrawlerID]

class MovieInfoField(str, Enum):
dvdid = 'dvdid'
Expand Down Expand Up @@ -210,6 +215,9 @@ class Translator(BaseConfig):
engine: TranslateEngine = Field(..., discriminator='name')
fields: TranslateField

class Cookie(BaseConfig):
fantia: str

class Other(BaseConfig):
interactive: bool
check_update: bool
Expand All @@ -233,5 +241,6 @@ class Cfg(BaseConfig):
crawler: Crawler
summarizer: Summarizer
translator: Translator
cookie: Cookie
other: Other
CONFIG_SOURCES=get_config_source()
136 changes: 136 additions & 0 deletions javsp/web/fantia_product.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""从JavDB抓取数据"""
import os
import re
import logging

import lxml.html

from javsp.web.base import Request, resp2html
from javsp.web.exceptions import *
from javsp.config import Cfg
from javsp.datatype import MovieInfo, GenreMap

# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析
request = Request(use_scraper=False)

cookie = Cfg().cookie.fantia
request.headers['Cookie'] = cookie

logger = logging.getLogger(__name__)
base_url = 'https://fantia.jp/products'


def get_html_wrapper(url):
    """Fetch *url* and return it as an xpath-capable lxml document.

    Args:
        url: absolute product-page URL to request.

    Returns:
        The parsed HTML document (via ``resp2html``).

    Raises:
        ValueError: the configured fantia cookie is empty (login required).
        WebsiteError: the server answered with a non-200 status code.
    """
    # NOTE(review): the original declared `global request, cookies_pool`, but
    # `cookies_pool` does not exist anywhere in this module and nothing here is
    # reassigned, so the dead `global` statement has been removed.
    if not cookie:
        raise ValueError('检测到fantia-product-的影片, 但fantia cookie为空')
    r = request.get(url, delay_raise=True)
    if r.status_code == 200:
        return resp2html(r)
    raise WebsiteError(f'fantia product: {r.status_code} 非预期状态码: {url}')


def parse_data(movie: MovieInfo):
    """Scrape and parse the data for the given product number.

    Args:
        movie (MovieInfo): movie to fill in; parsed fields are written
            directly back into this object.

    Raises:
        ValueError: the dvdid does not start with 'fantia-product-'.
    """
    norm_id = movie.dvdid.lower()  # renamed: `id` shadowed the builtin
    prefix = "fantia-product-"
    if not norm_id.startswith(prefix):
        raise ValueError("Invalid Fantia Product number: " + movie.dvdid)
    fantia_num = norm_id.replace(prefix, '')
    url = f'{base_url}/{fantia_num}'

    try:
        # FIX: annotation was lxml.html.HtmlComment; resp2html yields an element
        html: lxml.html.HtmlElement = get_html_wrapper(url)
    except (SitePermissionError, CredentialError):
        return

    # title — None (not an empty list, as before) when the node is missing
    title_tags = html.xpath("//div[@class='product-header']/h1")
    title = title_tags[0].text_content() if title_tags else None

    # plot — some <p> nodes carry no text (None); the old `+= line.text`
    # raised TypeError in that case
    plot = ''.join(p.text or ''
                   for p in html.xpath("//div[@class='product-description']/div/p"))

    # cover — None when missing; the old code left an empty list behind,
    # which later crashed request.head() in parse_clean_data
    cover_tags = html.xpath("//picture/img[@class='img-fluid ']")
    cover = str(cover_tags[0].get('src')).strip() if cover_tags else None

    # actress (the fanclub owner)
    actress_tags = html.xpath("//h1[@class='fanclub-name']/a")
    actress_str = actress_tags[0].text.strip() if actress_tags else None
    actress = [actress_str] if actress_str else []

    # actress_pic — an avatar is needed so actress_alias.json can match;
    # fantia pages reliably provide one. Guarded so we never key on None.
    # (A leftover debug print() was removed here.)
    actress_pics = {}
    pic_tags = html.xpath("//div[@class='fanclub-header']/a/picture/img")
    if pic_tags and actress_str:
        actress_pics[actress_str] = str(pic_tags[0].get('data-src')).strip()

    # genre — hash-tags in the product header; two possible layouts
    genres = []
    tag_nodes = (html.xpath("//div[@class='product-header']/div/div/a")
                 + html.xpath("//div[@class='product-header']/div/a"))
    for node in tag_nodes:
        genre = str(node.text).removeprefix('#').strip()
        if genre:
            genres.append(genre)

    # preview_pics: not available on fantia product pages

    movie.title = title
    movie.dvdid = norm_id
    movie.url = url
    movie.cover = cover
    movie.plot = plot
    movie.actress = actress
    movie.genre = genres
    movie.actress_pics = actress_pics
    movie.uncensored = False  # no uncensored products observed on fantia


def parse_clean_data(movie: MovieInfo):
    """Parse the data for the given movie number and clean it up.

    SiteBlocked is re-raised for the caller; all other exceptions propagate
    unchanged.
    """
    try:
        parse_data(movie)
        # verify the cover URL actually resolves to an image
        if movie.cover is not None:
            r = request.head(movie.cover)
            if r.status_code != 200:
                movie.cover = None
    except SiteBlocked:
        raise
    # BUG FIX: the original had an unconditional logger.error('unexpected
    # error') here, after the try/except, so it fired on every SUCCESSFUL
    # run; the stray call has been removed.




if __name__ == "__main__":
import pretty_errors

pretty_errors.configure(display_link=True)
logger.root.handlers[1].level = logging.DEBUG

movie = MovieInfo('fantia-product-648810')
try:
parse_clean_data(movie)
print(movie)
except CrawlerError as e:
logger.error(e, exc_info=1)

0 comments on commit cc46ff5

Please sign in to comment.