diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7f34058..0bda6e2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -7,6 +7,7 @@ jobs:
     - uses: actions/setup-python@v2
       with:
         python-version: '3.10'
-    - run: pip install wheel
-    - run: cat requirements-lock.txt | xargs -n 1 pip install --no-deps || exit 0
+    - run: pip install -U pip wheel
+    - run: cat requirements.txt | xargs -n 1 pip install || exit 0
+    - run: pip install .
     - run: python cute.py test
diff --git a/.gitignore b/.gitignore
index 24cb5f0..08eb851 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 dist
 .venv
 test*.*
+temp
diff --git a/comiccrawler/analyzer.py b/comiccrawler/analyzer.py
index f3c7913..0075b49 100644
--- a/comiccrawler/analyzer.py
+++ b/comiccrawler/analyzer.py
@@ -138,7 +138,7 @@ def analyze_pages(self):
 		print('Analyzing {}...'.format(url))
 		sleep(getattr(self.mission.module, "rest_analyze", 0))
 		r = urlparse(self.mission.url)
-		self.html = self.grabber.html(url, retry=True, header={
+		self.html = self.grabber.html(url, retry=True, headers={
 			"Referer": self.mission.url,
 			"Origin": f"{r.scheme}://{r.netloc}"
 		})
diff --git a/comiccrawler/crawler.py b/comiccrawler/crawler.py
index 8f5e438..dba88f3 100644
--- a/comiccrawler/crawler.py
+++ b/comiccrawler/crawler.py
@@ -220,7 +220,7 @@ def get_html(self):
 			self.html = True
 		else:
 			r = urlparse(self.mission.url)
-			self.html = self.downloader.html(self.ep.current_url, header={
+			self.html = self.downloader.html(self.ep.current_url, headers={
 				"Referer": self.mission.url,
 				"Origin": f"{r.scheme}://{r.netloc}"
 			})
diff --git a/comiccrawler/error.py b/comiccrawler/error.py
index 82020cd..1b6672c 100644
--- a/comiccrawler/error.py
+++ b/comiccrawler/error.py
@@ -1,4 +1,5 @@
-from requests import HTTPError
+# from requests import HTTPError
+from curl_cffi.requests.exceptions import HTTPError
 
 class ComicCrawlerSignal(BaseException):
 	"""Extend BaseException."""
diff --git a/comiccrawler/grabber.py b/comiccrawler/grabber.py
index c43d2f6..34d04b9 100644
--- a/comiccrawler/grabber.py
+++ b/comiccrawler/grabber.py
@@ -5,15 +5,15 @@
 from threading import Lock
 from urllib.parse import quote, urlsplit, urlunsplit, urlparse
 import re
-import socket
+# import socket
 import time
 import json
 
 import enlighten
-import requests
 from worker import WorkerExit, async_, await_, sleep, Defer
-from urllib3.util import is_fp_closed
+# from urllib3.util import is_fp_closed
 from urllib3.exceptions import IncompleteRead
+from curl_cffi.requests.exceptions import HTTPError
 
 from .config import setting
 from .io import content_write
@@ -74,24 +74,19 @@ def grabber_log(obj):
 		content = time.strftime("%Y-%m-%dT%H:%M:%S%z") + "\n" + json.dumps(obj, indent=2, sort_keys=True) + "\n\n"
 		content_write(profile("grabber.log"), content, append=True)
 
-def grabber(url, header=None, *, referer=None, cookie=None,
-		retry=False, done=None, proxy=None, **kwargs):
+def grabber(url, *, referer=None, retry=False, done=None, proxy=None, **kwargs):
 	"""Request url, return text or bytes of the content."""
 	s = session_manager.get(url)
 
 	if referer:
 		s.headers['Referer'] = quote_unicode(referer)
 
-	if cookie:
-		quote_unicode_dict(cookie)
-		requests.utils.add_dict_to_cookiejar(s.cookies, cookie)
-
 	if isinstance(proxy, str):
 		proxies = {'http': proxy, 'https': proxy}
 	else:
 		proxies = proxy
 
-	r = await_(do_request, s, url, proxies, retry, headers=header, **kwargs)
+	r = await_(do_request, s, url, proxies, retry, **kwargs)
 
 	if done:
 		done(s, r)
@@ -116,27 +111,13 @@ def do_request(s, url, proxies, retry, **kwargs):
 		})
 
 		if r.status_code in SUCCESS_CODES:
-			content_length = r.headers.get("Content-Length")
-			if not kwargs.get("stream", False) and content_length and int(content_length) != r.raw.tell():
-				raise ValueError(
-					"incomplete response. Content-Length: {content_length}, got: {actual}"
-					.format(content_length=content_length, actual=r.raw.tell())
-				)
 			break
 		if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
 			r.raise_for_status()
 		# 302 error without location header
 		if r.status_code == 302:
 			# pylint: disable=protected-access
-			match = re.search(
-				r"^location:\s*(.+)",
-				str(r.raw._original_response.msg),
-				re.M + re.I
-			)
-			if not match:
-				raise TypeError("status 302 without location header")
-			url = match.group(1)
-			continue
+			raise TypeError("status 302 without location header")
 		print(r)
 		print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
 		sleep(sleep_time)
@@ -160,19 +141,9 @@ def guess_encoding(r):
 
 def iter_content(r):
 	"""Iterate the content of the response."""
-	# FIXME: requests streaming is so broken wtf
-	# https://github.com/psf/requests/issues/5536
-	# https://github.com/urllib3/urllib3/issues/2123
-	if r.raw.chunked and r.raw.supports_chunked_reads():
-		yield from r.raw.read_chunked(decode_content=True)
-	else:
-		while not is_fp_closed(r.raw._fp) or len(r.raw._decoded_buffer) > 0: # pylint: disable=protected-access
-			b = r.raw.read1(decode_content=True)
-			yield b
-			if not b:
-				sleep(0.1)
+	yield from r.iter_content()
 
-def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
+def grabimg(*args, on_opened=None, tempfile=None, headers=None, **kwargs):
 	"""Grab the image. Return ImgResult"""
 	kwargs["stream"] = True
 	loaded = 0
@@ -182,12 +153,12 @@ def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
 	except FileNotFoundError:
 		pass
 	if loaded:
-		if not header:
-			header = {}
-		header["Range"] = f"bytes={loaded}-"
+		if not headers:
+			headers = {}
+		headers["Range"] = f"bytes={loaded}-"
 	try:
-		r = grabber(*args, header=header, **kwargs)
-	except requests.HTTPError as err:
+		r = grabber(*args, headers=headers, **kwargs)
+	except HTTPError as err:
 		if err.response.status_code != 416:
 			raise err
 		try:
@@ -227,8 +198,7 @@ def _():
 					counter.update(len(chunk))
 					loaded += len(chunk)
 		except WorkerExit:
-			socket.close(r.raw._fp.fileno()) # pylint: disable=protected-access
-			r.raw.release_conn()
+			r.close()
 			raise
 		if total and loaded < total:
 			raise IncompleteRead(loaded, total - loaded)
diff --git a/comiccrawler/mods/oh.py b/comiccrawler/mods/oh.py
index beba38e..5504c29 100644
--- a/comiccrawler/mods/oh.py
+++ b/comiccrawler/mods/oh.py
@@ -13,7 +13,7 @@
 
 from ..core import Episode, grabhtml
 
-domain = ["www.ohmanhua.com", "www.cocomanhua.com"]
+domain = ["www.ohmanhua.com", "www.cocomanhua.com", "www.colamanga.com"]
 name = "OH漫畫"
 
 def get_title(html, url):
@@ -48,7 +48,7 @@ def __str__(self):
 scripts = ScriptCache()
 
 def get_images(html, url):
-	cdata = re.search("var C_DATA='[^']+'", html).group(0)
+	cdata = re.search("var C_DATA=('[^']+')", html).group(1)
 
 	scripts.fetch(html, url, [
 		"\/l\.js",
@@ -58,31 +58,71 @@
 	])
 
 	code = """
+const _log = console.log;
+
+Function.prototype.toString = (function(_toString) {
+	return function() {
+		return _toString.apply(this, arguments).replace(/\\r?\\n/g, '');
+	}
+})(Function.prototype.toString);
+
+self.setInterval = function() {};
+
+self.eval = function(_eval) {
+	return function() {
+		_log('eval', arguments[0]);
+		return _eval.apply(this, arguments);
+	};
+}(self.eval);
+
+self.convertWordArrayToUint8Array =
+	self.convertUint8ArrayToWordArray =
+	self.__b_a =
+	self.__cad =
+	self.__js =
+	undefined;
+
 (function() {
+	let _cookies = "";
+
 	function noop(path = "") {
-		if (path === "document.cookie") return "";
+		if (path === "document.cookie") return _cookies;
 		if (path === "$.inArray") return (v, a) => a.indexOf(v);
 		return new Proxy(() => {}, {
-			apply: () => noop("?"),
-			get: (target, prop) => noop(`${path}.${prop}`)
+			apply: () => noop(`${path}.called`),
+			get: (target, prop) => {
+				const propPath = typeof prop == "symbol" ? `${path}.${String(prop)}` : `${path}.${prop}`;
+				if (propPath == "document.domain") return "www.colamanga.com";
+				_log("get", propPath);
+				return noop(propPath);
+			},
+			set: (target, prop, value) => {
+				const propPath = `${path}.${prop}`;
+				if (propPath == "document.cookie") {
+					_cookies += value.split(";")[0] + "; ";
+				}
+				_log(propPath, value);
+				return value;
+			}
 		});
 	}
-
-	const exports = undefined;
-	const window = global;
-	window.location = {
+
+	self.window = self;
+	self.location = {
 		protocol: "http://",
 		href: '""" + url + """'
 	}
-	const navigator = {
+	self.navigator = {
 		userAgent: ""
 	};
-	const document = noop("document")
-	const $ = noop("$");
+	self.document = noop("document")
+	self.$ = noop("$");
+	self.devtools = noop("devtools");
+	self.localStorage = noop("localStorage");
 
-	""" + cdata + "\n" + str(scripts) + """
+	self.C_DATA = """ + cdata + "\n" + str(scripts) + """
 
 	window.use_domain = {
 	},
@@ -108,8 +148,10 @@ class Image {
 			__cr.preLoadImg(i++)
 		} while (dirty);
 		return imgs;
-	}).call(global);
+	}).call(self);
 	"""
 
+	# import pathlib
+	# pathlib.Path("oh0.mjs").write_text(code, encoding="utf-8")
 	imgs = eval(code)
 	return [urljoin(url, i) for i in imgs]
diff --git a/comiccrawler/mods/setnmh.py b/comiccrawler/mods/setnmh.py
index fac61f0..fd333f1 100644
--- a/comiccrawler/mods/setnmh.py
+++ b/comiccrawler/mods/setnmh.py
@@ -31,7 +31,7 @@ def get_episodes(html, url):
 			"order_by": "1",
 			"chapter_type": "1"
 		},
-		header = {
+		headers = {
 			"X-Requested-With": "XMLHttpRequest"
 		}
 	)
@@ -74,7 +74,7 @@ def get_images(html, url):
 			"chapter_id": chapter_id,
 			"page": page
 		},
-		header = {
+		headers = {
 			"X-Requested-With": "XMLHttpRequest"
 		}
 	)
diff --git a/comiccrawler/module_grabber.py b/comiccrawler/module_grabber.py
index a20f2f4..5a728e8 100644
--- a/comiccrawler/module_grabber.py
+++ b/comiccrawler/module_grabber.py
@@ -1,4 +1,4 @@
-from requests.utils import dict_from_cookiejar
+# from requests.utils import dict_from_cookiejar
 
 from .grabber import grabhtml, grabimg
 
@@ -19,8 +19,8 @@ def img(self, url, **kwargs):
 
 	def grab(self, grab_method, url=None, **kwargs):
 		new_kwargs = {
-			"header": self.get_header(),
-			"cookie": purify_cookie(self.get_cookie()),
+			"headers": self.get_header(),
+			"cookies": purify_cookie(self.get_cookie()),
 			"done": self.handle_grab,
 			"proxy": self.mod.config.get("proxy"),
 			"verify": self.mod.config.getboolean("verify", True)
@@ -50,13 +50,14 @@ def get_cookie(self):
 		return cookie
 
 	def handle_grab(self, session, _response):
-		cookie = dict_from_cookiejar(session.cookies)
-		config = getattr(self.mod, "config", None)
-		if not config:
-			return
-
-		for key in config:
-			if key.startswith("cookie_"):
-				name = key[7:]
-				if name in cookie:
-					config[key] = cookie[name]
+		pass
+		# cookie = dict_from_cookiejar(session.cookies)
+		# config = getattr(self.mod, "config", None)
+		# if not config:
+		# 	return
+		#
+		# for key in config:
+		# 	if key.startswith("cookie_"):
+		# 		name = key[7:]
+		# 		if name in cookie:
+		# 			config[key] = cookie[name]
diff --git a/comiccrawler/session_manager.py b/comiccrawler/session_manager.py
index b56827a..a8f33b9 100644
--- a/comiccrawler/session_manager.py
+++ b/comiccrawler/session_manager.py
@@ -2,7 +2,7 @@
 from threading import Lock
 from typing import Callable, Any
 
-from requests import Session as RequestsSession
+from curl_cffi.requests import Session as RequestsSession
 
 from .util import extract_curl
 
diff --git a/comiccrawler/util.py b/comiccrawler/util.py
index bc617e9..b987d37 100644
--- a/comiccrawler/util.py
+++ b/comiccrawler/util.py
@@ -2,9 +2,10 @@
 import string
 from functools import total_ordering
 from pathlib import Path
-from requests.cookies import RequestsCookieJar
+from http.cookiejar import CookieJar
 
 import uncurl
+import curl_cffi.requests.cookies
 
 def dump(html):
 	Path("dump.html").write_text(html, encoding="utf-8")
@@ -100,7 +101,9 @@ def balance(s: str, index: int, left="(", right=")", skip=0):
 
 	return s[start:end]
 
-def get_cookie(cookie_jar: RequestsCookieJar, name, domain=None) -> str:
+def get_cookie(cookie_jar: CookieJar | curl_cffi.requests.cookies.Cookies, name, domain=None) -> str:
+	if hasattr(cookie_jar, "jar"):
+		cookie_jar = cookie_jar.jar
 	l = [cookie for cookie in cookie_jar if cookie.name == name]
 	def key(cookie):
 		if not domain or not cookie.domain:
diff --git a/requirements-lock.txt b/requirements-lock.txt
index 207553c..4cf9f60 100644
--- a/requirements-lock.txt
+++ b/requirements-lock.txt
@@ -5,8 +5,10 @@ bidict==0.23.1
 blessed==1.20.0
 Brotli==1.1.0
 certifi==2024.8.30
+cffi==1.17.1
 charset-normalizer==3.4.0
 colorama==0.4.6
+curl_cffi==0.7.3
 deno_vm==0.6.0
 desktop3==0.5.3
 dill==0.3.9
@@ -26,14 +28,13 @@ markdown-it-py==3.0.0
 mccabe==0.7.0
 mdurl==0.1.2
 more-itertools==10.5.0
-mutagen==1.47.0
-natsort==6.2.1
 nh3==0.2.18
 ordered-set==3.1.1
 pkginfo==1.10.0
 platformdirs==4.3.6
 prefixed==0.9.0
 puremagic==1.28
+pycparser==2.22
 pycryptodomex==3.21.0
 Pygments==2.18.0
 pylint==3.3.1
@@ -51,7 +52,6 @@ semver==2.13.0
 Send2Trash==1.8.3
 setuptools==75.5.0
 six==1.16.0
-tomli==2.0.1
 tomlkit==0.13.2
 tornado==6.4.1
 twine==5.1.1
@@ -59,7 +59,6 @@ typing_extensions==4.8.0
 uncurl==0.0.11
 urllib3==2.2.3
 wcwidth==0.2.13
-websockets==13.1
 win_unicode_console==0.5
 yt-dlp==2024.11.4
 zipp==3.21.0
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index aca8c77..0fa6083 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,7 @@ install_requires =
 	belfrywidgets~=1.0
 	bidict~=0.23.1
 	brotli~=1.1
+	curl_cffi~=0.7.3
 	deno-vm~=0.6.0
 	desktop3~=0.5.3
 	docopt~=0.6.2
@@ -38,7 +39,6 @@ install_requires =
 	puremagic~=1.28
 	pycryptodomex~=3.21
 	pythreadworker~=0.10.0
-	requests~=2.32
 	safeprint~=0.2.0
 	uncurl~=0.0.11
 	urllib3~=2.2