Change: switch to curl_cffi, fix oh #393

Merged 2 commits on Nov 20, 2024
5 changes: 3 additions & 2 deletions .github/workflows/build.yml
@@ -7,6 +7,7 @@ jobs:
- uses: actions/setup-python@v2
with:
python-version: '3.10'
- run: pip install wheel
- run: cat requirements-lock.txt | xargs -n 1 pip install --no-deps || exit 0
- run: pip install -U pip wheel
- run: cat requirements.txt | xargs -n 1 pip install || exit 0
- run: pip install .
- run: python cute.py test
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ dist

.venv
test*.*
temp
2 changes: 1 addition & 1 deletion comiccrawler/analyzer.py
@@ -138,7 +138,7 @@ def analyze_pages(self):
print('Analyzing {}...'.format(url))
sleep(getattr(self.mission.module, "rest_analyze", 0))
r = urlparse(self.mission.url)
self.html = self.grabber.html(url, retry=True, header={
self.html = self.grabber.html(url, retry=True, headers={
"Referer": self.mission.url,
"Origin": f"{r.scheme}://{r.netloc}"
})
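
Note: the keyword passed to the grabber changes from `header=` to `headers=` throughout, matching the name curl_cffi (like requests) expects, so the dict can be forwarded untouched to the session call. A minimal sketch of the call-site pattern after the rename; anything about the grabber/grabhtml signatures beyond what this diff shows is assumed:

```python
# Hedged sketch: building the per-request headers the way analyzer.py/crawler.py now do.
# Assumes the grabber forwards **kwargs (including headers=) to the curl_cffi session.
from urllib.parse import urlparse

def referer_headers(mission_url: str) -> dict:
    """Build the Referer/Origin headers used when fetching an episode page."""
    r = urlparse(mission_url)
    return {
        "Referer": mission_url,
        "Origin": f"{r.scheme}://{r.netloc}",
    }

# html = grabber.html(url, retry=True, headers=referer_headers(mission.url))
```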
2 changes: 1 addition & 1 deletion comiccrawler/crawler.py
@@ -220,7 +220,7 @@ def get_html(self):
self.html = True
else:
r = urlparse(self.mission.url)
self.html = self.downloader.html(self.ep.current_url, header={
self.html = self.downloader.html(self.ep.current_url, headers={
"Referer": self.mission.url,
"Origin": f"{r.scheme}://{r.netloc}"
})
3 changes: 2 additions & 1 deletion comiccrawler/error.py
@@ -1,4 +1,5 @@
from requests import HTTPError
# from requests import HTTPError
from curl_cffi.requests.exceptions import HTTPError

class ComicCrawlerSignal(BaseException):
"""Extend BaseException."""
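
Note: error.py keeps exporting the name HTTPError, only the source module changes, so existing `from ..error import HTTPError` imports keep working. A hedged sketch of the catch pattern the rest of the PR relies on; the `err.response.status_code` attribute is the same one grabber.py checks below, everything else about curl_cffi's exception layout here is an assumption of its requests-compatible API:

```python
# Sketch: catching the curl_cffi HTTPError the same way the requests version was caught.
from curl_cffi import requests
from curl_cffi.requests.exceptions import HTTPError

def fetch_or_none(url: str):
    try:
        r = requests.get(url)
        r.raise_for_status()  # raises HTTPError on 4xx/5xx
        return r
    except HTTPError as err:
        # err.response carries the failed response, as in requests
        if err.response is not None and err.response.status_code == 404:
            return None
        raise
```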
58 changes: 14 additions & 44 deletions comiccrawler/grabber.py
@@ -5,15 +5,15 @@
from threading import Lock
from urllib.parse import quote, urlsplit, urlunsplit, urlparse
import re
import socket
# import socket
import time
import json

import enlighten
import requests
from worker import WorkerExit, async_, await_, sleep, Defer
from urllib3.util import is_fp_closed
# from urllib3.util import is_fp_closed
from urllib3.exceptions import IncompleteRead
from curl_cffi.requests.exceptions import HTTPError

from .config import setting
from .io import content_write
@@ -74,24 +74,19 @@ def grabber_log(obj):
content = time.strftime("%Y-%m-%dT%H:%M:%S%z") + "\n" + json.dumps(obj, indent=2, sort_keys=True) + "\n\n"
content_write(profile("grabber.log"), content, append=True)

def grabber(url, header=None, *, referer=None, cookie=None,
retry=False, done=None, proxy=None, **kwargs):
def grabber(url, *, referer=None, retry=False, done=None, proxy=None, **kwargs):
"""Request url, return text or bytes of the content."""
s = session_manager.get(url)

if referer:
s.headers['Referer'] = quote_unicode(referer)

if cookie:
quote_unicode_dict(cookie)
requests.utils.add_dict_to_cookiejar(s.cookies, cookie)

if isinstance(proxy, str):
proxies = {'http': proxy, 'https': proxy}
else:
proxies = proxy

r = await_(do_request, s, url, proxies, retry, headers=header, **kwargs)
r = await_(do_request, s, url, proxies, retry, **kwargs)

if done:
done(s, r)
@@ -116,27 +111,13 @@ def do_request(s, url, proxies, retry, **kwargs):
})

if r.status_code in SUCCESS_CODES:
content_length = r.headers.get("Content-Length")
if not kwargs.get("stream", False) and content_length and int(content_length) != r.raw.tell():
raise ValueError(
"incomplete response. Content-Length: {content_length}, got: {actual}"
.format(content_length=content_length, actual=r.raw.tell())
)
break
if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
r.raise_for_status()
# 302 error without location header
if r.status_code == 302:
# pylint: disable=protected-access
match = re.search(
r"^location:\s*(.+)",
str(r.raw._original_response.msg),
re.M + re.I
)
if not match:
raise TypeError("status 302 without location header")
url = match.group(1)
continue
raise TypeError("status 302 without location header")
print(r)
print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
sleep(sleep_time)
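
Note: do_request drops the manual Content-Length check and the hand-rolled 302 handling that poked `r.raw._original_response`; libcurl under curl_cffi follows redirects and reports truncated transfers itself, so the loop reduces to break on success, raise on non-retryable status, otherwise sleep and retry. A rough sketch of that reduced pattern; the status-code sets are placeholders assumed from the surrounding file, not the real values:

```python
# Sketch of the simplified retry loop, assuming curl_cffi handles redirects
# and incomplete transfers internally.
import time

SUCCESS_CODES = (200, 206)          # placeholder values; the real sets live at the top of grabber.py
RETRYABLE_HTTP_CODES = (423, 429, 503)

def request_with_retry(session, url, retry=False, sleep_time=5, **kwargs):
    while True:
        r = session.get(url, **kwargs)
        if r.status_code in SUCCESS_CODES:
            return r
        if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
            r.raise_for_status()
            # e.g. a 302 that libcurl could not follow: not an HTTP error, but not usable either
            raise TypeError(f"unexpected status {r.status_code}")
        time.sleep(sleep_time)
```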
@@ -160,19 +141,9 @@ def guess_encoding(r):

def iter_content(r):
"""Iterate the content of the response."""
# FIXME: requests streaming is so broken wtf
# https://github.com/psf/requests/issues/5536
# https://github.com/urllib3/urllib3/issues/2123
if r.raw.chunked and r.raw.supports_chunked_reads():
yield from r.raw.read_chunked(decode_content=True)
else:
while not is_fp_closed(r.raw._fp) or len(r.raw._decoded_buffer) > 0: # pylint: disable=protected-access
b = r.raw.read1(decode_content=True)
yield b
if not b:
sleep(0.1)
yield from r.iter_content()

def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
def grabimg(*args, on_opened=None, tempfile=None, headers=None, **kwargs):
"""Grab the image. Return ImgResult"""
kwargs["stream"] = True
loaded = 0
@@ -182,12 +153,12 @@ def grabimg(*args, on_opened=None, tempfile=None, header=None, **kwargs):
except FileNotFoundError:
pass
if loaded:
if not header:
header = {}
header["Range"] = f"bytes={loaded}-"
if not headers:
headers = {}
headers["Range"] = f"bytes={loaded}-"
try:
r = grabber(*args, header=header, **kwargs)
except requests.HTTPError as err:
r = grabber(*args, headers=headers, **kwargs)
except HTTPError as err:
if err.response.status_code != 416:
raise err
try:
@@ -227,8 +198,7 @@ def _():
counter.update(len(chunk))
loaded += len(chunk)
except WorkerExit:
socket.close(r.raw._fp.fileno()) # pylint: disable=protected-access
r.raw.release_conn()
r.close()
raise
if total and loaded < total:
raise IncompleteRead(loaded, total - loaded)
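
Note: grabimg keeps its resume behaviour under curl_cffi: with `loaded` bytes already on disk it sends `Range: bytes={loaded}-`, treats HTTP 416 as "already complete", streams the rest via `r.iter_content()`, and on interruption simply calls `r.close()` instead of closing the raw urllib3 socket. A compact sketch of that flow; `stream=True`, `iter_content()`, `r.close()` and the 416 check all match what the diff uses, the rest is an assumed minimal harness:

```python
# Sketch: resuming a partial download with a Range header over curl_cffi.
from pathlib import Path
from curl_cffi import requests
from curl_cffi.requests.exceptions import HTTPError

def resume_download(url: str, tempfile: str) -> None:
    part = Path(tempfile)
    loaded = part.stat().st_size if part.exists() else 0
    headers = {"Range": f"bytes={loaded}-"} if loaded else {}
    s = requests.Session()
    try:
        r = s.get(url, headers=headers, stream=True)
        r.raise_for_status()
    except HTTPError as err:
        if err.response is not None and err.response.status_code == 416:
            return  # requested range starts at or past EOF: file already complete
        raise
    try:
        with part.open("ab") as f:
            for chunk in r.iter_content():
                f.write(chunk)
    finally:
        r.close()
```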
70 changes: 56 additions & 14 deletions comiccrawler/mods/oh.py
@@ -13,7 +13,7 @@

from ..core import Episode, grabhtml

domain = ["www.ohmanhua.com", "www.cocomanhua.com"]
domain = ["www.ohmanhua.com", "www.cocomanhua.com", "www.colamanga.com"]
name = "OH漫畫"

def get_title(html, url):
@@ -48,7 +48,7 @@ def __str__(self):
scripts = ScriptCache()

def get_images(html, url):
cdata = re.search("var C_DATA='[^']+'", html).group(0)
cdata = re.search("var C_DATA=('[^']+')", html).group(1)

scripts.fetch(html, url, [
"\/l\.js",
@@ -58,31 +58,71 @@ def get_images(html, url):
])

code = """
const _log = console.log;

Function.prototype.toString = (function(_toString) {
return function() {
return _toString.apply(this, arguments).replace(/\\r?\\n/g, '');
}
})(Function.prototype.toString);

self.setInterval = function() {};

self.eval = function(_eval) {
return function() {
_log('eval', arguments[0]);
return _eval.apply(this, arguments);
};
}(self.eval);

self.convertWordArrayToUint8Array =
self.convertUint8ArrayToWordArray =
self.__b_a =
self.__cad =
self.__js =
undefined;

(function() {

let _cookies = "";

function noop(path = "") {
if (path === "document.cookie") return "";
if (path === "document.cookie") return _cookies;
if (path === "$.inArray") return (v, a) => a.indexOf(v);

return new Proxy(() => {}, {
apply: () => noop("?"),
get: (target, prop) => noop(`${path}.${prop}`)
apply: () => noop(`${path}.called`),
get: (target, prop) => {
const propPath = typeof prop == "symbol" ? `${path}.${String(prop)}` : `${path}.${prop}`;
if (propPath == "document.domain") return "www.colamanga.com";
_log("get", propPath);
return noop(propPath);
},
set: (target, prop, value) => {
const propPath = `${path}.${prop}`;
if (propPath == "document.cookie") {
_cookies += value.split(";")[0] + "; ";
}
_log(propPath, value);
return value;
}
});
}

const exports = undefined;
const window = global;
window.location = {

self.window = self;
self.location = {
protocol: "http://",
href: '""" + url + """'
}
const navigator = {
self.navigator = {
userAgent: ""
};
const document = noop("document")
const $ = noop("$");
self.document = noop("document")
self.$ = noop("$");
self.devtools = noop("devtools");
self.localStorage = noop("localStorage");

""" + cdata + "\n" + str(scripts) + """
self.C_DATA = """ + cdata + "\n" + str(scripts) + """

window.use_domain = {
},
@@ -108,8 +148,10 @@ class Image {
__cr.preLoadImg(i++)
} while (dirty);
return imgs;
}).call(global);
}).call(self);
"""

# import pathlib
# pathlib.Path("oh0.mjs").write_text(code, encoding="utf-8")
imgs = eval(code)
return [urljoin(url, i) for i in imgs]
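
Two notes on the oh.py rewrite: the C_DATA regex now captures only the quoted payload (group(1)) so it can be spliced after `self.C_DATA = ` inside the sandbox instead of pasting the whole `var C_DATA=...` declaration, and the DOM stub grows from a silent noop into a Proxy that records document.cookie writes, answers document.domain, and logs property access. A tiny illustration of the capture-group change only; the page snippet is hypothetical:

```python
# Sketch: why the regex switched from group(0) to group(1).
import re

html = "...var C_DATA='abc123'..."  # hypothetical page snippet

whole = re.search(r"var C_DATA='[^']+'", html).group(0)      # "var C_DATA='abc123'"
payload = re.search(r"var C_DATA=('[^']+')", html).group(1)  # "'abc123'"

# The captured quoted string is what gets appended after "self.C_DATA = "
# when the sandbox code is assembled.
print(whole, payload)
```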
4 changes: 2 additions & 2 deletions comiccrawler/mods/setnmh.py
@@ -31,7 +31,7 @@ def get_episodes(html, url):
"order_by": "1",
"chapter_type": "1"
},
header = {
headers = {
"X-Requested-With": "XMLHttpRequest"
}
)
@@ -74,7 +74,7 @@ def get_images(html, url):
"chapter_id": chapter_id,
"page": page
},
header = {
headers = {
"X-Requested-With": "XMLHttpRequest"
}
)
27 changes: 14 additions & 13 deletions comiccrawler/module_grabber.py
@@ -1,4 +1,4 @@
from requests.utils import dict_from_cookiejar
# from requests.utils import dict_from_cookiejar

from .grabber import grabhtml, grabimg

@@ -19,8 +19,8 @@ def img(self, url, **kwargs):

def grab(self, grab_method, url=None, **kwargs):
new_kwargs = {
"header": self.get_header(),
"cookie": purify_cookie(self.get_cookie()),
"headers": self.get_header(),
"cookies": purify_cookie(self.get_cookie()),
"done": self.handle_grab,
"proxy": self.mod.config.get("proxy"),
"verify": self.mod.config.getboolean("verify", True)
@@ -50,13 +50,14 @@ def get_cookie(self):
return cookie

def handle_grab(self, session, _response):
cookie = dict_from_cookiejar(session.cookies)
config = getattr(self.mod, "config", None)
if not config:
return

for key in config:
if key.startswith("cookie_"):
name = key[7:]
if name in cookie:
config[key] = cookie[name]
pass
# cookie = dict_from_cookiejar(session.cookies)
# config = getattr(self.mod, "config", None)
# if not config:
# return
#
# for key in config:
# if key.startswith("cookie_"):
# name = key[7:]
# if name in cookie:
# config[key] = cookie[name]
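
Note: module_grabber now hands the configured cookies to each request through the `cookies=` keyword instead of loading them into the session jar, and the cookie write-back in handle_grab is stubbed out because `dict_from_cookiejar` belonged to requests. If that write-back is ever restored, a hedged sketch of one way to rebuild the dict from a curl_cffi session, assuming its cookies object exposes the underlying stdlib jar as `.jar` (the same attribute util.get_cookie relies on below):

```python
# Sketch: recovering a name -> value dict from a curl_cffi session's cookies,
# assuming session.cookies wraps a stdlib CookieJar as its .jar attribute.
def dict_from_session_cookies(session) -> dict:
    jar = getattr(session.cookies, "jar", session.cookies)
    return {cookie.name: cookie.value for cookie in jar}

# With such a dict, the commented-out loop above could be revived:
# for key in config:
#     if key.startswith("cookie_") and key[7:] in cookies:
#         config[key] = cookies[key[7:]]
```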
2 changes: 1 addition & 1 deletion comiccrawler/session_manager.py
@@ -2,7 +2,7 @@
from threading import Lock
from typing import Callable, Any

from requests import Session as RequestsSession
from curl_cffi.requests import Session as RequestsSession

from .util import extract_curl

7 changes: 5 additions & 2 deletions comiccrawler/util.py
@@ -2,9 +2,10 @@
import string
from functools import total_ordering
from pathlib import Path
from requests.cookies import RequestsCookieJar
from http.cookiejar import CookieJar

import uncurl
import curl_cffi.requests.cookies

def dump(html):
Path("dump.html").write_text(html, encoding="utf-8")
@@ -100,7 +101,9 @@ def balance(s: str, index: int, left="(", right=")", skip=0):

return s[start:end]

def get_cookie(cookie_jar: RequestsCookieJar, name, domain=None) -> str:
def get_cookie(cookie_jar: CookieJar | curl_cffi.requests.cookies.Cookies, name, domain=None) -> str:
if hasattr(cookie_jar, "jar"):
cookie_jar = cookie_jar.jar
l = [cookie for cookie in cookie_jar if cookie.name == name]
def key(cookie):
if not domain or not cookie.domain:
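
Note: util.get_cookie now accepts either a plain http.cookiejar.CookieJar or curl_cffi's cookies wrapper, and unwraps the latter via its `.jar` attribute before filtering by name and domain. A short self-contained sketch of that unwrapping, with the `.jar` attribute assumed from curl_cffi's wrapper (the diff itself only checks for it with hasattr):

```python
# Sketch: the unwrapping get_cookie now performs, assuming curl_cffi's Cookies
# object exposes the underlying http.cookiejar.CookieJar as .jar.
from http.cookiejar import CookieJar

def unwrap_jar(cookie_jar):
    """Return a plain CookieJar whether given a CookieJar or a curl_cffi cookies wrapper."""
    return cookie_jar.jar if hasattr(cookie_jar, "jar") else cookie_jar

jar = CookieJar()
assert unwrap_jar(jar) is jar  # plain jars pass through unchanged
# With a curl_cffi session: unwrap_jar(session.cookies) yields the stdlib jar,
# which get_cookie can iterate to find a cookie by name and domain.
```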