From 5af5f45f6f2c810d8fe50a94b586d869cdd58278 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 18 Jan 2024 14:30:37 +0100
Subject: [PATCH] introduce advanced download function (scaffolding)
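This patch adds a Response class and a fetch_response() function alongside
fetch_url(), and renames _send_request() to _send_urllib_request(). A rough
usage sketch of the new entry point (the URL is only illustrative):

    from trafilatura.downloads import fetch_response, fetch_url

    # fetch_url() still returns the decoded HTML string by default
    html = fetch_url("https://example.org")

    # fetch_response() returns a Response object, or None on failure
    resp = fetch_response("https://example.org", decode=True)
    if resp is not None:
        print(resp.status)            # HTTP status code
        print(len(resp.data))         # raw payload in bytes
        print(resp.html is not None)  # decoded string, set when decode=True

Calling fetch_url(..., decode=False) or decode_response() now emits a
PendingDeprecationWarning; fetch_response() and decode_file() are the
replacements.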
---
tests/downloads_tests.py | 15 +++++----
trafilatura/downloads.py | 69 ++++++++++++++++++++++++++--------------
trafilatura/utils.py | 16 ++++++----
3 files changed, 64 insertions(+), 36 deletions(-)
diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py
index b494a33b..23b61bd2 100644
--- a/tests/downloads_tests.py
+++ b/tests/downloads_tests.py
@@ -31,12 +31,12 @@
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
_determine_headers, _handle_response,
_parse_config, _pycurl_is_live_page,
- _send_pycurl_request, _send_request,
+ _send_pycurl_request, _send_urllib_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
-from trafilatura.utils import decode_response, load_html
+from trafilatura.utils import decode_file, decode_response, load_html
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -59,7 +59,7 @@ def _reset_downloads_global_objects():
def test_fetch():
'''Test URL fetching.'''
# logic: empty request?
- assert _send_request('', True, DEFAULT_CONFIG) is None
+ assert _send_urllib_request('', True, DEFAULT_CONFIG) is None
# is_live general tests
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
@@ -74,13 +74,13 @@ def test_fetch():
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
- # assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
+ # assert _send_urllib_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.com/status/200'
for no_ssl in (True, False):
- response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
+ response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
@@ -144,14 +144,15 @@ def test_decode():
mock = Mock()
mock.data = b' '
assert decode_response(mock) is not None
+ assert decode_file(mock) is not None
# GZip
html_string = "
ABC
"
gz_string = gzip.compress(html_string.encode("utf-8"))
- assert decode_response(gz_string) == html_string
+ assert decode_response(gz_string) == html_string == decode_file(gz_string)
# Brotli
if brotli is not None:
brotli_string = brotli.compress(html_string.encode("utf-8"))
- assert decode_response(brotli_string) == html_string
+ assert decode_file(brotli_string) == html_string
def test_queue():
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index bdb1c88c..7fffac68 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -6,11 +6,13 @@
import logging
import random
-from collections import namedtuple
+import warnings
+
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
import certifi
+import urllib3
try:
import pycurl
@@ -24,7 +26,6 @@
except ImportError:
pycurl = None
-import urllib3
from courlan import UrlStore
from courlan.network import redirection_test
@@ -35,12 +36,11 @@
from .settings import DEFAULT_CONFIG
-from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
- uniquify_list)
+from .utils import (URL_BLACKLIST_REGEX, decode_file,
+ make_chunks, uniquify_list)
LOGGER = logging.getLogger(__name__)
-PKG_VERSION = version("trafilatura")
NUM_CONNECTIONS = 50
@@ -50,10 +50,18 @@
RETRY_STRATEGY = None
DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
-USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
+USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT
-RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])
+
+class Response:
+ __slots__ = ["data", "html", "status", "url"]
+
+ def __init__(self, data, status, url):
+ self.data = data
+ self.html = None
+ self.status = status
+ self.url = url
# caching throws an error
@@ -83,7 +91,7 @@ def _determine_headers(config, headers=None):
return headers or DEFAULT_HEADERS
-def _send_request(url, no_ssl, config):
+def _send_urllib_request(url, no_ssl, config):
"Internal function to robustly send a request (SSL or not) and return its result."
# customize headers
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
@@ -115,59 +123,74 @@ def _send_request(url, no_ssl, config):
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
- return _send_request(url, True, config)
+ return _send_urllib_request(url, True, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
else:
# necessary for standardization
- return RawResponse(response.data, response.status, response.geturl())
+ return Response(response.data, response.status, response.geturl())
# catchall
return None
def _handle_response(url, response, decode, config):
'Internal function to run safety checks on response result.'
+ lentest = len(response.html or "") if decode else len(response.data or "")
if response.status != 200:
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
- elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
+ elif response.data is None or lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
LOGGER.error('too small/incorrect for URL %s', url)
# raise error instead?
- elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
+ elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
# raise error instead?
else:
- return decode_response(response.data) if decode is True else response
+ return response.html if decode else response
# catchall
return None
def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
- """Fetches page using urllib3 and decodes the response.
+ """Fetches page using urllib3 or pycurl and decodes the response.
Args:
url: URL of the page to fetch.
- decode: Decode response instead of returning urllib3 response object (boolean).
+ decode: Decode response instead of returning response object (boolean).
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
config: Pass configuration values for output control.
Returns:
- RawResponse object: data (headers + body), status (HTML code as string) and url
+ Response object: data (headers + body), status (HTTP status code) and url
or None in case the result is invalid or there was a problem with the network.
"""
- LOGGER.debug('sending request: %s', url)
- if pycurl is None:
- response = _send_request(url, no_ssl, config)
- else:
- response = _send_pycurl_request(url, no_ssl, config)
+ if not decode:
+ warnings.warn(
+ """Raw response objects will be deprecated for fetch_url,
+ use fetch_response instead.""",
+ PendingDeprecationWarning
+ )
+ response = fetch_response(url, decode, no_ssl, config)
if response is not None and response != '':
return _handle_response(url, response, decode, config)
# return '' (useful to discard further processing?)
# return response
- LOGGER.debug('request failed: %s', url)
return None
+def fetch_response(url, decode=False, no_ssl=False, config=DEFAULT_CONFIG):
+ "Fetches page using urllib3 or pycurl and returns a raw response object."
+ dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
+ LOGGER.debug('sending request: %s', url)
+ response = dl_function(url, no_ssl, config) # Response
+ if not response: # None or ""
+ LOGGER.debug('request failed: %s', url)
+ return None
+ if decode and response.data:
+ response.html = decode_file(response.data)
+ return response
+
+
def _pycurl_is_live_page(url):
"Send a basic HTTP HEAD request with pycurl."
# Initialize pycurl object
@@ -325,4 +348,4 @@ def _send_pycurl_request(url, no_ssl, config):
# tidy up
curl.close()
- return RawResponse(bufferbytes, respcode, effective_url)
+ return Response(bufferbytes, respcode, effective_url)
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 7cc49ff5..dcaf5db9 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -9,6 +9,7 @@
# import csv
import logging
import re
+import warnings
# if brotli is installed
try:
@@ -129,17 +130,20 @@ def detect_encoding(bytesobject):
return [g for g in guesses if g not in UNICODE_ALIASES]
-def decode_response(response):
+def decode_response(content):
"""Read the urllib3 object corresponding to the server response,
- check if it could be GZip and eventually decompress it, then
try to guess its encoding and decode it to return a unicode string"""
- # urllib3 response object / bytes switch
- resp_content = response if isinstance(response, bytes) else response.data
- return decode_file(resp_content)
+ warnings.warn(
+ """Raw response objects will be deprecated for fetch_url,
+ use fetch_response instead.""",
+ PendingDeprecationWarning
+ )
+ return decode_file(content)
def decode_file(filecontent):
- """Guess bytestring encoding and try to decode to Unicode string.
+ """Check if the bytestring could be GZip and eventually decompress it,
+ guess bytestring encoding and try to decode to Unicode string.
Resort to destructive conversion otherwise."""
# init
if isinstance(filecontent, str):