diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py
index b494a33b..23b61bd2 100644
--- a/tests/downloads_tests.py
+++ b/tests/downloads_tests.py
@@ -31,12 +31,12 @@
 from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
                                    _determine_headers, _handle_response,
                                    _parse_config, _pycurl_is_live_page,
-                                   _send_pycurl_request, _send_request,
+                                   _send_pycurl_request, _send_urllib_request,
                                    _urllib3_is_live_page, add_to_compressed_dict,
                                    fetch_url, is_live_page, load_download_buffer)
 from trafilatura.settings import DEFAULT_CONFIG, use_config
-from trafilatura.utils import decode_response, load_html
+from trafilatura.utils import decode_file, decode_response, load_html
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
@@ -59,7 +59,7 @@ def _reset_downloads_global_objects():
 def test_fetch():
     '''Test URL fetching.'''
     # logic: empty request?
-    assert _send_request('', True, DEFAULT_CONFIG) is None
+    assert _send_urllib_request('', True, DEFAULT_CONFIG) is None
 
     # is_live general tests
     assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
@@ -74,13 +74,13 @@ def test_fetch():
     assert fetch_url('https://httpbun.com/status/404') is None
     # test if the functions default to no_ssl
     # doesn't work?
-    # assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
+    # assert _send_urllib_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
     if pycurl is not None:
         assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
     # no SSL, no decoding
     url = 'https://httpbun.com/status/200'
     for no_ssl in (True, False):
-        response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
+        response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
         assert response.data == b''
     if pycurl is not None:
         response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
@@ -144,14 +144,15 @@ def test_decode():
     mock = Mock()
     mock.data = b' '
     assert decode_response(mock) is not None
+    assert decode_file(mock) is not None
     # GZip
     html_string = "<html><head/><body><div>ABC</div></body></html>"
" gz_string = gzip.compress(html_string.encode("utf-8")) - assert decode_response(gz_string) == html_string + assert decode_response(gz_string) == html_string == decode_file(gz_string) # Brotli if brotli is not None: brotli_string = brotli.compress(html_string.encode("utf-8")) - assert decode_response(brotli_string) == html_string + assert decode_file(brotli_string) == html_string def test_queue(): diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index bdb1c88c..7fffac68 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -6,11 +6,13 @@ import logging import random -from collections import namedtuple +import warnings + from concurrent.futures import ThreadPoolExecutor, as_completed from time import sleep import certifi +import urllib3 try: import pycurl @@ -24,7 +26,6 @@ except ImportError: pycurl = None -import urllib3 from courlan import UrlStore from courlan.network import redirection_test @@ -35,12 +36,11 @@ from .settings import DEFAULT_CONFIG -from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks, - uniquify_list) +from .utils import (URL_BLACKLIST_REGEX, decode_file, + make_chunks, uniquify_list) LOGGER = logging.getLogger(__name__) -PKG_VERSION = version("trafilatura") NUM_CONNECTIONS = 50 @@ -50,10 +50,18 @@ RETRY_STRATEGY = None DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True) -USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)' +USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)' DEFAULT_HEADERS['User-Agent'] = USER_AGENT -RawResponse = namedtuple('RawResponse', ['data', 'status', 'url']) + +class Response: + __slots__ = ["data", "html", "status", "url"] + + def __init__(self, data, status, url): + self.data = data + self.html = None + self.status = status + self.url = url # caching throws an error @@ -83,7 +91,7 @@ def _determine_headers(config, headers=None): return headers or DEFAULT_HEADERS -def _send_request(url, no_ssl, config): +def _send_urllib_request(url, no_ssl, config): "Internal function to robustly send a request (SSL or not) and return its result." # customize headers global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY @@ -115,59 +123,74 @@ def _send_request(url, no_ssl, config): response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY) except urllib3.exceptions.SSLError: LOGGER.warning('retrying after SSLError: %s', url) - return _send_request(url, True, config) + return _send_urllib_request(url, True, config) except Exception as err: LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0] else: # necessary for standardization - return RawResponse(response.data, response.status, response.geturl()) + return Response(response.data, response.status, response.geturl()) # catchall return None def _handle_response(url, response, decode, config): 'Internal function to run safety checks on response result.' + lentest = len(response.html or "") if decode else len(response.data or "") if response.status != 200: LOGGER.error('not a 200 response: %s for URL %s', response.status, url) - elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'): + elif response.data is None or lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'): LOGGER.error('too small/incorrect for URL %s', url) # raise error instead? 
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index bdb1c88c..7fffac68 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -6,11 +6,13 @@
 import logging
 import random
 
-from collections import namedtuple
+import warnings
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from time import sleep
 
 import certifi
+import urllib3
 
 try:
     import pycurl
@@ -24,7 +26,6 @@ except ImportError:
     pycurl = None
 
-import urllib3
 from courlan import UrlStore
 from courlan.network import redirection_test
 
@@ -35,12 +36,11 @@
 from .settings import DEFAULT_CONFIG
-from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
-                    uniquify_list)
+from .utils import (URL_BLACKLIST_REGEX, decode_file,
+                    make_chunks, uniquify_list)
 
 LOGGER = logging.getLogger(__name__)
 
-PKG_VERSION = version("trafilatura")
 
 NUM_CONNECTIONS = 50
 
@@ -50,10 +50,18 @@
 RETRY_STRATEGY = None
 
 DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
-USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
+USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
 DEFAULT_HEADERS['User-Agent'] = USER_AGENT
 
-RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])
+
+class Response:
+    __slots__ = ["data", "html", "status", "url"]
+
+    def __init__(self, data, status, url):
+        self.data = data
+        self.html = None
+        self.status = status
+        self.url = url
 
 
 # caching throws an error
@@ -83,7 +91,7 @@ def _determine_headers(config, headers=None):
     return headers or DEFAULT_HEADERS
 
 
-def _send_request(url, no_ssl, config):
+def _send_urllib_request(url, no_ssl, config):
     "Internal function to robustly send a request (SSL or not) and return its result."
     # customize headers
     global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
@@ -115,59 +123,74 @@ def _send_request(url, no_ssl, config):
             response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
     except urllib3.exceptions.SSLError:
         LOGGER.warning('retrying after SSLError: %s', url)
-        return _send_request(url, True, config)
+        return _send_urllib_request(url, True, config)
     except Exception as err:
         LOGGER.error('download error: %s %s', url, err)  # sys.exc_info()[0]
     else:
         # necessary for standardization
-        return RawResponse(response.data, response.status, response.geturl())
+        return Response(response.data, response.status, response.geturl())
     # catchall
     return None
 
 
 def _handle_response(url, response, decode, config):
     'Internal function to run safety checks on response result.'
+    lentest = len(response.html or "") if decode else len(response.data or "")
     if response.status != 200:
         LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-    elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
+    elif response.data is None or lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
         LOGGER.error('too small/incorrect for URL %s', url)
         # raise error instead?
-    elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
+    elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
         LOGGER.error('too large: length %s for URL %s', len(response.data), url)
         # raise error instead?
     else:
-        return decode_response(response.data) if decode is True else response
+        return response.html if decode else response
     # catchall
     return None
 
 
 def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
-    """Fetches page using urllib3 and decodes the response.
+    """Fetches page using urllib3 or pycurl and decodes the response.
 
     Args:
         url: URL of the page to fetch.
-        decode: Decode response instead of returning urllib3 response object (boolean).
+        decode: Decode response instead of returning response object (boolean).
         no_ssl: Don't try to establish a secure connection (to prevent SSLError).
         config: Pass configuration values for output control.
 
     Returns:
-        RawResponse object: data (headers + body), status (HTML code as string) and url
+        Response object: data (headers + body), status (HTTP code as string) and url
         or None in case the result is invalid or there was a problem with the network.
 
     """
-    LOGGER.debug('sending request: %s', url)
-    if pycurl is None:
-        response = _send_request(url, no_ssl, config)
-    else:
-        response = _send_pycurl_request(url, no_ssl, config)
+    if not decode:
+        warnings.warn(
+            """Raw response objects will be deprecated for fetch_url,
+               use fetch_response instead.""",
+            PendingDeprecationWarning
+        )
+    response = fetch_response(url, decode, no_ssl, config)
     if response is not None and response != '':
         return _handle_response(url, response, decode, config)
         # return '' (useful do discard further processing?)
         # return response
-    LOGGER.debug('request failed: %s', url)
     return None
 
 
+def fetch_response(url, decode=False, no_ssl=False, config=DEFAULT_CONFIG):
+    "Fetches page using urllib3 or pycurl and returns a raw response object."
+    dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
+    LOGGER.debug('sending request: %s', url)
+    response = dl_function(url, no_ssl, config)  # Response
+    if not response:  # None or ""
+        LOGGER.debug('request failed: %s', url)
+        return None
+    if decode and response.data:
+        response.html = decode_file(response.data)
+    return response
+
+
 def _pycurl_is_live_page(url):
     "Send a basic HTTP HEAD request with pycurl."
     # Initialize pycurl object
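Taken together, the hunks above split downloading into a decoded path (fetch_url) and a raw path (fetch_response). A usage sketch under the new API, assuming this patch is applied (the URL is illustrative):

```python
# Sketch of the split API introduced above.
from trafilatura import fetch_url
from trafilatura.downloads import fetch_response

# decoded path: unchanged behavior, returns the page as a str (or None)
html = fetch_url("https://example.org")  # illustrative URL

# raw path: returns a Response object instead of the old RawResponse
# namedtuple; .html is only populated when decode=True
response = fetch_response("https://example.org", decode=True)
if response is not None and response.status == 200:
    print(response.url, len(response.data), type(response.html))
```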
@@ -325,4 +348,4 @@ def _send_pycurl_request(url, no_ssl, config):
     # tidy up
     curl.close()
-    return RawResponse(bufferbytes, respcode, effective_url)
+    return Response(bufferbytes, respcode, effective_url)
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 7cc49ff5..dcaf5db9 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -9,6 +9,7 @@
 # import csv
 import logging
 import re
+import warnings
 
 # if brotli is installed
 try:
@@ -129,17 +130,20 @@ def detect_encoding(bytesobject):
     return [g for g in guesses if g not in UNICODE_ALIASES]
 
 
-def decode_response(response):
+def decode_response(content):
     """Read the urllib3 object corresponding to the server response,
-       check if it could be GZip and eventually decompress it, then
        try to guess its encoding and decode it to return a unicode string"""
-    # urllib3 response object / bytes switch
-    resp_content = response if isinstance(response, bytes) else response.data
-    return decode_file(resp_content)
+    warnings.warn(
+        """Raw response objects will be deprecated for fetch_url,
+           use fetch_response instead.""",
+        PendingDeprecationWarning
+    )
+    return decode_file(content)
 
 
 def decode_file(filecontent):
-    """Guess bytestring encoding and try to decode to Unicode string.
+    """Check if the bytestring could be GZip and eventually decompress it,
+       guess bytestring encoding and try to decode to Unicode string.
        Resort to destructive conversion otherwise."""
     # init
     if isinstance(filecontent, str):
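For callers of the deprecated helper, the migration path suggested by the warning is a one-line change. A sketch with illustrative input, assuming this patch is applied:

```python
# Sketch: decode_response still works on bytes but now warns;
# decode_file is the drop-in replacement.
import warnings

from trafilatura.utils import decode_file, decode_response

payload = b"<html><body><p>ABC</p></body></html>"  # illustrative bytes

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_result = decode_response(payload)  # emits PendingDeprecationWarning
new_result = decode_file(payload)          # warning-free equivalent
assert old_result == new_result
assert any(issubclass(w.category, PendingDeprecationWarning) for w in caught)
```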