introduce advanced download function (scaffolding)
adbar committed Jan 18, 2024
1 parent 85cd3d8 commit 5af5f45
Showing 3 changed files with 64 additions and 36 deletions.
15 changes: 8 additions & 7 deletions tests/downloads_tests.py
@@ -31,12 +31,12 @@
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
_determine_headers, _handle_response,
_parse_config, _pycurl_is_live_page,
- _send_pycurl_request, _send_request,
+ _send_pycurl_request, _send_urllib_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
- from trafilatura.utils import decode_response, load_html
+ from trafilatura.utils import decode_file, decode_response, load_html

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -59,7 +59,7 @@ def _reset_downloads_global_objects():
def test_fetch():
'''Test URL fetching.'''
# logic: empty request?
- assert _send_request('', True, DEFAULT_CONFIG) is None
+ assert _send_urllib_request('', True, DEFAULT_CONFIG) is None

# is_live general tests
assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
@@ -74,13 +74,13 @@ def test_fetch():
assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
- # assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
+ # assert _send_urllib_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
url = 'https://httpbun.com/status/200'
for no_ssl in (True, False):
- response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
+ response = _send_urllib_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
@@ -144,14 +144,15 @@ def test_decode():
mock = Mock()
mock.data = b' '
assert decode_response(mock) is not None
+ assert decode_file(mock) is not None
# GZip
html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_string = gzip.compress(html_string.encode("utf-8"))
- assert decode_response(gz_string) == html_string
+ assert decode_response(gz_string) == html_string == decode_file(gz_string)
# Brotli
if brotli is not None:
brotli_string = brotli.compress(html_string.encode("utf-8"))
- assert decode_response(brotli_string) == html_string
+ assert decode_file(brotli_string) == html_string


def test_queue():
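A minimal usage sketch mirroring the updated tests above: decode_file now covers the plain-bytes and GZip cases formerly routed through decode_response (assuming trafilatura with this change is installed; the Brotli branch in the tests additionally requires the optional brotli package).

import gzip

from trafilatura.utils import decode_file

html_string = "<html><head/><body><div>ABC</div></body></html>"
gz_string = gzip.compress(html_string.encode("utf-8"))
# decode_file transparently decompresses GZip input and guesses the encoding
assert decode_file(gz_string) == html_string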
69 changes: 46 additions & 23 deletions trafilatura/downloads.py
@@ -6,11 +6,13 @@

import logging
import random
- from collections import namedtuple
+ import warnings

from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

+ import certifi
+ import urllib3

try:
import pycurl
@@ -24,7 +26,6 @@
except ImportError:
pycurl = None

- import urllib3
from courlan import UrlStore
from courlan.network import redirection_test

@@ -35,12 +36,11 @@


from .settings import DEFAULT_CONFIG
- from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
- uniquify_list)
+ from .utils import (URL_BLACKLIST_REGEX, decode_file,
+ make_chunks, uniquify_list)


LOGGER = logging.getLogger(__name__)
- PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50

@@ -50,10 +50,18 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
- USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
+ USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

- RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])
+ class Response:
+     __slots__ = ["data", "html", "status", "url"]
+
+     def __init__(self, data, status, url):
+         self.data = data
+         self.html = None
+         self.status = status
+         self.url = url


# caching throws an error
@@ -83,7 +91,7 @@ def _determine_headers(config, headers=None):
return headers or DEFAULT_HEADERS


- def _send_request(url, no_ssl, config):
+ def _send_urllib_request(url, no_ssl, config):
"Internal function to robustly send a request (SSL or not) and return its result."
# customize headers
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
@@ -115,59 +123,74 @@ def _send_request(url, no_ssl, config):
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
- return _send_request(url, True, config)
+ return _send_urllib_request(url, True, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
else:
# necessary for standardization
- return RawResponse(response.data, response.status, response.geturl())
+ return Response(response.data, response.status, response.geturl())
# catchall
return None


def _handle_response(url, response, decode, config):
'Internal function to run safety checks on response result.'
+ lentest = len(response.html or "") if decode else len(response.data or "")
if response.status != 200:
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
- elif response.data is None or len(response.data) < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
+ elif response.data is None or lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
LOGGER.error('too small/incorrect for URL %s', url)
# raise error instead?
- elif len(response.data) > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
+ elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
LOGGER.error('too large: length %s for URL %s', len(response.data), url)
# raise error instead?
else:
- return decode_response(response.data) if decode is True else response
+ return response.html if decode else response
# catchall
return None


def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
"""Fetches page using urllib3 and decodes the response.
"""Fetches page using urllib3 or pycurl and decodes the response.
Args:
url: URL of the page to fetch.
- decode: Decode response instead of returning urllib3 response object (boolean).
+ decode: Decode response instead of returning response object (boolean).
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
config: Pass configuration values for output control.
Returns:
- RawResponse object: data (headers + body), status (HTML code as string) and url
+ Response object: data (headers + body), status (HTML code as string) and url
or None in case the result is invalid or there was a problem with the network.
"""
- LOGGER.debug('sending request: %s', url)
- if pycurl is None:
-     response = _send_request(url, no_ssl, config)
- else:
-     response = _send_pycurl_request(url, no_ssl, config)
+ if not decode:
+     warnings.warn(
+         """Raw response objects will be deprecated for fetch_url,
+         use fetch_response instead.""",
+         PendingDeprecationWarning
+     )
+ response = fetch_response(url, decode, no_ssl, config)
if response is not None and response != '':
return _handle_response(url, response, decode, config)
# return '' (useful do discard further processing?)
# return response
LOGGER.debug('request failed: %s', url)
return None


+ def fetch_response(url, decode=False, no_ssl=False, config=DEFAULT_CONFIG):
+     "Fetches page using urllib3 or pycurl and returns a raw response object."
+     dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
+     LOGGER.debug('sending request: %s', url)
+     response = dl_function(url, no_ssl, config)  # Response
+     if not response:  # None or ""
+         LOGGER.debug('request failed: %s', url)
+         return None
+     if decode and response.data:
+         response.html = decode_file(response.data)
+     return response


def _pycurl_is_live_page(url):
"Send a basic HTTP HEAD request with pycurl."
# Initialize pycurl object
@@ -325,4 +348,4 @@ def _send_pycurl_request(url, no_ssl, config):

# tidy up
curl.close()
- return RawResponse(bufferbytes, respcode, effective_url)
+ return Response(bufferbytes, respcode, effective_url)
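A minimal sketch of the new scaffolding in action, based only on what this diff introduces (example.org stands in for any fetchable page): fetch_response returns a Response object whose html attribute is populated only when decode=True, while fetch_url keeps returning a decoded string and now emits a PendingDeprecationWarning when asked for a raw response via decode=False.

from trafilatura.downloads import fetch_response

# hypothetical target URL, used here purely for illustration
response = fetch_response("https://example.org", decode=True)
if response is not None:
    print(response.status)     # HTTP status code, e.g. 200
    print(response.url)        # effective URL reported by the backend
    print(response.html[:60])  # decoded text, set because decode=True

Note that response.html stays None when decode=False or when the body is empty, so callers should check it before slicing.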
16 changes: 10 additions & 6 deletions trafilatura/utils.py
@@ -9,6 +9,7 @@
# import csv
import logging
import re
+ import warnings

# if brotli is installed
try:
@@ -129,17 +130,20 @@ def detect_encoding(bytesobject):
return [g for g in guesses if g not in UNICODE_ALIASES]


- def decode_response(response):
+ def decode_response(content):
"""Read the urllib3 object corresponding to the server response,
check if it could be GZip and eventually decompress it, then
try to guess its encoding and decode it to return a unicode string"""
- # urllib3 response object / bytes switch
- resp_content = response if isinstance(response, bytes) else response.data
- return decode_file(resp_content)
+ warnings.warn(
+     """Raw response objects will be deprecated for fetch_url,
+     use fetch_response instead.""",
+     PendingDeprecationWarning
+ )
+ return decode_file(content)


def decode_file(filecontent):
"""Guess bytestring encoding and try to decode to Unicode string.
"""Check if the bytestring could be GZip and eventually decompress it,
guess bytestring encoding and try to decode to Unicode string.
Resort to destructive conversion otherwise."""
# init
if isinstance(filecontent, str):
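A short sketch of the deprecation path introduced in utils.py (the warning category and text are taken from this diff): decode_response still returns a decoded string by delegating to decode_file, but it now warns callers to migrate to fetch_response.

import warnings

from trafilatura.utils import decode_response

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    text = decode_response(b"<html><body>ABC</body></html>")

assert text == "<html><body>ABC</body></html>"
assert any(issubclass(w.category, PendingDeprecationWarning) for w in caught)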
