From dfc03f689ce674c0a9bc509d735b5530e8bc1c83 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sat, 30 Dec 2023 23:33:46 +0100 Subject: [PATCH 1/8] introduce `MAX_REDIRECTS` config setting and fix urllib3 redirect handling Fixes issue #450 After setting `MAX_REDIRECTS` to 5, I could fetch the original URL from the issue: `trafilatura -u https://www.hydrogeninsight.com/production/breaking-us-reveals-the-seven-regional-hydrogen-hubs-to-receive-7bn-of-government-funding/2-1-1534596` I also fixed this old issue: https://github.com/adbar/trafilatura/issues/128 The underlying urllib3 bug has not been fixed: https://github.com/urllib3/urllib3/issues/2475 I had to pass the retry strategy to the actual request method: it doesn't propagate from the pool maanger --- docs/settings.rst | 1 + tests/downloads_tests.py | 14 ++++++++++++-- trafilatura/downloads.py | 20 ++++++++++++++------ trafilatura/settings.cfg | 2 ++ 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/docs/settings.rst b/docs/settings.rst index 367cf2d8..5a2e7b7d 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -43,6 +43,7 @@ The default file included in the package is `settings.cfg `_ to be followed. Set to 0 to not follow any redirection. Using a custom file on the command-line diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index 867e5ccd..3f2434fa 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -33,7 +33,7 @@ _send_pycurl_request, _send_request, _urllib3_is_live_page, add_to_compressed_dict, fetch_url, - is_live_page, load_download_buffer) + is_live_page, load_download_buffer, _reset_global_objects) from trafilatura.settings import DEFAULT_CONFIG, use_config from trafilatura.utils import decode_response, load_html @@ -94,7 +94,17 @@ def test_fetch(): assert load_html(response) is not None # nothing to see here assert extract(response, url=response.url, config=ZERO_CONFIG) is None - + # test handling redirects + res = fetch_url('https://httpbun.com/redirect/2') + assert len(res) > 100 # We followed redirects and downloaded something in the end + new_config = use_config() # get a new config instance to avoid mutating the default one + # Patch max directs: limit to 0. We won't fetch any page as a result + new_config.set('DEFAULT', 'MAX_REDIRECTS', '0') + _reset_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value + res = fetch_url('http://httpbin.org/redirect/1', config=new_config) + assert res is None + # Also test max redir implementation on pycurl + assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None def test_config(): '''Test how configuration options are read and stored.''' diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index 10201aac..f929de2d 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -43,7 +43,6 @@ PKG_VERSION = version("trafilatura") NUM_CONNECTIONS = 50 -MAX_REDIRECTS = 2 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) HTTP_POOL = None @@ -90,8 +89,8 @@ def _send_request(url, no_ssl, config): global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY if not RETRY_STRATEGY: RETRY_STRATEGY = urllib3.util.Retry( - total=0, - redirect=MAX_REDIRECTS, # raise_on_redirect=False, + total=config.getint("DEFAULT", "MAX_REDIRECTS"), + redirect=config.getint("DEFAULT", "MAX_REDIRECTS"), # raise_on_redirect=False, connect=0, backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2, status_forcelist=[ @@ -107,13 +106,13 @@ def _send_request(url, no_ssl, config): if not HTTP_POOL: HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS) # cert_reqs='CERT_REQUIRED' # execute request - response = HTTP_POOL.request('GET', url, headers=_determine_headers(config)) + response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY) else: # define pool if not NO_CERT_POOL: NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS) # execute request - response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config)) + response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY) except urllib3.exceptions.SSLError: LOGGER.warning('retrying after SSLError: %s', url) return _send_request(url, True, config) @@ -275,7 +274,7 @@ def _send_pycurl_request(url, no_ssl, config): curl.setopt(pycurl.HTTPHEADER, headerlist) # curl.setopt(pycurl.USERAGENT, '') curl.setopt(pycurl.FOLLOWLOCATION, 1) - curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS) + curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS')) curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')) curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')) curl.setopt(pycurl.NOSIGNAL, 1) @@ -327,3 +326,12 @@ def _send_pycurl_request(url, no_ssl, config): # tidy up curl.close() return RawResponse(bufferbytes, respcode, effective_url) + + +def _reset_global_objects(): + """ + Force global objects to be re-created + Currently only used during tests + """ + global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY + HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY = None, None, None diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg index dfd753b9..1bbfc669 100644 --- a/trafilatura/settings.cfg +++ b/trafilatura/settings.cfg @@ -12,6 +12,8 @@ SLEEP_TIME = 5 USER_AGENTS = # cookie for HTTP requests COOKIE = +# Maximum number of redirects that we will follow +MAX_REDIRECTS = 5 # Extraction MIN_EXTRACTED_SIZE = 250 From b49d3f43bb1db880ff4cb4ffce0108a7fc8f0bf6 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sat, 30 Dec 2023 23:54:39 +0100 Subject: [PATCH 2/8] use httpbin everywhere instead of httpbun --- tests/downloads_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index 3f2434fa..3d80280b 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -95,7 +95,7 @@ def test_fetch(): # nothing to see here assert extract(response, url=response.url, config=ZERO_CONFIG) is None # test handling redirects - res = fetch_url('https://httpbun.com/redirect/2') + res = fetch_url('http://httpbin.org/redirect/2') assert len(res) > 100 # We followed redirects and downloaded something in the end new_config = use_config() # get a new config instance to avoid mutating the default one # Patch max directs: limit to 0. We won't fetch any page as a result @@ -103,8 +103,9 @@ def test_fetch(): _reset_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value res = fetch_url('http://httpbin.org/redirect/1', config=new_config) assert res is None - # Also test max redir implementation on pycurl - assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None + # Also test max redir implementation on pycurl if available + if pycurl is not None: + assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None def test_config(): '''Test how configuration options are read and stored.''' From a4bfb76b7c453db1cabcc4130aa6eb424ea9c0c9 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sat, 30 Dec 2023 23:57:35 +0100 Subject: [PATCH 3/8] restore MAX_REDIRECTS defaut config value to 2 --- trafilatura/settings.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg index 1bbfc669..160f5909 100644 --- a/trafilatura/settings.cfg +++ b/trafilatura/settings.cfg @@ -13,7 +13,7 @@ USER_AGENTS = # cookie for HTTP requests COOKIE = # Maximum number of redirects that we will follow -MAX_REDIRECTS = 5 +MAX_REDIRECTS = 2 # Extraction MIN_EXTRACTED_SIZE = 250 From e94ba123be22b162a945f98625a817fd922aadcc Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sun, 31 Dec 2023 00:00:41 +0100 Subject: [PATCH 4/8] reset global objects after test_fetch to avoid side-effects --- tests/downloads_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index 3d80280b..f2c9cc1d 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -106,6 +106,7 @@ def test_fetch(): # Also test max redir implementation on pycurl if available if pycurl is not None: assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None + _reset_global_objects() # reset global objects again to avoid affecting other tests def test_config(): '''Test how configuration options are read and stored.''' From ba1fc80e43f6dd4bc4aeba889b838881c2215d1d Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sun, 31 Dec 2023 00:11:10 +0100 Subject: [PATCH 5/8] pin lxml to < 5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 12ccac58..bcae4109 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ def get_long_description(): "htmldate >= 1.6.0", "importlib_metadata; python_version < '3.8'", "justext >= 3.0.0", - "lxml >= 4.9.3 ; platform_system != 'Darwin'", + "lxml >= 4.9.3, < 5 ; platform_system != 'Darwin'", "lxml == 4.9.2 ; platform_system == 'Darwin'", "urllib3 >= 1.26, < 2; python_version < '3.7'", "urllib3 >= 1.26, < 3; python_version >= '3.7'", From a023195822ac5d3e8581003778dc1a04211a3eab Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Sun, 31 Dec 2023 10:19:21 +0100 Subject: [PATCH 6/8] test no_ssl True & False to fix coverage --- tests/downloads_tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index f2c9cc1d..b72d1cb3 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -70,8 +70,9 @@ def test_fetch(): assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None # no SSL, no decoding url = 'https://httpbun.com/status/200' - response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG) - assert response.data == b'' + for no_ssl in (True, False): + response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG) + assert response.data == b'' if pycurl is not None: response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG) assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG) From 178a44195af480294d62e4d3a6a3766cd9dc0929 Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Thu, 4 Jan 2024 14:15:07 +0100 Subject: [PATCH 7/8] move _reset_global_objects() to the test file --- tests/downloads_tests.py | 13 ++++++++++--- trafilatura/downloads.py | 9 --------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index b72d1cb3..f35192ad 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -27,13 +27,14 @@ from trafilatura.cli_utils import (download_queue_processing, url_processing_pipeline) from trafilatura.core import extract +import trafilatura.downloads from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, _determine_headers, _handle_response, _parse_config, _pycurl_is_live_page, _send_pycurl_request, _send_request, _urllib3_is_live_page, add_to_compressed_dict, fetch_url, - is_live_page, load_download_buffer, _reset_global_objects) + is_live_page, load_download_buffer) from trafilatura.settings import DEFAULT_CONFIG, use_config from trafilatura.utils import decode_response, load_html @@ -47,6 +48,12 @@ UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg')) +def _reset_downloads_global_objects(): + """ + Force global objects to be re-created + """ + trafilatura.downloads.HTTP_POOL, trafilatura.downloads.NO_CERT_POOL, trafilatura.downloads.RETRY_STRATEGY = None, None, None + def test_fetch(): '''Test URL fetching.''' # logic: empty request? @@ -101,13 +108,13 @@ def test_fetch(): new_config = use_config() # get a new config instance to avoid mutating the default one # Patch max directs: limit to 0. We won't fetch any page as a result new_config.set('DEFAULT', 'MAX_REDIRECTS', '0') - _reset_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value + _reset_downloads_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value res = fetch_url('http://httpbin.org/redirect/1', config=new_config) assert res is None # Also test max redir implementation on pycurl if available if pycurl is not None: assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None - _reset_global_objects() # reset global objects again to avoid affecting other tests + _reset_downloads_global_objects() # reset global objects again to avoid affecting other tests def test_config(): '''Test how configuration options are read and stored.''' diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index f929de2d..bdb1c88c 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -326,12 +326,3 @@ def _send_pycurl_request(url, no_ssl, config): # tidy up curl.close() return RawResponse(bufferbytes, respcode, effective_url) - - -def _reset_global_objects(): - """ - Force global objects to be re-created - Currently only used during tests - """ - global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY - HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY = None, None, None From 3019f9dd867bd48f1ce212d411e6b374bdb2a22a Mon Sep 17 00:00:00 2001 From: Vincent Barbaresi Date: Thu, 4 Jan 2024 14:17:20 +0100 Subject: [PATCH 8/8] rewrite assignment in multiple lines for readability --- tests/downloads_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index f35192ad..b494a33b 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -52,7 +52,9 @@ def _reset_downloads_global_objects(): """ Force global objects to be re-created """ - trafilatura.downloads.HTTP_POOL, trafilatura.downloads.NO_CERT_POOL, trafilatura.downloads.RETRY_STRATEGY = None, None, None + trafilatura.downloads.HTTP_POOL = None + trafilatura.downloads.NO_CERT_POOL = None + trafilatura.downloads.RETRY_STRATEGY = None def test_fetch(): '''Test URL fetching.'''