From 0ad353d62a7ce0ba2177e38401cc2a222bdfd7f1 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 8 Feb 2024 15:03:21 +0100
Subject: [PATCH 1/2] sitemaps: use safeguards

---
 trafilatura/cli_utils.py |  8 ++++++--
 trafilatura/sitemaps.py  | 21 +++++++++++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index bb4601d0..d6208d44 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -231,12 +231,16 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    url_store = load_input_dict(args)
     func = find_feed_urls if args.feed else sitemap_search
+
+    url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
+    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index ca0e5b86..c21bdddb 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -8,12 +8,15 @@
 
 import logging
 import re
+
 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional
 
 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:
 
 
 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
 
@@ -194,6 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests to the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url
 
     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
 
@@ -222,7 +231,7 @@ def sitemap_search(
         ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+    while sitemap.sitemap_urls:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -231,6 +240,14 @@ def sitemap_search(
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]
 
+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+        else:
+            break
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
 

From a40ef04f7e90eda3f9974373b8ddd2866d51706a Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 8 Feb 2024 16:51:45 +0100
Subject: [PATCH 2/2] review CLI processing

---
 trafilatura/cli_utils.py | 12 +++++++-----
 trafilatura/feeds.py     |  5 ++++-
 trafilatura/sitemaps.py  |  6 ++----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index d6208d44..442a4a9c 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -231,20 +231,22 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    func = find_feed_urls if args.feed else sitemap_search
-
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
 
     config = use_config(filename=args.config_file)
-    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
-    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index eebc47fa..c63f1328 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re
 
 from itertools import islice
+from time import sleep
 from typing import List, Optional
 
 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
 
 
 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
 
@@ -227,6 +228,7 @@ def find_feed_urls(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@ def find_feed_urls(
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index c21bdddb..d17dd088 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -197,7 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
-        sleep_time: Wait between requests to the same website.
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -231,7 +231,7 @@ def sitemap_search(
         ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls:
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -242,8 +242,6 @@ def sitemap_search(
 
         if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
             sleep(sleep_time)
-        else:
-            break
 
     if urlfilter:
         sitemap.urls = filter_urls(sitemap.urls, urlfilter)
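
Note (not part of the patches above): a minimal sketch of how the reworked discovery functions could be called directly once both patches are applied, mirroring the partial() bundling introduced in cli_discovery(). The URL, target language and delay below are placeholder values, not taken from the patches.

# Illustrative sketch only; example.org, "de" and the 3-second delay are placeholders.
from functools import partial

from trafilatura.feeds import find_feed_urls
from trafilatura.sitemaps import sitemap_search

# bundle shared options once, as cli_discovery() now does with partial()
discover = partial(sitemap_search, target_lang="de", external=False, sleep_time=3)

# sitemap_search() sleeps between nested sitemap fetches on the same host
sitemap_links = discover("https://www.example.org/")

# find_feed_urls() accepts the same sleep_time keyword after the second patch
feed_links = find_feed_urls("https://www.example.org/", target_lang="de", sleep_time=3)

print(len(sitemap_links), len(feed_links))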