diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index bb4601d0..d6208d44 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -231,12 +231,16 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    url_store = load_input_dict(args)
     func = find_feed_urls if args.feed else sitemap_search
+
+    url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
+    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index ca0e5b86..c21bdddb 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -8,12 +8,15 @@
 
 import logging
 import re
+
 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional
 
 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:
 
 
 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
 
@@ -194,6 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests to the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url
 
     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
 
@@ -222,7 +231,7 @@ def sitemap_search(
         ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+    while sitemap.sitemap_urls:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -231,6 +240,14 @@ def sitemap_search(
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]
+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+        else:
+            break
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
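
For reference, a minimal usage sketch of the patched function, not part of the diff: the call signature follows the hunks above, while the URL and parameter values are illustrative assumptions.

    # Illustrative sketch only; URL and values are made up.
    from trafilatura.sitemaps import sitemap_search

    # A URL deeper than the homepage triggers the new subpage filter,
    # and sleep_time throttles successive sitemap fetches on the same host.
    links = sitemap_search("https://www.example.org/blog/", target_lang="en", sleep_time=1)
    print(len(links), "links")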