diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index bb4601d0..442a4a9c 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -232,15 +232,21 @@ def download_queue_processing(url_store, args, counter, config):

 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
-    func = find_feed_urls if args.feed else sitemap_search
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index eebc47fa..c63f1328 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re

 from itertools import islice
+from time import sleep
 from typing import List, Optional

 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:


 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.

@@ -227,6 +228,7 @@ def find_feed_urls(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs (boolean,
             defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@ def find_feed_urls(
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index ca0e5b86..d17dd088 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -8,12 +8,15 @@

 import logging
 import re
+
 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional

 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:


 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.

@@ -194,6 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs (boolean,
             defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []

+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url

     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)

@@ -231,6 +240,12 @@ def sitemap_search(
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]

+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
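Usage note (not part of the patch): with this change, both discovery helpers expose the courtesy delay directly. A minimal sketch, assuming an illustrative URL and a 5-second delay instead of the 2-second default:

    from trafilatura.feeds import find_feed_urls
    from trafilatura.sitemaps import sitemap_search

    # wait 5 seconds between requests to the same website
    feed_links = find_feed_urls("https://www.example.org", target_lang="en", sleep_time=5)
    sitemap_links = sitemap_search("https://www.example.org", target_lang="en", sleep_time=5)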