sitemaps/CLI: add sleep and restore filter (#506)
* sitemaps: use safeguards

* review CLI processing
adbar authored Feb 9, 2024
1 parent c8f978d commit ca32cab
Showing 3 changed files with 29 additions and 5 deletions.
12 changes: 9 additions & 3 deletions trafilatura/cli_utils.py
@@ -232,15 +232,21 @@ def download_queue_processing(url_store, args, counter, config):
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
-    func = find_feed_urls if args.feed else sitemap_search
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )

     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
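
The refactoring above uses functools.partial to bind the discovery function and its keyword arguments once, so each worker thread only has to submit a URL; the SLEEP_TIME value is read from the [DEFAULT] section of the configuration file. A minimal sketch of the same pattern, with an illustrative stand-in function rather than trafilatura's own code:

from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial


def discover(url, target_lang=None, external=False, sleep_time=2.0):
    # stand-in for find_feed_urls / sitemap_search
    return [url]


# bind the shared settings once, then pass only the URL per task
func = partial(discover, target_lang="de", external=False, sleep_time=2.0)

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = (executor.submit(func, u) for u in ("https://example.org",))
    for future in as_completed(futures):
        print(future.result())
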
5 changes: 4 additions & 1 deletion trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re

 from itertools import islice
+from time import sleep
 from typing import List, Optional

 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:


 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
@@ -227,6 +228,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
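
With the new sleep_time parameter, callers of find_feed_urls can control how long the function waits before falling back to feed discovery on the homepage of the same site. A brief usage sketch (the URL and delay value are illustrative):

from trafilatura.feeds import find_feed_urls

# wait 5 seconds before retrying discovery on the homepage of the same website
links = find_feed_urls("https://example.org", target_lang="en", sleep_time=5)
print(len(links), "feed links found")
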
17 changes: 16 additions & 1 deletion trafilatura/sitemaps.py
@@ -8,12 +8,15 @@

 import logging
 import re

 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional

 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:


 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
@@ -194,6 +197,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []

+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url

     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)

@@ -231,6 +240,12 @@
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]

+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls

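Taken together, the sitemaps changes throttle sitemap fetching (a sleep between requests, bounded by MAX_SITEMAPS_SEEN) and restore the URL filter: when the start URL points below the domain root, it is reused as a filter on the collected links via courlan's filter_urls. A brief usage sketch (the URL and delay value are illustrative):

from trafilatura.sitemaps import sitemap_search

# a subpage start URL doubles as a link filter; pause 3 seconds between sitemap requests
links = sitemap_search("https://example.org/blog/", target_lang="en", sleep_time=3)
print(len(links), "sitemap links found")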
