From 0ad353d62a7ce0ba2177e38401cc2a222bdfd7f1 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 8 Feb 2024 15:03:21 +0100
Subject: [PATCH 1/2] sitemaps: use safeguards

---
 trafilatura/cli_utils.py |  8 ++++++--
 trafilatura/sitemaps.py  | 21 +++++++++++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index bb4601d0..d6208d44 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -231,12 +231,16 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    url_store = load_input_dict(args)
     func = find_feed_urls if args.feed else sitemap_search
+
+    url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
+    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index ca0e5b86..c21bdddb 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -8,12 +8,15 @@
 
 import logging
 import re
+
 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional
 
 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:
 
 
 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
 
@@ -194,6 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests to the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url
 
     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
 
@@ -222,7 +231,7 @@ def sitemap_search(
         ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+    while sitemap.sitemap_urls:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -231,6 +240,14 @@ def sitemap_search(
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]
 
+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+        else:
+            break
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
 

From a40ef04f7e90eda3f9974373b8ddd2866d51706a Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 8 Feb 2024 16:51:45 +0100
Subject: [PATCH 2/2] review CLI processing

---
 trafilatura/cli_utils.py | 12 +++++++-----
 trafilatura/feeds.py     |  5 ++++-
 trafilatura/sitemaps.py  |  6 ++----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index d6208d44..442a4a9c 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -231,20 +231,22 @@ def download_queue_processing(url_store, args, counter, config):
 
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
-    func = find_feed_urls if args.feed else sitemap_search
-
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
 
     config = use_config(filename=args.config_file)
-    ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
-    # sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py
index eebc47fa..c63f1328 100644
--- a/trafilatura/feeds.py
+++ b/trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re
 
 from itertools import islice
+from time import sleep
 from typing import List, Optional
 
 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
 
 
 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
 
@@ -227,6 +228,7 @@ def find_feed_urls(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@ def find_feed_urls(
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
diff --git a/trafilatura/sitemaps.py b/trafilatura/sitemaps.py
index c21bdddb..d17dd088 100644
--- a/trafilatura/sitemaps.py
+++ b/trafilatura/sitemaps.py
@@ -197,7 +197,7 @@ def sitemap_search(
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
-        sleep_time: Wait between requests to the same website.
+        sleep_time: Wait between requests on the same website.
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -231,7 +231,7 @@ def sitemap_search(
         ]
 
     # iterate through nested sitemaps and results
-    while sitemap.sitemap_urls:
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
         sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
@@ -242,8 +242,6 @@ def sitemap_search(
 
         if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
             sleep(sleep_time)
-        else:
-            break
 
     if urlfilter:
         sitemap.urls = filter_urls(sitemap.urls, urlfilter)
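
Note (not part of the patches above): a minimal sketch of how the reworked discovery functions could be called directly once both patches are applied, mirroring the partial() bundling introduced in cli_discovery(). The URL, target language and delay below are placeholder values, not taken from the patches.

# Illustrative sketch only; example.org, "de" and the 3-second delay are placeholders.
from functools import partial

from trafilatura.feeds import find_feed_urls
from trafilatura.sitemaps import sitemap_search

# bundle shared options once, as cli_discovery() now does with partial()
discover = partial(sitemap_search, target_lang="de", external=False, sleep_time=3)

# sitemap_search() sleeps between nested sitemap fetches on the same host
sitemap_links = discover("https://www.example.org/")

# find_feed_urls() accepts the same sleep_time keyword after the second patch
feed_links = find_feed_urls("https://www.example.org/", target_lang="de", sleep_time=3)

print(len(sitemap_links), len(feed_links))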