CLI: print URLs early for feeds and sitemaps with --list (#744)

* cli: also stream URL list gathered from feeds
* make streaming default and add threading.RLock

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
adbar · Nov 15, 2024 · dafbe6d · dafbe6d
1 parent 2274ceb
commit dafbe6d
Showing 1 changed file with 7 additions and 8 deletions.
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
@@ -20,6 +20,7 @@
 from datetime import datetime
 from functools import partial
 from os import makedirs, path, stat, walk
+from threading import RLock
 from typing import Any, Generator, Optional, List, Set, Tuple
 
 from courlan import UrlStore, extract_domain, get_base_url  # validate_url
@@ -301,6 +302,7 @@ def cli_discovery(args: Any) -> None:
         external=options.config.getboolean("DEFAULT", "EXTERNAL_URLS"),
         sleep_time=options.config.getfloat("DEFAULT", "SLEEP_TIME"),
     )
+    lock = RLock()
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
@@ -311,14 +313,11 @@ def cli_discovery(args: Any) -> None:
             if future.result() is not None:
                 url_store.add_urls(future.result())
                 # empty buffer in order to spare memory
-                if (
-                    args.sitemap
-                    and args.list
-                    and len(url_store.get_known_domains()) >= args.parallel
-                ):
-                    url_store.print_unvisited_urls()
-                    url_store.reset()
-                    reset_caches()
+                if args.list and len(url_store.get_known_domains()) >= args.parallel:
+                    with lock:
+                        url_store.print_unvisited_urls()
+                        url_store.reset()
+                        reset_caches()
 
     # process the (rest of the) links found
     error_caught = url_processing_pipeline(args, url_store)