Skip to content

Commit

Permalink
CLI: print URLs early for feeds and sitemaps with --list (#744)
Browse files: browse the repository at this point in the history
* cli: also stream URL list gathered from feeds

* make streaming default and add threading.RLock

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
gremid and adbar authored Nov 15, 2024
1 parent 2274ceb commit dafbe6d
Showing 1 changed file with 7 additions and 8 deletions.
15 changes: 7 additions & 8 deletions trafilatura/cli_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,6 +20,7 @@
from datetime import datetime
from functools import partial
from os import makedirs, path, stat, walk
from threading import RLock
from typing import Any, Generator, Optional, List, Set, Tuple

from courlan import UrlStore, extract_domain, get_base_url # validate_url
Expand Down Expand Up @@ -301,6 +302,7 @@ def cli_discovery(args: Any) -> None:
external=options.config.getboolean("DEFAULT", "EXTERNAL_URLS"),
sleep_time=options.config.getfloat("DEFAULT", "SLEEP_TIME"),
)
lock = RLock()

# link discovery and storage
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
Expand All @@ -311,14 +313,11 @@ def cli_discovery(args: Any) -> None:
if future.result() is not None:
url_store.add_urls(future.result())
# empty buffer in order to spare memory
if (
args.sitemap
and args.list
and len(url_store.get_known_domains()) >= args.parallel
):
url_store.print_unvisited_urls()
url_store.reset()
reset_caches()
if args.list and len(url_store.get_known_domains()) >= args.parallel:
with lock:
url_store.print_unvisited_urls()
url_store.reset()
reset_caches()

# process the (rest of the) links found
error_caught = url_processing_pipeline(args, url_store)
Expand Down

0 comments on commit dafbe6d

Please sign in to comment.