sitemaps/CLI: add sleep and restore filter (#506)
* sitemaps: use safeguards

* review CLI processing
adbar authored Feb 9, 2024
1 parent c8f978d commit ca32cab
Showing 3 changed files with 29 additions and 5 deletions.
12 changes: 9 additions & 3 deletions trafilatura/cli_utils.py
@@ -232,15 +232,21 @@ def download_queue_processing(url_store, args, counter, config):
 def cli_discovery(args):
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
-    func = find_feed_urls if args.feed else sitemap_search
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
-    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
+
+    config = use_config(filename=args.config_file)
+    func = partial(
+        find_feed_urls if args.feed else sitemap_search,
+        target_lang=args.target_language,
+        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+    )

     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
+        futures = (executor.submit(func, url) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
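
The refactoring above uses functools.partial to bind the discovery function and its keyword arguments once, so each worker thread only has to submit a URL; the SLEEP_TIME value is read from the [DEFAULT] section of the configuration file. A minimal sketch of the same pattern, with an illustrative stand-in function rather than trafilatura's own code:

from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial


def discover(url, target_lang=None, external=False, sleep_time=2.0):
    # stand-in for find_feed_urls / sitemap_search
    return [url]


# bind the shared settings once, then pass only the URL per task
func = partial(discover, target_lang="de", external=False, sleep_time=2.0)

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = (executor.submit(func, u) for u in ("https://example.org",))
    for future in as_completed(futures):
        print(future.result())
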
5 changes: 4 additions & 1 deletion trafilatura/feeds.py
@@ -10,6 +10,7 @@
 import re

 from itertools import islice
+from time import sleep
 from typing import List, Optional

 from courlan import (
@@ -216,7 +217,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:


 def find_feed_urls(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2,
 ) -> List[str]:
     """Try to find feed URLs.
@@ -227,6 +228,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -259,6 +261,7 @@
     else:
         LOGGER.error("Could not download web page: %s", url)
         if url.strip("/") != baseurl:
+            sleep(sleep_time)
             return try_homepage(baseurl, target_lang)
     # try alternative: Google News
     if target_lang is not None:
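
With the new sleep_time parameter, callers of find_feed_urls can control how long the function waits before falling back to feed discovery on the homepage of the same site. A brief usage sketch (the URL and delay value are illustrative):

from trafilatura.feeds import find_feed_urls

# wait 5 seconds before retrying discovery on the homepage of the same website
links = find_feed_urls("https://example.org", target_lang="en", sleep_time=5)
print(len(links), "feed links found")
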
17 changes: 16 additions & 1 deletion trafilatura/sitemaps.py
@@ -8,12 +8,15 @@

 import logging
 import re

 from itertools import islice
+from time import sleep
 from typing import List, Set, Optional

 from courlan import (
     clean_url,
     extract_domain,
+    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -183,7 +186,7 @@ def process(self) -> None:


 def sitemap_search(
-    url: str, target_lang: Optional[str] = None, external: bool = False
+    url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
 ) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
@@ -194,6 +197,7 @@
             (two-letter string, ISO 639-1 format).
         external: Similar hosts only or external URLs
             (boolean, defaults to False).
+        sleep_time: Wait between requests on the same website.

     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -208,10 +212,15 @@
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []

+    urlfilter = None
+
     if url.endswith((".gz", "sitemap", ".xml")):
         sitemapurls = [url]
     else:
         sitemapurls = []
+        # set url filter to target subpages
+        if len(url) > len(baseurl) + 2:
+            urlfilter = url

     sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)

@@ -231,6 +240,12 @@
             s for s in sitemap.sitemap_urls if s not in sitemap.seen
         ]

+        if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+            sleep(sleep_time)
+
+    if urlfilter:
+        sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls

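Taken together, the sitemaps changes throttle sitemap fetching (a sleep between requests, bounded by MAX_SITEMAPS_SEEN) and restore the URL filter: when the start URL points below the domain root, it is reused as a filter on the collected links via courlan's filter_urls. A brief usage sketch (the URL and delay value are illustrative):

from trafilatura.sitemaps import sitemap_search

# a subpage start URL doubles as a link filter; pause 3 seconds between sitemap requests
links = sitemap_search("https://example.org/blog/", target_lang="en", sleep_time=3)
print(len(links), "sitemap links found")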
