Skip to content

Commit

Permalink
sitemaps: use safeguards
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Feb 8, 2024
1 parent c8f978d commit 0ad353d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 4 deletions.
8 changes: 6 additions & 2 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,16 @@ def download_queue_processing(url_store, args, counter, config):

def cli_discovery(args):
"Group CLI functions dedicated to URL discovery."
url_store = load_input_dict(args)
func = find_feed_urls if args.feed else sitemap_search

url_store = load_input_dict(args)
input_urls = url_store.dump_urls()
if args.list:
url_store.reset()
ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')

config = use_config(filename=args.config_file)
ext = config.getboolean('DEFAULT', 'EXTERNAL_URLS')
# sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')

# link discovery and storage
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
Expand Down
21 changes: 19 additions & 2 deletions trafilatura/sitemaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@

import logging
import re

from itertools import islice
from time import sleep
from typing import List, Set, Optional

from courlan import (
clean_url,
extract_domain,
filter_urls,
fix_relative_urls,
get_hostinfo,
lang_filter,
Expand Down Expand Up @@ -183,7 +186,7 @@ def process(self) -> None:


def sitemap_search(
url: str, target_lang: Optional[str] = None, external: bool = False
url: str, target_lang: Optional[str] = None, external: bool = False, sleep_time: int = 2
) -> List[str]:
"""Look for sitemaps for the given URL and gather links.
Expand All @@ -194,6 +197,7 @@ def sitemap_search(
(two-letter string, ISO 639-1 format).
external: Similar hosts only or external URLs
(boolean, defaults to False).
sleep_time: Wait between requests to the same website
(number of seconds, defaults to 2).
Returns:
The extracted links as a list (sorted list of unique links).
Expand All @@ -208,10 +212,15 @@ def sitemap_search(
LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
return []

urlfilter = None

if url.endswith((".gz", "sitemap", ".xml")):
sitemapurls = [url]
else:
sitemapurls = []
# set url filter to target subpages
if len(url) > len(baseurl) + 2:
urlfilter = url

sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)

Expand All @@ -222,7 +231,7 @@ def sitemap_search(
]

# iterate through nested sitemaps and results
while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
while sitemap.sitemap_urls:
sitemap.current_url = sitemap.sitemap_urls.pop()
sitemap.fetch()
sitemap.process()
Expand All @@ -231,6 +240,14 @@ def sitemap_search(
s for s in sitemap.sitemap_urls if s not in sitemap.seen
]

if len(sitemap.seen) < MAX_SITEMAPS_SEEN:
sleep(sleep_time)
else:
break

Check warning on line 246 in trafilatura/sitemaps.py

View check run for this annotation

Codecov / codecov/patch

trafilatura/sitemaps.py#L246

Added line #L246 was not covered by tests

if urlfilter:
sitemap.urls = filter_urls(sitemap.urls, urlfilter)

LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
return sitemap.urls

Expand Down

0 comments on commit 0ad353d

Please sign in to comment.