sitemaps: simplify and improve search (#503)
* simplify and improve sitemap init

* better coverage

* fix len test

* shorter code
adbar authored Feb 8, 2024
1 parent 6cc2c69 commit c8f978d
Showing 2 changed files with 71 additions and 72 deletions.
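At the core of the change, SitemapObject no longer receives a single sitemap_url string: it is seeded with a list of sitemap URLs, tracks the sitemap currently being processed in current_url, and records every fetched sitemap in a seen set. A minimal sketch of the new initialization, distilled from the diffs below (the URLs are illustrative):

    from trafilatura import sitemaps

    # the third argument is now a list of starting sitemap URLs
    sitemap = sitemaps.SitemapObject(
        "https://www.sitemaps.org",                # base URL
        "sitemaps.org",                            # domain name
        ["https://www.sitemaps.org/sitemap.xml"],  # seed sitemap URLs
    )
    sitemap.current_url = sitemap.sitemap_urls.pop()  # sitemap being processed
    sitemap.fetch()  # downloads current_url and records it in sitemap.seen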
tests/sitemaps_tests.py (59 changes: 34 additions & 25 deletions)
@@ -8,8 +8,9 @@
 
 from courlan import get_hostinfo
 
+import trafilatura
 from trafilatura import sitemaps
-from trafilatura.utils import decode_response, is_similar_domain
+from trafilatura.utils import decode_file, is_similar_domain
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
@@ -29,21 +30,28 @@ def test_extraction():
     '''Test simple link extraction'''
     # link handling
     url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
+
+    # same URL
+    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
+    sitemap.current_url = url
+    sitemap.handle_link(url)
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain')
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain.wordpress.com/1')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['https://mydomain.wordpress.com/1']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['https://mydomain.wordpress.com/1']
 
-    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', 'https://programtalk.com/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', ['https://programtalk.com/sitemap.xml'])
     sitemap.handle_link('http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']
 
     # similar domain names
     assert not is_similar_domain('kleins-weindepot.de', 'eurosoft.net')
@@ -54,17 +62,17 @@ def test_extraction():
     url = 'https://de.sitemaps.org/1'
     sitemap_url = 'https://de.sitemaps.org/sitemap.xml'
     domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
     assert not sitemap.sitemap_urls and sitemap.urls == [url]
 
     # diverging domains
     url = 'https://www.software.info/1'
-    sitemap_url = 'https://example.org/sitemap.xml'
-    domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap_urls = ['https://example.org/sitemap.xml']
+    domain, baseurl = get_hostinfo(sitemap_urls[0])
+    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_urls)
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
     # don't take this one?
     #url = 'https://subdomain.sitemaps.org/1'
@@ -78,7 +86,7 @@ def test_extraction():
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap', '<!DOCTYPE html><html><body/></html>') is False
     # invalid
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = '<html>\n</html>'
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and not sitemap.urls
@@ -89,7 +97,7 @@ def test_extraction():
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
     assert sitemaps.is_plausible_sitemap('http://sitemaps.org/sitemap.xml', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and len(sitemap.urls) == 84
@@ -103,20 +111,20 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap2.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls
+    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap.xml', 'http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls
 
     # hreflang
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>http://www.test.org/english/page.html</loc></url></urlset>'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['http://www.test.org/english/page.html'])
     filepath = os.path.join(RESOURCES_DIR, 'sitemap-hreflang.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url, 'de')
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [], 'de')
     sitemap.content = teststring
     sitemap.extract_sitemap_langlinks()
     assert sitemap.sitemap_urls == ['http://www.example.com/sitemap-de.xml.gz']
@@ -127,26 +135,26 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml.gz')
     with open(filepath, 'rb') as f:
         teststring = f.read()
-    teststring = decode_response(teststring)
+    teststring = decode_file(teststring)
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert len(sitemap.sitemap_urls) == 0 and len(sitemap.urls) == 84
+    assert len(sitemap.sitemap_urls) == 1 and len(sitemap.urls) == 84
 
     # check contents
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz?value=1', teststring) is True
 
     # TXT links
     content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap', content) is True
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [])
     sitemap.content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/1', 'https://test.org/2'])
 
     # TXT links + language
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = 'Tralala\nhttps://test.org/en/1\nhttps://test.org/en/2\nhttps://test.org/es/3'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/en/1', 'https://test.org/en/2'])
@@ -164,6 +172,7 @@ def test_robotstxt():
 
 def test_whole():
     "Test whole process."
+    trafilatura.settings.MAX_SITEMAPS_SEEN = 1
     results = sitemaps.sitemap_search("https://www.sitemaps.org", target_lang="de")
     assert len(results) == 8
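The switch from asserting empty sitemap_urls to len(...) == 1 follows from the new handle_link() semantics: the safety check now compares against current_url (the sitemap currently being read) instead of a fixed constructor argument, so a link ending in .xml is queued as a nested sitemap unless it matches the URL being processed. A short offline sketch distilled from the tests above:

    from trafilatura import sitemaps

    sitemap = sitemaps.SitemapObject("https://www.sitemaps.org", "sitemaps.org", [])
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")
    assert len(sitemap.sitemap_urls) == 1  # queued as a nested sitemap

    # same URL as the one currently being processed: the safety check skips it
    sitemap.current_url = "https://www.sitemaps.org/sitemap.xml"
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")
    assert len(sitemap.sitemap_urls) == 1  # unchanged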
trafilatura/sitemaps.py (84 changes: 37 additions & 47 deletions)
@@ -9,12 +9,11 @@
 import logging
 import re
 from itertools import islice
-from typing import List, Optional
+from typing import List, Set, Optional
 
 from courlan import (
     clean_url,
     extract_domain,
-    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -43,17 +42,24 @@
 SCRUB_REGEX = re.compile(r"\?.*$|#.*$")
 POTENTIAL_SITEMAP = re.compile(r"\.xml\b")  # |\bsitemap\b
 
-GUESSES = ["sitemap.xml.gz", "sitemap", "sitemap_index.xml", "sitemap_news.xml"]
+GUESSES = [
+    "sitemap.xml",
+    "sitemap.xml.gz",
+    "sitemap",
+    "sitemap_index.xml",
+    "sitemap_news.xml",
+]
 
 
 class SitemapObject:
     "Store all necessary information on sitemap download and processing."
     __slots__ = [
         "base_url",
         "content",
+        "current_url",
         "domain",
         "external",
-        "sitemap_url",
+        "seen",
         "sitemap_urls",
         "target_lang",
         "urls",
@@ -63,28 +69,30 @@ def __init__(
         self,
         base_url: str,
         domain: str,
-        sitemap_url: str,
+        sitemapsurls: List[str],
         target_lang: Optional[str] = None,
         external: bool = False,
     ) -> None:
         self.base_url: str = base_url
         self.content: str = ""
         self.domain: str = domain
         self.external: bool = external
-        self.sitemap_url: str = sitemap_url
-        self.sitemap_urls: List[str] = []
+        self.current_url: str = ""
+        self.seen: Set[str] = set()
+        self.sitemap_urls: List[str] = sitemapsurls
         self.target_lang: Optional[str] = target_lang
         self.urls: List[str] = []
 
     def fetch(self) -> None:
         "Fetch a sitemap over the network."
-        LOGGER.debug("fetching sitemap: %s", self.sitemap_url)
-        self.content = fetch_url(self.sitemap_url)
+        LOGGER.debug("fetching sitemap: %s", self.current_url)
+        self.content = fetch_url(self.current_url)
+        self.seen.add(self.current_url)
 
     def handle_link(self, link: str) -> None:
         """Examine a link and determine if it's valid and if it leads to
         a sitemap or a web page."""
-        if link == self.sitemap_url:  # safety check
+        if link == self.current_url:  # safety check
             return
         # fix, check, clean and normalize
         link = fix_relative_urls(self.base_url, link)
@@ -135,7 +143,7 @@ def extract_sitemap_langlinks(self) -> None:
             "%s sitemaps and %s links with hreflang found for %s",
             len(self.sitemap_urls),
             len(self.urls),
-            self.sitemap_url,
+            self.current_url,
         )
 
     def extract_sitemap_links(self) -> None:
@@ -150,12 +158,12 @@ def extract_sitemap_links(self) -> None:
             "%s sitemaps and %s links found for %s",
             len(self.sitemap_urls),
             len(self.urls),
-            self.sitemap_url,
+            self.current_url,
         )
 
     def process(self) -> None:
         "Download a sitemap and extract the links it contains."
-        plausible = is_plausible_sitemap(self.sitemap_url, self.content)
+        plausible = is_plausible_sitemap(self.current_url, self.content)
         # safeguard
         if not plausible:
             return
@@ -200,47 +208,29 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
-    urlfilter = None
     if url.endswith((".gz", "sitemap", ".xml")):
-        sitemapurl = url
+        sitemapurls = [url]
     else:
-        sitemapurl = baseurl + "/sitemap.xml"
-        # filter triggered, prepare it
-        if len(url) > len(baseurl) + 2:
-            urlfilter = url
+        sitemapurls = []
 
-    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
-    sitemap.fetch()
-    sitemap.process()
-
-    if not sitemap.sitemap_urls and sitemap.urls:
-        linklist = filter_urls(sitemap.urls, urlfilter)
-        LOGGER.debug("%s sitemap links found for %s", len(linklist), domainname)
-        return linklist
-
-    # try sitemaps in robots.txt file if nothing has been found
-    if not sitemap.sitemap_urls and not sitemap.urls:
-        sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
-    # try additional URLs just in case
-    if not sitemap.sitemap_urls:
-        sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
+    sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
+
+    # try sitemaps in robots.txt file, additional URLs just in case
+    if not sitemap.sitemap_urls:
+        sitemap.sitemap_urls = find_robots_sitemaps(baseurl) or [
+            f"{baseurl}/{g}" for g in GUESSES
+        ]
 
     # iterate through nested sitemaps and results
-    seen = {sitemapurl}
-    i = 1
-    while sitemap.sitemap_urls:
-        sitemap.sitemap_url = sitemap.sitemap_urls.pop()
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+        sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
-        # sanity check: keep track of visited sitemaps and exclude them
-        seen.add(sitemap.sitemap_url)
-        sitemap.sitemap_urls = [s for s in sitemap.sitemap_urls if s not in seen]
-        # counter and safeguard
-        i += 1
-        if i > MAX_SITEMAPS_SEEN:
-            break
-
-    sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+        sitemap.sitemap_urls = [
+            s for s in sitemap.sitemap_urls if s not in sitemap.seen
+        ]
 
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
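The rewritten search loop lets the seen set do double duty: it prevents cycles between nested sitemap indexes and, combined with MAX_SITEMAPS_SEEN, bounds the total number of fetches, replacing the previous manual counter. An offline illustration of that bookkeeping (example.org stands in for a real host; the download normally happens inside fetch()):

    from trafilatura.sitemaps import SitemapObject

    sitemap = SitemapObject(
        "https://example.org", "example.org", ["https://example.org/sitemap.xml"]
    )
    sitemap.current_url = sitemap.sitemap_urls.pop()
    sitemap.seen.add(sitemap.current_url)  # fetch() does this after downloading
    # already-visited sitemaps are filtered out of the queue on every pass
    sitemap.sitemap_urls = [s for s in sitemap.sitemap_urls if s not in sitemap.seen]
    assert not sitemap.sitemap_urls
    assert "https://example.org/sitemap.xml" in sitemap.seen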
