sitemaps: simplify and improve search (#503)
* simplify and improve sitemap init

* better coverage

* fix len test

* shorter code
adbar authored Feb 8, 2024
1 parent 6cc2c69 commit c8f978d
Showing 2 changed files with 71 additions and 72 deletions.
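At the core of the change, SitemapObject no longer receives a single sitemap_url string: it is seeded with a list of sitemap URLs, tracks the sitemap currently being processed in current_url, and records every fetched sitemap in a seen set. A minimal sketch of the new initialization, distilled from the diffs below (the URLs are illustrative):

    from trafilatura import sitemaps

    # the third argument is now a list of starting sitemap URLs
    sitemap = sitemaps.SitemapObject(
        "https://www.sitemaps.org",                # base URL
        "sitemaps.org",                            # domain name
        ["https://www.sitemaps.org/sitemap.xml"],  # seed sitemap URLs
    )
    sitemap.current_url = sitemap.sitemap_urls.pop()  # sitemap being processed
    sitemap.fetch()  # downloads current_url and records it in sitemap.seen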
tests/sitemaps_tests.py (59 changes: 34 additions & 25 deletions)
@@ -8,8 +8,9 @@
 
 from courlan import get_hostinfo
 
+import trafilatura
 from trafilatura import sitemaps
-from trafilatura.utils import decode_response, is_similar_domain
+from trafilatura.utils import decode_file, is_similar_domain
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
@@ -29,21 +30,28 @@ def test_extraction():
     '''Test simple link extraction'''
     # link handling
     url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
+
+    # same URL
+    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
+    sitemap.current_url = url
+    sitemap.handle_link(url)
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain')
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain.wordpress.com/1')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['https://mydomain.wordpress.com/1']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['https://mydomain.wordpress.com/1']
 
-    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', 'https://programtalk.com/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', ['https://programtalk.com/sitemap.xml'])
     sitemap.handle_link('http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']
 
     # similar domain names
     assert not is_similar_domain('kleins-weindepot.de', 'eurosoft.net')
@@ -54,17 +62,17 @@ def test_extraction():
     url = 'https://de.sitemaps.org/1'
     sitemap_url = 'https://de.sitemaps.org/sitemap.xml'
     domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
     assert not sitemap.sitemap_urls and sitemap.urls == [url]
 
     # diverging domains
     url = 'https://www.software.info/1'
-    sitemap_url = 'https://example.org/sitemap.xml'
-    domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap_urls = ['https://example.org/sitemap.xml']
+    domain, baseurl = get_hostinfo(sitemap_urls[0])
+    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_urls)
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
 
     # don't take this one?
     #url = 'https://subdomain.sitemaps.org/1'
@@ -78,7 +86,7 @@ def test_extraction():
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap', '<!DOCTYPE html><html><body/></html>') is False
     # invalid
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = '<html>\n</html>'
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and not sitemap.urls
@@ -89,7 +97,7 @@ def test_extraction():
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
     assert sitemaps.is_plausible_sitemap('http://sitemaps.org/sitemap.xml', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and len(sitemap.urls) == 84
@@ -103,20 +111,20 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap2.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls
+    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap.xml', 'http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls
 
     # hreflang
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>http://www.test.org/english/page.html</loc></url></urlset>'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['http://www.test.org/english/page.html'])
     filepath = os.path.join(RESOURCES_DIR, 'sitemap-hreflang.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url, 'de')
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [], 'de')
     sitemap.content = teststring
     sitemap.extract_sitemap_langlinks()
     assert sitemap.sitemap_urls == ['http://www.example.com/sitemap-de.xml.gz']
@@ -127,26 +135,26 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml.gz')
     with open(filepath, 'rb') as f:
         teststring = f.read()
-    teststring = decode_response(teststring)
+    teststring = decode_file(teststring)
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert len(sitemap.sitemap_urls) == 0 and len(sitemap.urls) == 84
+    assert len(sitemap.sitemap_urls) == 1 and len(sitemap.urls) == 84
 
     # check contents
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz?value=1', teststring) is True
 
     # TXT links
     content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap', content) is True
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [])
     sitemap.content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/1', 'https://test.org/2'])
 
     # TXT links + language
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = 'Tralala\nhttps://test.org/en/1\nhttps://test.org/en/2\nhttps://test.org/es/3'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/en/1', 'https://test.org/en/2'])
@@ -164,6 +172,7 @@ def test_robotstxt():
 
 def test_whole():
     "Test whole process."
+    trafilatura.settings.MAX_SITEMAPS_SEEN = 1
     results = sitemaps.sitemap_search("https://www.sitemaps.org", target_lang="de")
     assert len(results) == 8
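The switch from asserting empty sitemap_urls to len(...) == 1 follows from the new handle_link() semantics: the safety check now compares against current_url (the sitemap currently being read) instead of a fixed constructor argument, so a link ending in .xml is queued as a nested sitemap unless it matches the URL being processed. A short offline sketch distilled from the tests above:

    from trafilatura import sitemaps

    sitemap = sitemaps.SitemapObject("https://www.sitemaps.org", "sitemaps.org", [])
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")
    assert len(sitemap.sitemap_urls) == 1  # queued as a nested sitemap

    # same URL as the one currently being processed: the safety check skips it
    sitemap.current_url = "https://www.sitemaps.org/sitemap.xml"
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")
    assert len(sitemap.sitemap_urls) == 1  # unchanged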
trafilatura/sitemaps.py (84 changes: 37 additions & 47 deletions)
@@ -9,12 +9,11 @@
 import logging
 import re
 from itertools import islice
-from typing import List, Optional
+from typing import List, Set, Optional
 
 from courlan import (
     clean_url,
     extract_domain,
-    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,
@@ -43,17 +42,24 @@
 SCRUB_REGEX = re.compile(r"\?.*$|#.*$")
 POTENTIAL_SITEMAP = re.compile(r"\.xml\b")  # |\bsitemap\b
 
-GUESSES = ["sitemap.xml.gz", "sitemap", "sitemap_index.xml", "sitemap_news.xml"]
+GUESSES = [
+    "sitemap.xml",
+    "sitemap.xml.gz",
+    "sitemap",
+    "sitemap_index.xml",
+    "sitemap_news.xml",
+]
 
 
 class SitemapObject:
     "Store all necessary information on sitemap download and processing."
     __slots__ = [
         "base_url",
         "content",
+        "current_url",
         "domain",
         "external",
-        "sitemap_url",
+        "seen",
         "sitemap_urls",
         "target_lang",
         "urls",
@@ -63,28 +69,30 @@ def __init__(
         self,
         base_url: str,
         domain: str,
-        sitemap_url: str,
+        sitemapsurls: List[str],
         target_lang: Optional[str] = None,
         external: bool = False,
     ) -> None:
         self.base_url: str = base_url
         self.content: str = ""
         self.domain: str = domain
         self.external: bool = external
-        self.sitemap_url: str = sitemap_url
-        self.sitemap_urls: List[str] = []
+        self.current_url: str = ""
+        self.seen: Set[str] = set()
+        self.sitemap_urls: List[str] = sitemapsurls
         self.target_lang: Optional[str] = target_lang
         self.urls: List[str] = []
 
     def fetch(self) -> None:
         "Fetch a sitemap over the network."
-        LOGGER.debug("fetching sitemap: %s", self.sitemap_url)
-        self.content = fetch_url(self.sitemap_url)
+        LOGGER.debug("fetching sitemap: %s", self.current_url)
+        self.content = fetch_url(self.current_url)
+        self.seen.add(self.current_url)
 
     def handle_link(self, link: str) -> None:
         """Examine a link and determine if it's valid and if it leads to
         a sitemap or a web page."""
-        if link == self.sitemap_url:  # safety check
+        if link == self.current_url:  # safety check
             return
         # fix, check, clean and normalize
         link = fix_relative_urls(self.base_url, link)
@@ -135,7 +143,7 @@ def extract_sitemap_langlinks(self) -> None:
             "%s sitemaps and %s links with hreflang found for %s",
             len(self.sitemap_urls),
             len(self.urls),
-            self.sitemap_url,
+            self.current_url,
         )
 
     def extract_sitemap_links(self) -> None:
@@ -150,12 +158,12 @@ def extract_sitemap_links(self) -> None:
             "%s sitemaps and %s links found for %s",
             len(self.sitemap_urls),
             len(self.urls),
-            self.sitemap_url,
+            self.current_url,
         )
 
     def process(self) -> None:
         "Download a sitemap and extract the links it contains."
-        plausible = is_plausible_sitemap(self.sitemap_url, self.content)
+        plausible = is_plausible_sitemap(self.current_url, self.content)
         # safeguard
         if not plausible:
             return
@@ -200,47 +208,29 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []
 
-    urlfilter = None
     if url.endswith((".gz", "sitemap", ".xml")):
-        sitemapurl = url
+        sitemapurls = [url]
     else:
-        sitemapurl = baseurl + "/sitemap.xml"
-        # filter triggered, prepare it
-        if len(url) > len(baseurl) + 2:
-            urlfilter = url
+        sitemapurls = []
 
-    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
-    sitemap.fetch()
-    sitemap.process()
-
-    if not sitemap.sitemap_urls and sitemap.urls:
-        linklist = filter_urls(sitemap.urls, urlfilter)
-        LOGGER.debug("%s sitemap links found for %s", len(linklist), domainname)
-        return linklist
-
-    # try sitemaps in robots.txt file if nothing has been found
-    if not sitemap.sitemap_urls and not sitemap.urls:
-        sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
-    # try additional URLs just in case
-    if not sitemap.sitemap_urls:
-        sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
+    sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
+
+    # try sitemaps in robots.txt file, additional URLs just in case
+    if not sitemap.sitemap_urls:
+        sitemap.sitemap_urls = find_robots_sitemaps(baseurl) or [
+            f"{baseurl}/{g}" for g in GUESSES
+        ]
 
     # iterate through nested sitemaps and results
-    seen = {sitemapurl}
-    i = 1
-    while sitemap.sitemap_urls:
-        sitemap.sitemap_url = sitemap.sitemap_urls.pop()
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+        sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
-        # sanity check: keep track of visited sitemaps and exclude them
-        seen.add(sitemap.sitemap_url)
-        sitemap.sitemap_urls = [s for s in sitemap.sitemap_urls if s not in seen]
-        # counter and safeguard
-        i += 1
-        if i > MAX_SITEMAPS_SEEN:
-            break
-
-    sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+        sitemap.sitemap_urls = [
+            s for s in sitemap.sitemap_urls if s not in sitemap.seen
+        ]
 
     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
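The rewritten search loop lets the seen set do double duty: it prevents cycles between nested sitemap indexes and, combined with MAX_SITEMAPS_SEEN, bounds the total number of fetches, replacing the previous manual counter. An offline illustration of that bookkeeping (example.org stands in for a real host; the download normally happens inside fetch()):

    from trafilatura.sitemaps import SitemapObject

    sitemap = SitemapObject(
        "https://example.org", "example.org", ["https://example.org/sitemap.xml"]
    )
    sitemap.current_url = sitemap.sitemap_urls.pop()
    sitemap.seen.add(sitemap.current_url)  # fetch() does this after downloading
    # already-visited sitemaps are filtered out of the queue on every pass
    sitemap.sitemap_urls = [s for s in sitemap.sitemap_urls if s not in sitemap.seen]
    assert not sitemap.sitemap_urls
    assert "https://example.org/sitemap.xml" in sitemap.seen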
