From d31c8d74b08785c7e297d36449fd8869b49b6f1f Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 15 Dec 2023 16:05:05 +0100 Subject: [PATCH] improve feed detection (#457) --- tests/feeds_tests.py | 19 +++++++++++++++++++ trafilatura/feeds.py | 23 ++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/tests/feeds_tests.py b/tests/feeds_tests.py index 13df14e2..cea4b242 100644 --- a/tests/feeds_tests.py +++ b/tests/feeds_tests.py @@ -220,6 +220,7 @@ def test_feeds_helpers(): ) == 1 ) + # no comments wanted assert ( len( @@ -254,6 +255,24 @@ def test_feeds_helpers(): assert determine_feed( '', params ) == ["https://example.org/rss"] + assert determine_feed( + '', + params, + ) == ["https://example.org/feeds/posts/default/"] + assert ( + len( + determine_feed( + '', + params, + ) + ) + == 1 + ) + assert determine_feed( + '', + params, + ) == ["https://example.org/?feed=rss"] + # feed discovery assert not find_feed_urls("http://") assert not find_feed_urls("https://httpbun.com/status/404") diff --git a/trafilatura/feeds.py b/trafilatura/feeds.py index 303403cd..eebc47fa 100644 --- a/trafilatura/feeds.py +++ b/trafilatura/feeds.py @@ -27,16 +27,26 @@ LOGGER = logging.getLogger(__name__) +# https://www.iana.org/assignments/media-types/media-types.xhtml +# standard + potential types FEED_TYPES = { + "application/atom", # not IANA-compatible "application/atom+xml", + "application/feed+json", # not IANA-compatible "application/json", + "application/rdf", # not IANA-compatible "application/rdf+xml", + "application/rss", # not IANA-compatible "application/rss+xml", - "application/x.atom+xml", - "application/x-atom+xml", + "application/x.atom+xml", # not IANA-compatible + "application/x-atom+xml", # not IANA-compatible + "application/xml", + "text/atom", # not IANA-compatible "text/atom+xml", "text/plain", + "text/rdf", # not IANA-compatible "text/rdf+xml", + "text/rss", # not IANA-compatible "text/rss+xml", "text/xml", } @@ -51,7 +61,14 @@ BLACKLIST = re.compile(r"\bcomments\b") # no comment feed -LINK_VALIDATION_RE = re.compile(r"\.(?:atom|rdf|rss|xml)$|\b(?:atom|rss)\b") +LINK_VALIDATION_RE = re.compile( + r"\.(?:atom|rdf|rss|xml)$|" + r"\b(?:atom|rss)\b|" + r"\?type=100$|" # Typo3 + r"feeds/posts/default/?$|" # Blogger + r"\?feed=(?:atom|rdf|rss|rss2)|" + r"feed$" # Generic +) class FeedParameters: