Skip to content

Commit

Permalink
improve feed detection (#457)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Dec 15, 2023
1 parent b341138 commit d31c8d7
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
19 changes: 19 additions & 0 deletions tests/feeds_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def test_feeds_helpers():
)
== 1
)

# no comments wanted
assert (
len(
Expand Down Expand Up @@ -254,6 +255,24 @@ def test_feeds_helpers():
assert determine_feed(
'<html><body><a href="https://example.org/rss"><body/></html>', params
) == ["https://example.org/rss"]
assert determine_feed(
'<html><body><a href="https://example.org/feeds/posts/default/"><body/></html>',
params,
) == ["https://example.org/feeds/posts/default/"]
assert (
len(
determine_feed(
'<html><body><a href="https://www.test.org/cat/?feed=rss" /><body/></html>',
params,
)
)
== 1
)
assert determine_feed(
'<html><body><a href="?feed=rss" /><body/></html>',
params,
) == ["https://example.org/?feed=rss"]

# feed discovery
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.com/status/404")
Expand Down
23 changes: 20 additions & 3 deletions trafilatura/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,26 @@

LOGGER = logging.getLogger(__name__)

# Media (MIME) type strings accepted as possible feed types.
# Official registry: https://www.iana.org/assignments/media-types/media-types.xhtml
# The set mixes standard types with potential non-standard variants; entries
# tagged "not IANA-compatible" are the ones flagged here as non-standard.
# NOTE(review): presumably compared against the declared type of candidate
# links — confirm against the code that reads this set.
FEED_TYPES = {
"application/atom", # not IANA-compatible
"application/atom+xml",
"application/feed+json", # not IANA-compatible
"application/json",
"application/rdf", # not IANA-compatible
"application/rdf+xml",
"application/rss", # not IANA-compatible
"application/rss+xml",
# NOTE(review): the two x-atom entries are listed twice below (bare, then
# annotated); duplicates collapse in a set literal, so membership is unaffected.
"application/x.atom+xml",
"application/x-atom+xml",
"application/x.atom+xml", # not IANA-compatible
"application/x-atom+xml", # not IANA-compatible
"application/xml",
"text/atom", # not IANA-compatible
"text/atom+xml",
"text/plain",
"text/rdf", # not IANA-compatible
"text/rdf+xml",
"text/rss", # not IANA-compatible
"text/rss+xml",
"text/xml",
}
Expand All @@ -51,7 +61,14 @@

BLACKLIST = re.compile(r"\bcomments\b") # no comment feed

LINK_VALIDATION_RE = re.compile(r"\.(?:atom|rdf|rss|xml)$|\b(?:atom|rss)\b")
# Pattern recognising feed-like link targets; matching any single alternative
# is sufficient. The string pieces are concatenated into one regex.
# NOTE(review): presumably applied with .search() to candidate hrefs — confirm
# against the caller.
LINK_VALIDATION_RE = re.compile(
r"\.(?:atom|rdf|rss|xml)$|"  # ends with a feed-like file extension
r"\b(?:atom|rss)\b|"  # "atom" or "rss" as a whole word, anywhere in the string
r"\?type=100$|" # Typo3: ends with the query string ?type=100
r"feeds/posts/default/?$|" # Blogger: default posts feed, trailing slash optional
r"\?feed=(?:atom|rdf|rss|rss2)|"  # "?feed=<format>"; unanchored, needs a literal "?" right before "feed="
r"feed$" # Generic: anything ending in "feed"
)


class FeedParameters:
Expand Down

0 comments on commit d31c8d7

Please sign in to comment.