From 27bb0137636d1d96a7b339ff72a34af01047264d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 13 Feb 2024 11:52:43 +0100 Subject: [PATCH] LXML: compile most XPath expressions (#504) * LXML: compile XPath expressions * compile all XPath expressions --- trafilatura/core.py | 10 ++++---- trafilatura/htmlprocessing.py | 6 ++--- trafilatura/metadata.py | 4 ++-- trafilatura/metaxpaths.py | 22 +++++++++-------- trafilatura/settings.py | 4 +++- trafilatura/xpaths.py | 45 +++++++++++++++++++---------------- 6 files changed, 49 insertions(+), 42 deletions(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index 38a83998..492debc2 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -13,7 +13,7 @@ import warnings from copy import deepcopy -from lxml.etree import Element, SubElement, strip_elements, strip_tags +from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags from lxml.html import tostring # own @@ -545,7 +545,7 @@ def extract_content(tree, options): for expr in BODY_XPATH: # select tree if the expression has been found try: - subtree = tree.xpath(expr)[0] + subtree = expr(tree)[0] except IndexError: continue # prune the subtree @@ -624,7 +624,7 @@ def extract_comments(tree, options): # potential_tags.add('div') trouble with
for expr in COMMENTS_XPATH: # select tree if the expression has been found - subtree = tree.xpath(expr) + subtree = expr(tree) if not subtree: continue subtree = subtree[0] @@ -714,7 +714,7 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options): def basic_cleaning(tree): "Remove a few section types from the document." - for elem in tree.xpath(BASIC_CLEAN_XPATH): + for elem in BASIC_CLEAN_XPATH(tree): elem.getparent().remove(elem) return tree @@ -952,7 +952,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, if prune_xpath is not None: if isinstance(prune_xpath, str): prune_xpath = [prune_xpath] - tree = prune_unwanted_nodes(tree, prune_xpath) + tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath]) # backup (or not) for further processing tree_backup_1 = deepcopy(tree) if no_fallback is False else None diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 090e316b..03801ac6 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -11,7 +11,7 @@ from copy import deepcopy from courlan.urlutils import fix_relative_urls, get_base_url -from lxml.etree import strip_tags +from lxml.etree import XPath, strip_tags from .filters import duplicate_test, textfilter from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED @@ -84,8 +84,8 @@ def prune_unwanted_nodes(tree, nodelist, with_backup=False): if with_backup is True: old_len = len(tree.text_content()) # ' '.join(tree.itertext()) backup = deepcopy(tree) - for expr in nodelist: - for subtree in tree.xpath(expr): + for expression in nodelist: + for subtree in expression(tree): # preserve tail text from deletion if subtree.tail is not None: previous = subtree.getprevious() diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py index 893eb320..3574818d 100644 --- a/trafilatura/metadata.py +++ b/trafilatura/metadata.py @@ -287,7 +287,7 @@ def extract_metainfo(tree, expressions, len_limit=200): for expression in expressions: # examine all results i = 0 - for elem in tree.xpath(expression): + for elem in expression(tree): content = trim(' '.join(elem.itertext())) if content and 2 < len(content) < len_limit: return content @@ -405,7 +405,7 @@ def extract_catstags(metatype, tree): for catexpr in xpath_expression: results.extend( elem.text_content() - for elem in tree.xpath(catexpr) + for elem in catexpr(tree) if re.search(regexpr, elem.attrib['href']) ) if results: diff --git a/trafilatura/metaxpaths.py b/trafilatura/metaxpaths.py index e51c26a2..5ba23cf1 100644 --- a/trafilatura/metaxpaths.py +++ b/trafilatura/metaxpaths.py @@ -5,16 +5,18 @@ # code available from https://github.com/adbar/trafilatura/ # under GNU GPLv3+ license +from lxml.etree import XPath + # the order or depth of XPaths could be changed after exhaustive testing -author_xpaths = [ +author_xpaths = [XPath(x) for x in ( '//*[(self::a or self::address or self::div or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me" or contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]|//author', # specific and almost specific '//*[(self::a or self::div or self::h3 or self::h4 or self::p or self::span)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian") or contains(@class, "submitted-by") or contains(@class, "posted-by") or @class="username" or @class="BBL" or contains(@class, "journalist-name")]', # almost generic and generic, last ones not common '//*[contains(translate(@id, "A", "a"), "author") or contains(translate(@class, "A", "a"), "author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(translate(@class, "B", "b"), "byline")]', # last resort: any element -] +)] -author_discard_xpaths = [ +author_discard_xpaths = [XPath(x) for x in ( """.//*[(self::a or self::div or self::section or self::span)][@id='comments' or @class='comments' or @class='title' or @class='date' or contains(@id, 'commentlist') or contains(@class, 'commentlist') or contains(@class, 'sidebar') or contains(@class, 'is-hidden') or contains(@class, 'quote') or contains(@id, 'comment-list') or contains(@class, 'comments-list') or contains(@class, 'embedly-instagram') or contains(@id, 'ProductReviews') or @@ -22,10 +24,10 @@ or starts-with(@class, 'comments') or starts-with(@class, 'Comments') ]""", '//time|//figure', -] +)] -categories_xpaths = [ +categories_xpaths = [XPath(x) for x in ( """//div[starts-with(@class, 'post-info') or starts-with(@class, 'postinfo') or starts-with(@class, 'post-meta') or starts-with(@class, 'postmeta') or starts-with(@class, 'meta') or starts-with(@class, 'entry-meta') or starts-with(@class, 'entry-info') or @@ -35,26 +37,26 @@ '//*[(self::li or self::span)][@class="post-category" or @class="postcategory" or @class="entry-category" or contains(@class, "cat-links")]//a[@href]', '//header[@class="entry-header"]//a[@href]', '//div[@class="row" or @class="tags"]//a[@href]', -] +)] # "//*[(self::div or self::p)][contains(@class, 'byline')]", -tags_xpaths = [ +tags_xpaths = [XPath(x) for x in ( '//div[@class="tags"]//a[@href]', "//p[starts-with(@class, 'entry-tags')]//a[@href]", '''//div[@class="row" or @class="jp-relatedposts" or @class="entry-utility" or starts-with(@class, 'tag') or starts-with(@class, 'postmeta') or starts-with(@class, 'meta')]//a[@href]''', '//*[@class="entry-meta" or contains(@class, "topics") or contains(@class, "tags-links")]//a[@href]', -] +)] # "related-topics" # https://github.com/grangier/python-goose/blob/develop/goose/extractors/tags.py -title_xpaths = [ +title_xpaths = [XPath(x) for x in ( '//*[(self::h1 or self::h2)][contains(@class, "post-title") or contains(@class, "entry-title") or contains(@class, "headline") or contains(@id, "headline") or contains(@itemprop, "headline") or contains(@class, "post__title") or contains(@class, "article-title")]', '//*[@class="entry-title" or @class="post-title"]', '//*[(self::h1 or self::h2 or self::h3)][contains(@class, "title") or contains(@id, "title")]', -] +)] # json-ld headline # '//header/h1', diff --git a/trafilatura/settings.py b/trafilatura/settings.py index 53f88e23..8e83e592 100644 --- a/trafilatura/settings.py +++ b/trafilatura/settings.py @@ -11,6 +11,8 @@ from os import cpu_count from pathlib import Path +from lxml.etree import XPath + def use_config(filename=None, config=None): @@ -72,7 +74,7 @@ def use_config(filename=None, config=None): ] # 'center', 'rb', 'wbr' -BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style" +BASIC_CLEAN_XPATH = XPath(".//aside|.//footer|.//script|.//style") TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote']) # + list(CUT_EMPTY_ELEMS) diff --git a/trafilatura/xpaths.py b/trafilatura/xpaths.py index 5687fefe..f25a1fb0 100644 --- a/trafilatura/xpaths.py +++ b/trafilatura/xpaths.py @@ -5,7 +5,10 @@ ## under GNU GPL v3 license -BODY_XPATH = [ +from lxml.etree import XPath + + +BODY_XPATH = [XPath(x) for x in ( '''.//*[(self::article or self::div or self::main or self::section)][ @class="post" or @class="entry" or contains(@class, "post-text") or contains(@class, "post_text") or @@ -46,7 +49,7 @@ or contains(translate(@class, "CP","cp"), "page-content") or @id="content" or @class="content"])[1]''', '(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]', -] +)] # starts-with(@id, "article") or # or starts-with(@id, "story") or contains(@class, "story") # starts-with(@class, "content ") or contains(@class, " content") @@ -58,7 +61,7 @@ # './/span[@class=""]', # instagram? -COMMENTS_XPATH = [ +COMMENTS_XPATH = [XPath(x) for x in ( """.//*[(self::div or self::list or self::section)][contains(@id, 'commentlist') or contains(@class, 'commentlist') or contains(@class, 'comment-page') or contains(@id, 'comment-list') or contains(@class, 'comments-list') or @@ -70,34 +73,34 @@ """.//*[(self::div or self::section or self::list)][starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread') or starts-with(@id, 'dsq-comments')]""", ".//*[(self::div or self::section)][starts-with(@id, 'social') or contains(@class, 'comment')]", -] +)] # or contains(@class, 'Comments') -REMOVE_COMMENTS_XPATH = [ +REMOVE_COMMENTS_XPATH = [XPath( """.//*[(self::div or self::list or self::section)][ starts-with(translate(@id, "C","c"), 'comment') or starts-with(translate(@class, "C","c"), 'comment') or contains(@class, 'article-comments') or contains(@class, 'post-comments') or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread') or starts-with(@id, 'dsq-comments') - ]""", -] + ]""" +)] # or self::span # or contains(@class, 'comment') or contains(@id, 'comment') -PAYWALL_DISCARD_XPATH = [ +PAYWALL_DISCARD_XPATH = [XPath( '''.//*[(self::div or self::p)][ contains(@id, "paywall") or contains(@id, "premium") or contains(@class, "paid-content") or contains(@class, "paidcontent") or contains(@class, "obfuscated") or contains(@class, "blurred") or contains(@class, "restricted") or contains(@class, "overlay") - ]''', -] + ]''' +)] -OVERALL_DISCARD_XPATH = [ +OVERALL_DISCARD_XPATH = [XPath(x) for x in ( # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts '''.//*[(self::div or self::item or self::list or self::p or self::section or self::span)][ @@ -155,7 +158,7 @@ or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint") or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true" or contains(@class, "notloaded")]''', -] +)] # conflicts: # contains(@id, "header") or contains(@class, "header") or # class contains "cats" (categories, also tags?) @@ -166,15 +169,15 @@ # the following conditions focus on extraction precision -TEASER_DISCARD_XPATH = [ +TEASER_DISCARD_XPATH = [XPath( '''.//*[(self::div or self::item or self::list or self::p or self::section or self::span)][ contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser") - ]''', -] + ]''' +)] -PRECISION_DISCARD_XPATH = [ +PRECISION_DISCARD_XPATH = [XPath(x) for x in ( './/header', '''.//*[(self::div or self::item or self::list or self::p or self::section or self::span)][ @@ -182,19 +185,19 @@ contains(@id, "link") or contains(@class, "link") or contains(@style, "border") ]''', -] +)] -DISCARD_IMAGE_ELEMENTS = [ +DISCARD_IMAGE_ELEMENTS = [XPath( '''.//*[(self::div or self::item or self::list or self::p or self::section or self::span)][ contains(@id, "caption") or contains(@class, "caption") ] ''' -] +)] -COMMENTS_DISCARD_XPATH = [ +COMMENTS_DISCARD_XPATH = [XPath(x) for x in ( './/*[(self::div or self::section)][starts-with(@id, "respond")]', './/cite|.//quote', '''.//*[@class="comments-title" or contains(@class, "comments-title") or @@ -202,4 +205,4 @@ starts-with(@class, "reply-") or contains(@class, "-reply-") or contains(@class, "message") or contains(@class, "signin") or contains(@id, "akismet") or contains(@class, "akismet") or contains(@style, "display:none")]''', -] +)]