From 27bb0137636d1d96a7b339ff72a34af01047264d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Tue, 13 Feb 2024 11:52:43 +0100
Subject: [PATCH] LXML: compile most XPath expressions (#504)

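* LXML: compile XPath expressions

* compile all XPath expressions

The XPath strings defined at module level (xpaths.py, metaxpaths.py and
BASIC_CLEAN_XPATH in settings.py) are now wrapped in lxml.etree.XPath
objects, and call sites evaluate them as expr(tree) in place of
tree.xpath(expr).

Note: prune_unwanted_nodes() now calls each entry of nodelist, so it
requires compiled expressions rather than strings. bare_extraction()
compiles user-supplied prune_xpath strings before passing them on.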
---
 trafilatura/core.py           | 10 ++++----
 trafilatura/htmlprocessing.py |  6 ++---
 trafilatura/metadata.py       |  4 ++--
 trafilatura/metaxpaths.py     | 22 +++++++++--------
 trafilatura/settings.py       |  4 +++-
 trafilatura/xpaths.py         | 45 +++++++++++++++++++----------------
 6 files changed, 49 insertions(+), 42 deletions(-)
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 38a83998..492debc2 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -13,7 +13,7 @@
 import warnings
 from copy import deepcopy
 
-from lxml.etree import Element, SubElement, strip_elements, strip_tags
+from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags
 from lxml.html import tostring
 
 # own
@@ -545,7 +545,7 @@ def extract_content(tree, options):
     for expr in BODY_XPATH:
         # select tree if the expression has been found
         try:
-            subtree = tree.xpath(expr)[0]
+            subtree = expr(tree)[0]
         except IndexError:
             continue
         # prune the subtree
@@ -624,7 +624,7 @@ def extract_comments(tree, options):
     # potential_tags.add('div') trouble with <div class="comment-author meta">
     for expr in COMMENTS_XPATH:
         # select tree if the expression has been found
-        subtree = tree.xpath(expr)
+        subtree = expr(tree)
         if not subtree:
             continue
         subtree = subtree[0]
@@ -714,7 +714,7 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
 
 def basic_cleaning(tree):
     "Remove a few section types from the document."
-    for elem in tree.xpath(BASIC_CLEAN_XPATH):
+    for elem in BASIC_CLEAN_XPATH(tree):
         elem.getparent().remove(elem)
     return tree
 
@@ -952,7 +952,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         if prune_xpath is not None:
             if isinstance(prune_xpath, str):
                 prune_xpath = [prune_xpath]
-            tree = prune_unwanted_nodes(tree, prune_xpath)
+            tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])
 
         # backup (or not) for further processing
         tree_backup_1 = deepcopy(tree) if no_fallback is False else None
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 090e316b..03801ac6 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -11,7 +11,7 @@
 from copy import deepcopy
 
 from courlan.urlutils import fix_relative_urls, get_base_url
-from lxml.etree import strip_tags
+from lxml.etree import XPath, strip_tags
 
 from .filters import duplicate_test, textfilter
 from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED
@@ -84,8 +84,8 @@ def prune_unwanted_nodes(tree, nodelist, with_backup=False):
     if with_backup is True:
         old_len = len(tree.text_content())  # ' '.join(tree.itertext())
         backup = deepcopy(tree)
-    for expr in nodelist:
-        for subtree in tree.xpath(expr):
+    for expression in nodelist:
+        for subtree in expression(tree):
             # preserve tail text from deletion
             if subtree.tail is not None:
                 previous = subtree.getprevious()
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index 893eb320..3574818d 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -287,7 +287,7 @@ def extract_metainfo(tree, expressions, len_limit=200):
     for expression in expressions:
         # examine all results
         i = 0
-        for elem in tree.xpath(expression):
+        for elem in expression(tree):
             content = trim(' '.join(elem.itertext()))
             if content and 2 < len(content) < len_limit:
                 return content
@@ -405,7 +405,7 @@ def extract_catstags(metatype, tree):
     for catexpr in xpath_expression:
         results.extend(
             elem.text_content()
-            for elem in tree.xpath(catexpr)
+            for elem in catexpr(tree)
             if re.search(regexpr, elem.attrib['href'])
         )
         if results:
diff --git a/trafilatura/metaxpaths.py b/trafilatura/metaxpaths.py
index e51c26a2..5ba23cf1 100644
--- a/trafilatura/metaxpaths.py
+++ b/trafilatura/metaxpaths.py
@@ -5,16 +5,18 @@
 # code available from https://github.com/adbar/trafilatura/
 # under GNU GPLv3+ license
 
+from lxml.etree import XPath
+
 
 # the order or depth of XPaths could be changed after exhaustive testing
-author_xpaths = [
+author_xpaths = [XPath(x) for x in (
     '//*[(self::a or self::address or self::div or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me" or contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]|//author', # specific and almost specific
     '//*[(self::a or self::div or self::h3 or self::h4 or self::p or self::span)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian") or contains(@class, "submitted-by") or contains(@class, "posted-by") or @class="username" or @class="BBL" or contains(@class, "journalist-name")]', # almost generic and generic, last ones not common
     '//*[contains(translate(@id, "A", "a"), "author") or contains(translate(@class, "A", "a"), "author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(translate(@class, "B", "b"), "byline")]', # last resort: any element
-]
+)]
 
 
-author_discard_xpaths = [
+author_discard_xpaths = [XPath(x) for x in (
     """.//*[(self::a or self::div or self::section or self::span)][@id='comments' or @class='comments' or @class='title' or @class='date' or
     contains(@id, 'commentlist') or contains(@class, 'commentlist') or contains(@class, 'sidebar') or contains(@class, 'is-hidden') or contains(@class, 'quote')
     or contains(@id, 'comment-list') or contains(@class, 'comments-list') or contains(@class, 'embedly-instagram') or contains(@id, 'ProductReviews') or
@@ -22,10 +24,10 @@
     or starts-with(@class, 'comments') or starts-with(@class, 'Comments')
     ]""",
     '//time|//figure',
-]
+)]
 
 
-categories_xpaths = [
+categories_xpaths = [XPath(x) for x in (
     """//div[starts-with(@class, 'post-info') or starts-with(@class, 'postinfo') or
     starts-with(@class, 'post-meta') or starts-with(@class, 'postmeta') or
     starts-with(@class, 'meta') or starts-with(@class, 'entry-meta') or starts-with(@class, 'entry-info') or
@@ -35,26 +37,26 @@
     '//*[(self::li or self::span)][@class="post-category" or @class="postcategory" or @class="entry-category" or contains(@class, "cat-links")]//a[@href]',
     '//header[@class="entry-header"]//a[@href]',
     '//div[@class="row" or @class="tags"]//a[@href]',
-]
+)]
 # "//*[(self::div or self::p)][contains(@class, 'byline')]",
 
 
-tags_xpaths = [
+tags_xpaths = [XPath(x) for x in (
     '//div[@class="tags"]//a[@href]',
     "//p[starts-with(@class, 'entry-tags')]//a[@href]",
     '''//div[@class="row" or @class="jp-relatedposts" or
     @class="entry-utility" or starts-with(@class, 'tag') or
     starts-with(@class, 'postmeta') or starts-with(@class, 'meta')]//a[@href]''',
     '//*[@class="entry-meta" or contains(@class, "topics") or contains(@class, "tags-links")]//a[@href]',
-]
+)]
 # "related-topics"
 # https://github.com/grangier/python-goose/blob/develop/goose/extractors/tags.py
 
 
-title_xpaths = [
+title_xpaths = [XPath(x) for x in (
     '//*[(self::h1 or self::h2)][contains(@class, "post-title") or contains(@class, "entry-title") or contains(@class, "headline") or contains(@id, "headline") or contains(@itemprop, "headline") or contains(@class, "post__title") or contains(@class, "article-title")]',
     '//*[@class="entry-title" or @class="post-title"]',
     '//*[(self::h1 or self::h2 or self::h3)][contains(@class, "title") or contains(@id, "title")]',
-]
+)]
 # json-ld headline
 # '//header/h1',
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index 53f88e23..8e83e592 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -11,6 +11,8 @@
 from os import cpu_count
 from pathlib import Path
 
+from lxml.etree import XPath
+
 
 
 def use_config(filename=None, config=None):
@@ -72,7 +74,7 @@ def use_config(filename=None, config=None):
 ]
 # 'center', 'rb', 'wbr'
 
-BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"
+BASIC_CLEAN_XPATH = XPath(".//aside|.//footer|.//script|.//style")
 
 TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
 # + list(CUT_EMPTY_ELEMS)
diff --git a/trafilatura/xpaths.py b/trafilatura/xpaths.py
index 5687fefe..f25a1fb0 100644
--- a/trafilatura/xpaths.py
+++ b/trafilatura/xpaths.py
@@ -5,7 +5,10 @@
 ## under GNU GPL v3 license
 
 
-BODY_XPATH = [
+from lxml.etree import XPath
+
+
+BODY_XPATH = [XPath(x) for x in (
     '''.//*[(self::article or self::div or self::main or self::section)][
     @class="post" or @class="entry" or
     contains(@class, "post-text") or contains(@class, "post_text") or
@@ -46,7 +49,7 @@
     or contains(translate(@class, "CP","cp"), "page-content") or
     @id="content" or @class="content"])[1]''',
     '(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
-]
+)]
 # starts-with(@id, "article") or
 # or starts-with(@id, "story") or contains(@class, "story")
 # starts-with(@class, "content ") or contains(@class, " content")
@@ -58,7 +61,7 @@
 # './/span[@class=""]', # instagram?
 
 
-COMMENTS_XPATH = [
+COMMENTS_XPATH = [XPath(x) for x in (
     """.//*[(self::div or self::list or self::section)][contains(@id, 'commentlist')
     or contains(@class, 'commentlist') or contains(@class, 'comment-page') or
     contains(@id, 'comment-list') or contains(@class, 'comments-list') or
@@ -70,34 +73,34 @@
     """.//*[(self::div or self::section or self::list)][starts-with(@id, 'comol') or
     starts-with(@id, 'disqus_thread') or starts-with(@id, 'dsq-comments')]""",
     ".//*[(self::div or self::section)][starts-with(@id, 'social') or contains(@class, 'comment')]",
-]
+)]
 # or contains(@class, 'Comments')
 
 
-REMOVE_COMMENTS_XPATH = [
+REMOVE_COMMENTS_XPATH = [XPath(
     """.//*[(self::div or self::list or self::section)][
     starts-with(translate(@id, "C","c"), 'comment') or
     starts-with(translate(@class, "C","c"), 'comment') or
     contains(@class, 'article-comments') or contains(@class, 'post-comments')
     or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
     or starts-with(@id, 'dsq-comments')
-    ]""",
-]
+    ]"""
+)]
 # or self::span
 # or contains(@class, 'comment') or contains(@id, 'comment')
 
 
-PAYWALL_DISCARD_XPATH = [
+PAYWALL_DISCARD_XPATH = [XPath(
     '''.//*[(self::div or self::p)][
     contains(@id, "paywall") or contains(@id, "premium") or
     contains(@class, "paid-content") or contains(@class, "paidcontent") or
     contains(@class, "obfuscated") or contains(@class, "blurred") or
     contains(@class, "restricted") or contains(@class, "overlay")
-    ]''',
-]
+    ]'''
+)]
 
 
-OVERALL_DISCARD_XPATH = [
+OVERALL_DISCARD_XPATH = [XPath(x) for x in (
     # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
     '''.//*[(self::div or self::item or self::list
              or self::p or self::section or self::span)][
@@ -155,7 +158,7 @@
     or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
     or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
     or contains(@class, "notloaded")]''',
-]
+)]
 # conflicts:
 # contains(@id, "header") or contains(@class, "header") or
 # class contains "cats" (categories, also tags?)
@@ -166,15 +169,15 @@
 
 
 # the following conditions focus on extraction precision
-TEASER_DISCARD_XPATH = [
+TEASER_DISCARD_XPATH = [XPath(
     '''.//*[(self::div or self::item or self::list
              or self::p or self::section or self::span)][
         contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
-    ]''',
-]
+    ]'''
+)]
 
 
-PRECISION_DISCARD_XPATH = [
+PRECISION_DISCARD_XPATH = [XPath(x) for x in (
     './/header',
     '''.//*[(self::div or self::item or self::list
              or self::p or self::section or self::span)][
@@ -182,19 +185,19 @@
         contains(@id, "link") or contains(@class, "link")
         or contains(@style, "border")
     ]''',
-]
+)]
 
 
-DISCARD_IMAGE_ELEMENTS = [
+DISCARD_IMAGE_ELEMENTS = [XPath(
     '''.//*[(self::div or self::item or self::list
              or self::p or self::section or self::span)][
              contains(@id, "caption") or contains(@class, "caption")
             ]
     '''
-]
+)]
 
 
-COMMENTS_DISCARD_XPATH = [
+COMMENTS_DISCARD_XPATH = [XPath(x) for x in (
     './/*[(self::div or self::section)][starts-with(@id, "respond")]',
     './/cite|.//quote',
     '''.//*[@class="comments-title" or contains(@class, "comments-title") or
@@ -202,4 +205,4 @@
     starts-with(@class, "reply-") or contains(@class, "-reply-") or contains(@class, "message")
     or contains(@class, "signin") or
     contains(@id, "akismet") or contains(@class, "akismet") or contains(@style, "display:none")]''',
-]
+)]