Skip to content

Commit

Permalink
Merge branch 'master' into fix-lists-in-tables
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Apr 11, 2024
2 parents f5b6045 + 54ad86c commit 8a65919
Show file tree
Hide file tree
Showing 19 changed files with 789 additions and 760 deletions.
3 changes: 2 additions & 1 deletion tests/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@
from trafilatura import extract

try:
from trafilatura.core import baseline
from trafilatura import baseline
except ImportError:
print("Cannot import baseline, using simple version")
baseline = None
from evaldata import EVAL_PAGES

Expand Down
6 changes: 3 additions & 3 deletions tests/comparison_small.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

from trafilatura import extract
try:
from trafilatura.core import baseline, html2txt
from trafilatura import baseline, html2txt
except ImportError:
print("Cannot import baseline, using simple version")
baseline = None
html2txt = None
#from trafilatura.htmlprocessing import prune_html
Expand Down Expand Up @@ -155,8 +156,7 @@ def run_baseline(htmlstring):
if baseline is not None:
_, result, _ = baseline(htmlstring)
return result
result = run_baseline_2(htmlstring)
return result
return run_baseline_2(htmlstring)


def run_trafilatura(htmlstring):
Expand Down
35 changes: 30 additions & 5 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@
process_record, utils, xml)
from trafilatura.core import (Extractor, handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem, sanitize_tree,
trim)
from trafilatura.external import try_justext
handle_table, handle_textelem)
from trafilatura.external import sanitize_tree, try_justext
from trafilatura.filters import textfilter
from trafilatura.meta import reset_caches
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from trafilatura.utils import trim

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

Expand Down Expand Up @@ -116,6 +116,12 @@ def test_trim():

def test_input():
'''test if loaded strings/trees are handled properly'''
teststring = "高山云雾出好茶".encode("utf-8")
assert utils.detect_encoding(teststring) == ["utf-8"]
teststring = "高山云雾出好茶".encode("gb18030")
assert "gb18030" in utils.detect_encoding(teststring)
assert "gb18030" in utils.detect_encoding(teststring*1000)

assert utils.is_dubious_html("This is a string.") is True

htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
Expand Down Expand Up @@ -147,7 +153,8 @@ def test_input():
# old: with pytest.raises(TypeError) as err:
assert extract(None, 'url', '0000', target_language=None) is None
# legacy
assert process_record(None, 'url', '0000', target_language=None) is None
with pytest.raises(SystemExit):
assert process_record(None, 'url', '0000', target_language=None) is None
# GZip
with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
myinput = gzfile.read()
Expand Down Expand Up @@ -293,21 +300,29 @@ def test_formatting():
my_document = html.fromstring('<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n**This here is in bold font.**'

# space between paragraphs
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Paragraph 1</p><p>Paragraph 2</p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result.endswith('Paragraph 1\n\nParagraph 2')

# code sections
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura</code></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == """### Title
Here is a code sample:
`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == """### Title
Here is a code sample:
```
import trafilatura
trafilatura.extract("")
```"""

# nested
my_document = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
Expand Down Expand Up @@ -1255,7 +1270,17 @@ def test_lang_detection():
assert detected == sample['expected'], f"Lang detection failed for {sample['expected']}"


def test_config_loading():
    """Check that the settings file is located and parsed correctly."""
    # a nonexistent path must raise instead of silently falling back to defaults;
    # no assignment here: use_config() raises before it could return anything
    with pytest.raises(FileNotFoundError):
        use_config(filename="/bogus-dir/bogus-file.txt")

    # a valid custom settings file must yield a usable config object
    config = use_config(filename=os.path.join(RESOURCES_DIR, "newsettings.cfg"))
    assert config is not None


if __name__ == '__main__':
test_config_loading()
test_trim()
test_input()
test_formatting()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

import logging

from .core import bare_extraction, baseline, extract, html2txt, process_record
from .baseline import baseline, html2txt
from .core import bare_extraction, extract, process_record
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand Down
101 changes: 101 additions & 0 deletions trafilatura/baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# pylint:disable-msg=E0611
import re

from lxml.etree import Element, SubElement

from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim


JSON_SEARCH = re.compile(r'"articlebody": *"(.+?)(?<!\\)"', re.I)



def basic_cleaning(tree):
    """Prune undesirable sections (matched by BASIC_CLEAN_XPATH) from the document.

    The tree is modified in place and returned for convenience.
    """
    unwanted_nodes = BASIC_CLEAN_XPATH(tree)
    for node in unwanted_nodes:
        node.getparent().remove(node)
    return tree


def baseline(filecontent):
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.
    Args:
        filecontent: HTML code as binary string or string.
    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.
    """
    postbody = Element('body')
    tree = load_html(filecontent)
    if tree is None:
        return postbody, '', 0

    # 1. try to recover the article body from embedded JSON-LD metadata
    for script in tree.iterfind('.//script[@type="application/ld+json"]'):
        if script.text and '"article' in script.text:
            candidate = JSON_SEARCH.search(script.text)
            if candidate:
                paragraph = SubElement(postbody, 'p')
                # unescape the JSON string before storing it
                paragraph.text = trim(candidate[1].replace('\\"', '"'))
                return postbody, paragraph.text, len(paragraph.text)

    tree = basic_cleaning(tree)

    # 2. scrape from an <article> tag if it carries enough text
    article_elem = tree.find('.//article')
    if article_elem is not None:
        article_text = trim(article_elem.text_content())
        if len(article_text) > 100:
            paragraph = SubElement(postbody, 'p')
            paragraph.text = article_text
            return postbody, article_text, len(article_text)

    # 3. collect text paragraphs, deduplicated in order of first appearance
    seen = set()
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        candidate_text = element.text_content()
        if candidate_text not in seen:
            SubElement(postbody, 'p').text = candidate_text
            seen.add(candidate_text)
    collected = trim('\n'.join(postbody.itertext()))
    if len(collected) > 100:
        return postbody, collected, len(collected)

    # 4. default strategy: take everything inside the cleaned <body>
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        # elem.text = trim(body_elem.text_content())
        body_text = '\n'.join([trim(e) for e in body_elem.itertext()])
        if len(body_text) > 100:
            SubElement(postbody, 'p').text = body_text
            return postbody, body_text, len(body_text)

    # 5. new fallback: basic text conversion of the whole tree
    fallback_text = html2txt(tree)
    SubElement(postbody, 'p').text = fallback_text
    return postbody, fallback_text, len(fallback_text)
    # old: return postbody, '', 0


def html2txt(content):
    """Run basic html2txt on a document.
    Args:
        content: HTML document as string or LXML element.
    Returns:
        The extracted text in the form of a string or an empty string.
    """
    tree = load_html(content)
    if tree is not None:
        body = tree.find(".//body")
        if body is not None:
            # pruning happens in place, so `body` reflects the cleaned tree
            basic_cleaning(tree)
            # collapse all whitespace runs into single spaces
            return " ".join(body.text_content().split()).strip()
    return ""
7 changes: 4 additions & 3 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

from trafilatura import spider

from .core import extract, html2txt
from .baseline import html2txt
from .core import extract
from .downloads import (add_to_compressed_dict, buffered_downloads,
load_download_buffer)
from .feeds import find_feed_urls
Expand All @@ -26,7 +27,7 @@
from .meta import reset_caches
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, use_config
from .sitemaps import sitemap_search
from .utils import URL_BLACKLIST_REGEX, make_chunks, uniquify_list
from .utils import URL_BLACKLIST_REGEX, make_chunks

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -67,7 +68,7 @@ def load_input_urls(args):
LOGGER.warning('No input provided')

# uniq URLs while preserving order (important)
return uniquify_list(input_urls)
return list(dict.fromkeys(input_urls))


def load_blacklist(filename):
Expand Down
Loading

0 comments on commit 8a65919

Please sign in to comment.