fix: avoid faulty readability_lxml content (#635)
* fix: avoid faulty readability_lxml content

* add test
adbar authored Jun 27, 2024
1 parent 3abd2a1 · commit f5a53a8
Showing 2 changed files with 35 additions and 23 deletions.
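In short: compare_extraction() in trafilatura/external.py renames its decision flag from algo_flag to use_readability and tightens the length-based comparison, so that a much longer readability_lxml result is only preferred when it does not start with "{" (presumably JSON-like data rather than article text, see issue #632). A minimal illustration of the new guard, with made-up values that are not taken from the commit:

# hypothetical inputs: a short genuine extraction vs. faulty readability output
text = "Short but genuine paragraph extracted by trafilatura."
algo_text = '{"@type": "NewsArticle", "articleBody": "..."}' * 3
len_text, len_algo = len(text), len(algo_text)

# the guard added in this commit: length alone no longer decides
use_readability = len_algo > 2 * len_text and not algo_text.startswith("{")
print(use_readability)  # False: the brace-prefixed output is ignored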
24 changes: 14 additions & 10 deletions tests/unit_tests.py
@@ -795,30 +795,34 @@ def test_precision_recall():
'''test precision- and recall-oriented settings'''
# the test cases could be better
my_document = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
assert extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True) is not None
assert extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True) is not None
assert extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True) is not None
assert extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=True) is not None

my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>')
assert 'teaser text' in extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
assert 'teaser text' not in extract(copy(my_document), config=ZERO_CONFIG, fast=True)
assert 'teaser text' not in extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
assert 'teaser text' in extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=True)
assert 'teaser text' not in extract(copy(my_document), config=ZERO_CONFIG, no_fallback=True)
assert 'teaser text' not in extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)

my_document = html.fromstring('<html><body><article><div><p><a href="test.html">1.</a><br/><a href="test2.html">2.</a></p></div></article></body></html>')
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=True)
assert '1' not in result
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
assert '1' not in result

my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><h2>Test</h2></div></body></html>')
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
assert 'content' in result and 'Test' not in result

my_document = html.fromstring('<html><body><article><aside><p>Here is the text.</p></aside></article></body></html>')
result = extract(copy(my_document), favor_recall=False, config=ZERO_CONFIG, fast=True)
result = extract(copy(my_document), favor_recall=False, config=ZERO_CONFIG, no_fallback=True)
assert result != "Here is the text."
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=True)
assert result == "Here is the text."

my_document = html.fromstring('<html><body><div><h2>Title</h2><small>Text.</small></div></body></html>')
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=False)
assert len(result) > 0


def test_table_processing():
options = DEFAULT_OPTIONS
34 changes: 21 additions & 13 deletions trafilatura/external.py
@@ -48,56 +48,64 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
# bypass for recall
if options.focus == "recall" and len_text > options.min_extracted_size * 10:
return body, text, len_text
algo_flag, jt_result = False, False

use_readability, jt_result = False, False
# prior cleaning
backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
if options.focus == "precision":
backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)

# try with readability
temppost_algo = try_readability(backup_tree)
# unicode fix necessary on certain systems (#331)
algo_text = trim(tostring(temppost_algo, method='text', encoding='utf-8').decode('utf-8'))
len_algo = len(algo_text)

# compare
LOGGER.debug('extracted length: %s (algorithm) %s (extraction)', len_algo, len_text)
# conditions to use alternative algorithms
if len_algo in (0, len_text):
algo_flag = False
use_readability = False
elif len_text == 0 and len_algo > 0:
algo_flag = True
use_readability = True
elif len_text > 2 * len_algo:
algo_flag = False
elif len_algo > 2 * len_text:
algo_flag = True
use_readability = False
# quick fix for https://github.com/adbar/trafilatura/issues/632
elif len_algo > 2 * len_text and not algo_text.startswith("{"):
use_readability = True
# borderline cases
elif not body.xpath('.//p//text()') and len_algo > options.min_extracted_size * 2:
algo_flag = True
use_readability = True
elif len(body.findall('.//table')) > len(body.findall('.//p')) and len_algo > options.min_extracted_size * 2:
algo_flag = True
use_readability = True
# https://github.com/adbar/trafilatura/issues/354
elif options.focus == "recall" and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
algo_flag = True
use_readability = True
else:
LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, options.source)
algo_flag = False
use_readability = False

# apply decision
if algo_flag:
if use_readability:
body, text, len_text = temppost_algo, algo_text, len_algo
LOGGER.debug('using generic algorithm: %s', options.source)
else:
LOGGER.debug('using custom extraction: %s', options.source)

# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
LOGGER.debug('unclean document triggering justext examination: %s', options.source)
# tree = prune_unwanted_sections(tree, {}, options)
body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
# prevent too short documents from replacing the main text
if jt_result is True and not len_text > 4*len_text2: # threshold could be adjusted
if jt_result and not len_text > 4*len_text2: # threshold could be adjusted
LOGGER.debug('using justext, length: %s', len_text2)
body, text, len_text = body2, text2, len_text2

# post-processing: remove unwanted sections
if algo_flag is True and jt_result is False:
if use_readability and not jt_result:
body, text, len_text = sanitize_tree(body, options)

return body, text, len_text
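A usage sketch, not part of the commit: this comparison only runs when fallback algorithms are enabled, which the test calling extract() with no_fallback=False above exercises, while the precision/recall assertions disable fallbacks.

# sketch assuming the documented top-level API; compare_extraction() itself is internal
from trafilatura import extract

html_doc = "<html><body><article><p>This here is the text.</p></article></body></html>"

# default call: fallbacks enabled, so the result may be overridden by
# readability_lxml or justext via compare_extraction()
result = extract(html_doc)

# disabling fallbacks (no_fallback=True, also spelled fast=True in the test
# diff above) skips the comparison and keeps trafilatura's own extraction
bare_result = extract(html_doc, no_fallback=True)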

