remove extra space in cell and highlighted text

adbar · Feb 9, 2025 · 2e31705 · 2e31705
1 parent 837358f
commit 2e31705
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 29 deletions.
diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py
@@ -438,12 +438,12 @@ def test_extract(xmloutput, formatting):
     if formatting is False:
         assert 'Einblicke in die Vision zukünftiger Softwaregenerationen.' in result
     else:
-        assert 'Einblicke in die **Vision zukünftiger Softwaregenerationen** .\n' in result
+        assert 'Einblicke in die **Vision zukünftiger Softwaregenerationen**.\n' in result
     assert 'Frage 4: Welche Rolle spielt Big Data in Bezug auf Assistenz-Systeme und KI?' in result
     if formatting is False:
         assert 'von The unbelievable Machine Company (*um) zur Verfügung gestellt.' in result
     else:
-        assert 'von **The unbelievable Machine Company (*um)**  zur Verfügung gestellt.\n' in result
+        assert 'von **The unbelievable Machine Company (*um)** zur Verfügung gestellt.\n' in result
     assert 'Matthias Weber ist ERP-Experte mit langjähriger Berufserfahrung.' not in result
     assert 'Die Top 5 digitalen Trends für den Mittelstand' not in result
     assert ', leading edge,' not in result  # and 'Lesen Sie hier einen weiteren spannenden Beitrag' not in result
@@ -514,7 +514,7 @@ def test_extract_links_formatting():
     result = load_mock_page('http://www.pcgamer.com/2012/08/09/skyrim-part-1/', formatting=True, links=True)
     assert 'In [Skyrim](https://www.pcgamer.com/best-skyrim-mods/), a mage' in result
     # the original has the space at the end of the em tag
-    assert "*Legends * don't destroy *houses* ," in result
+    assert "*Legends *don't destroy *houses*," in result
 
 
 def test_pages():

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -333,7 +333,7 @@ def test_formatting():
     my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
     my_document = html.fromstring(my_string)
     my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
-    assert my_result == '### Title\n\n**This here is in bold font.** Non-bold here'
+    assert my_result == '### Title\n\n**This here is in bold font.**Non-bold here'
     assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
     assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")
 
@@ -437,10 +437,10 @@ def test_formatting():
     assert my_result == 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.'
 
     my_result = extract(copy(my_document), fast=True, include_formatting=True, config=ZERO_CONFIG)
-    assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'
+    assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
 
     my_result = extract(copy(my_document), fast=True, include_links=True, include_formatting=True, config=ZERO_CONFIG)
-    assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , [link](test.html) and additional text to bypass detection.'
+    assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, [link](test.html) and additional text to bypass detection.'
 
     my_result = extract(copy(my_document), output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result
@@ -449,7 +449,7 @@ def test_formatting():
     my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, fast=True, config=ZERO_CONFIG)
     assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref> and additional text to bypass detection.</p>' in my_result
     my_result = extract(my_document, output_format='txt', fast=True, include_formatting=True, config=ZERO_CONFIG)
-    assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'
+    assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
 
     # double <p>-elems
     # could be solved by keeping the elements instead of reconstructing them
@@ -1153,7 +1153,7 @@ def test_table_processing():
         <cell>you buy</cell>
         <cell>they buy</cell>
       </row>''' in my_result
-    assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense  | I buy  | you buy  |")
+    assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
     # table with links
     # todo: further tests and adjustments
     htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
@@ -1248,7 +1248,7 @@ def test_table_processing():
     # remove new lines in table cells in text format
     htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| cell 1  | cell 2  |" in result
+    assert "| cell 1 | cell 2 |" in result
 
     # only one header row is allowed in text format
     htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
@@ -1258,15 +1258,15 @@ def test_table_processing():
     # handle colspan by appending columns in text format
     htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| a  | b  | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| a  | b  | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="2.1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| a  | b  | |" in result
+    assert "| a | b | |" in result
 
     # MemoryError: https://github.com/adbar/trafilatura/issues/657
     htmlstring = '<html><body><article><table><tr><td colspan="9007199254740991">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
@@ -1280,16 +1280,16 @@ def test_table_processing():
     # wrong span info
     htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| a  | b  | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="abc">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "| a  | b  | |" in result
+    assert "| a | b | |" in result
 
     # links: this gets through (for now)
     htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "| a  |"
+    assert result == "| a |"
 
     # link: this is filtered out
     htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
@@ -1311,7 +1311,7 @@ def test_table_processing():
                  </article></body></html>
                  """
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "| a  | b  | c  | \n| a  | b c  | |"
+    assert result == "| a | b | c | \n| a | b c | |"
 
     htmlstring = """
                  <html><body><article>
@@ -1329,7 +1329,7 @@ def test_table_processing():
                  </article></body></html>
                  """
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "| a  | b  | c  | \n| a  | b c  | |\n| a  | b c  | |"
+    assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"
 
     htmlstring = """
                  <html><body><article>
@@ -1345,7 +1345,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a  | b  | c  | \n| a ![img](http://aa.bb/c.jpg) a  | b c  | d  |"
+    assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1361,7 +1361,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a  | b  | c  | \n| ![img](http://aa.bb/c.jpg) a  | b c  | d  |"
+    assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1377,7 +1377,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a  | b  | c  | \n| ![img](http://aa.bb/c.jpg) a  | b c  | d  |"
+    assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1393,7 +1393,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a  | b  | c  | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg)  | b c  | d  |"
+    assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
 
 
 def test_list_processing():

diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
@@ -481,7 +481,7 @@ def test_replace_element_text():
     elem = Element("hi")
     elem.text = "Text"
     elem.set("rend", "#b")
-    assert replace_element_text(elem, True) == "**Text** "
+    assert replace_element_text(elem, True) == "**Text**"
 
     elem = Element("item")
     elem.text = "Test text"

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -465,3 +465,18 @@ def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
 def is_in_table_cell(elem: _Element) -> bool:
     '''Check whether an element is a table cell or is nested inside one.'''
     # The axis has to be evaluated relative to ``elem``: a leading ``//``
     # makes the expression absolute, so it would search from the document
     # root and succeed as soon as any cell in the document has content,
     # no matter where ``elem`` itself is located.
     return bool(elem.xpath('ancestor-or-self::cell'))
+
+
+def is_last_element_in_cell(elem: _Element) -> bool:
+    '''Check whether an element is the last element in a table cell.
+
+    A ``cell`` element counts as last only when it has no child elements.
+    Any other element for which ``is_in_table_cell`` holds counts as last
+    when it is the final child of its direct parent. Every other case,
+    including an element without a parent, yields ``False``.
+    '''
+    if elem.tag == "cell":
+        # a cell can never be one of its own children, so only the
+        # childless case can be true here
+        return len(elem) == 0
+    if not is_in_table_cell(elem):
+        return False
+    parent = elem.getparent()
+    # identity check against the final child of the direct parent; note that
+    # the direct parent is the cell itself only for direct children of a cell
+    return parent is not None and parent[-1] is elem
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -17,7 +17,7 @@
                         fromstring, tostring, DTD)
 
 from .settings import Document, Extractor
-from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
+from .utils import is_in_table_cell, is_last_element_in_cell, sanitize, sanitize_tree, text_chars_test
 
 
 LOGGER = logging.getLogger(__name__)
@@ -281,8 +281,8 @@ def is_last_element_in_item(element: _Element) -> bool:
 
 
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
+    """Determine element text based on just the text of the element. One must deal with the tail separately."""
     elem_text = element.text or ""
-    "Determine element text based on just the text of the element. One must deal with the tail separately."
     # handle formatting: convert to markdown
     if include_formatting and element.text:
         if element.tag in ('article', 'list', 'table'):
@@ -298,8 +298,7 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
         elif element.tag == "hi":
             rend = element.get("rend")
             if rend in HI_FORMATTING:
-                # force space after highlight text to make it more compatible with various md renderers
-                elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]} "
+                elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
         elif element.tag == "code":
             if "\n" in elem_text or element.xpath(".//lb"):  # Handle <br> inside <code>
                 # Convert <br> to \n within code blocks
@@ -322,11 +321,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
         else:
             LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
     # cells
-    if element.tag == "cell":
+    if element.tag == 'cell':
         elem_text = elem_text.strip()
 
-        if elem_text:
+        if elem_text and not is_last_element_in_cell(element):
             elem_text = f"{elem_text} "
+
     # within lists
     if is_first_element_in_item(element) and not is_in_table_cell(element):
         elem_text = f"- {elem_text}"
@@ -391,7 +391,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
     elif element.tag == "cell":
         returnlist.append(" | ")
-    elif element.tag not in SPECIAL_FORMATTING:
+    elif element.tag not in SPECIAL_FORMATTING and not is_last_element_in_cell(element): #  and not is_in_table_cell(element)
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS