Skip to content

Commit

Permalink
remove extra space in cell and highlighted text
Browse files Browse the repository at this point in the history
  • Loading branch information
unsleepy22 committed Feb 9, 2025
1 parent 837358f commit 2e31705
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 29 deletions.
6 changes: 3 additions & 3 deletions tests/realworld_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,12 +438,12 @@ def test_extract(xmloutput, formatting):
if formatting is False:
assert 'Einblicke in die Vision zukünftiger Softwaregenerationen.' in result
else:
assert 'Einblicke in die **Vision zukünftiger Softwaregenerationen** .\n' in result
assert 'Einblicke in die **Vision zukünftiger Softwaregenerationen**.\n' in result
assert 'Frage 4: Welche Rolle spielt Big Data in Bezug auf Assistenz-Systeme und KI?' in result
if formatting is False:
assert 'von The unbelievable Machine Company (*um) zur Verfügung gestellt.' in result
else:
assert 'von **The unbelievable Machine Company (*um)** zur Verfügung gestellt.\n' in result
assert 'von **The unbelievable Machine Company (*um)** zur Verfügung gestellt.\n' in result
assert 'Matthias Weber ist ERP-Experte mit langjähriger Berufserfahrung.' not in result
assert 'Die Top 5 digitalen Trends für den Mittelstand' not in result
assert ', leading edge,' not in result # and 'Lesen Sie hier einen weiteren spannenden Beitrag' not in result
Expand Down Expand Up @@ -514,7 +514,7 @@ def test_extract_links_formatting():
result = load_mock_page('http://www.pcgamer.com/2012/08/09/skyrim-part-1/', formatting=True, links=True)
assert 'In [Skyrim](https://www.pcgamer.com/best-skyrim-mods/), a mage' in result
# the original has the space at the end of the em tag
assert "*Legends * don't destroy *houses* ," in result
assert "*Legends *don't destroy *houses*," in result


def test_pages():
Expand Down
36 changes: 18 additions & 18 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def test_formatting():
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
my_document = html.fromstring(my_string)
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n\n**This here is in bold font.** Non-bold here'
assert my_result == '### Title\n\n**This here is in bold font.**Non-bold here'
assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")

Expand Down Expand Up @@ -437,10 +437,10 @@ def test_formatting():
assert my_result == 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.'

my_result = extract(copy(my_document), fast=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'

my_result = extract(copy(my_document), fast=True, include_links=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , [link](test.html) and additional text to bypass detection.'
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, [link](test.html) and additional text to bypass detection.'

my_result = extract(copy(my_document), output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result
Expand All @@ -449,7 +449,7 @@ def test_formatting():
my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, fast=True, config=ZERO_CONFIG)
assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref> and additional text to bypass detection.</p>' in my_result
my_result = extract(my_document, output_format='txt', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'

# double <p>-elems
# could be solved by keeping the elements instead of reconstructing them
Expand Down Expand Up @@ -1153,7 +1153,7 @@ def test_table_processing():
<cell>you buy</cell>
<cell>they buy</cell>
</row>''' in my_result
assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
# table with links
# todo: further tests and adjustments
htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
Expand Down Expand Up @@ -1248,7 +1248,7 @@ def test_table_processing():
# remove new lines in table cells in text format
htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| cell 1 | cell 2 |" in result
assert "| cell 1 | cell 2 |" in result

# only one header row is allowed in text format
htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
Expand All @@ -1258,15 +1258,15 @@ def test_table_processing():
# handle colspan by appending columns in text format
htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2.1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

# MemoryError: https://github.com/adbar/trafilatura/issues/657
htmlstring = '<html><body><article><table><tr><td colspan="9007199254740991">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
Expand All @@ -1280,16 +1280,16 @@ def test_table_processing():
# wrong span info
htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="abc">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

# links: this gets through (for now)
htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a |"
assert result == "| a |"

# link: this is filtered out
htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
Expand All @@ -1311,7 +1311,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c | \n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1329,7 +1329,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1345,7 +1345,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1361,7 +1361,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1377,7 +1377,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1393,7 +1393,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
Expand Down
2 changes: 1 addition & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def test_replace_element_text():
elem = Element("hi")
elem.text = "Text"
elem.set("rend", "#b")
assert replace_element_text(elem, True) == "**Text** "
assert replace_element_text(elem, True) == "**Text**"

elem = Element("item")
elem.text = "Test text"
Expand Down
15 changes: 15 additions & 0 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,3 +465,18 @@ def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
def is_in_table_cell(elem: _Element) -> bool:
    '''Check whether an element is in a table cell.

    Args:
        elem: the element whose ancestors are inspected.

    Returns:
        True if at least one ancestor of ``elem`` is a ``cell`` element,
        False otherwise.
    '''
    # A leading "//" anchors the expression at the document root, so it
    # would match any cell with content anywhere in the tree, whatever
    # elem is. The relative axis restricts the lookup to elem's ancestors.
    return bool(elem.xpath('ancestor::cell'))


def is_last_element_in_cell(elem: _Element) -> bool:
    '''Check whether an element is the last element in a table cell.

    Args:
        elem: a ``cell`` element, or an element nested in a table.

    Returns:
        For a ``cell``: True only if it has no child elements, i.e. its
        own text is the last content of the cell.
        For any other element: True if it lies in a table cell (as
        reported by ``is_in_table_cell``) and is the last child of its
        parent. False in every other case, including elements outside
        of tables and elements without a parent.
    '''
    if elem.tag == "cell":
        # A child can never be the cell itself, so the only way the cell's
        # own text ends the cell is when there are no children at all.
        return len(elem) == 0

    # Elements outside of table cells are never "last in cell"; answer
    # explicitly instead of falling through with nothing to compare.
    if not is_in_table_cell(elem):
        return False

    parent = elem.getparent()
    if parent is None:
        # detached or root element: there are no siblings to compare with
        return False

    # parent has at least one child (elem), so indexing is safe here
    return parent[-1] is elem
14 changes: 7 additions & 7 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
fromstring, tostring, DTD)

from .settings import Document, Extractor
from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
from .utils import is_in_table_cell, is_last_element_in_cell, sanitize, sanitize_tree, text_chars_test


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -281,8 +281,8 @@ def is_last_element_in_item(element: _Element) -> bool:


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"""Determine element text based on just the text of the element. One must deal with the tail separately."""
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag in ('article', 'list', 'table'):
Expand All @@ -298,8 +298,7 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
elif element.tag == "hi":
rend = element.get("rend")
if rend in HI_FORMATTING:
# force space after highlight text to make it more compatible with various md renderers
elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]} "
elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
elif element.tag == "code":
if "\n" in elem_text or element.xpath(".//lb"): # Handle <br> inside <code>
# Convert <br> to \n within code blocks
Expand All @@ -322,11 +321,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell":
if element.tag == 'cell':
elem_text = elem_text.strip()

if elem_text:
if elem_text and not is_last_element_in_cell(element):
elem_text = f"{elem_text} "

# within lists
if is_first_element_in_item(element) and not is_in_table_cell(element):
elem_text = f"- {elem_text}"
Expand Down Expand Up @@ -391,7 +391,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
returnlist.append(" | ")
elif element.tag not in SPECIAL_FORMATTING:
elif element.tag not in SPECIAL_FORMATTING and not is_last_element_in_cell(element): # and not is_in_table_cell(element)
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
Expand Down

0 comments on commit 2e31705

Please sign in to comment.