Skip to content

Commit

Permalink
set preserve_space as default option
Browse files Browse the repository at this point in the history
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Feb 7, 2025
1 parent bc83c56 commit 4cb750e
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 52 deletions.
50 changes: 25 additions & 25 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
# titles as markdown
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>'
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
my_document = html.fromstring(my_string)
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n\n**This here is in bold font.**'
assert my_result == '### Title\n\n**This here is in bold font.** Non-bold here'
assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")

Expand Down Expand Up @@ -415,20 +415,20 @@ def test_formatting():
<li>Number 3</li>
<li><p>Number 4</p> n4</li>
</ul>
<p>Test</p></article></body></html>
Test</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html) n2\n- Number 3\n- Number 4 n4\n\nTest'
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html)n2\n- Number 3\n- Number 4 n4\n\nTest'
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
assert my_result == 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.'

my_result = extract(copy(my_document), fast=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'

my_result = extract(copy(my_document), fast=True, include_links=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, [link](test.html) and additional text to bypass detection.'
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , [link](test.html) and additional text to bypass detection.'

my_result = extract(copy(my_document), output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result
Expand All @@ -437,7 +437,7 @@ def test_formatting():
my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, fast=True, config=ZERO_CONFIG)
assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref> and additional text to bypass detection.</p>' in my_result
my_result = extract(my_document, output_format='txt', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
assert my_result == '**bold** , *italics* , `tt` , ~~deleted~~, __underlined__ , link and additional text to bypass detection.'

# double <p>-elems
# could be solved by keeping the elements instead of reconstructing them
Expand All @@ -456,19 +456,19 @@ def test_formatting():

my_document = html.fromstring("""
<html><head><body><article>python code below:
<pre><code>
<pre><code>
def test:
print('hello')
print('world')
</code></pre>
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```\ndef test:\nprint('hello')\nprint('world')\n```" == my_result
assert "python code below:\n```\ndef test:\n print('hello')\n print('world')\n \n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True)
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert """python code below:
```
```
def test:
print('hello')
print('world')
Expand Down Expand Up @@ -1141,7 +1141,7 @@ def test_table_processing():
<cell>you buy</cell>
<cell>they buy</cell>
</row>''' in my_result
assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
# table with links
# todo: further tests and adjustments
htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
Expand Down Expand Up @@ -1236,7 +1236,7 @@ def test_table_processing():
# remove new lines in table cells in text format
htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| cell 1 | cell 2 |" in result
assert "| cell 1 | cell 2 |" in result

# only one header row is allowed in text format
htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
Expand All @@ -1246,15 +1246,15 @@ def test_table_processing():
# handle colspan by appending columns in text format
htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2.1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

# MemoryError: https://github.com/adbar/trafilatura/issues/657
htmlstring = '<html><body><article><table><tr><td colspan="9007199254740991">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
Expand All @@ -1268,16 +1268,16 @@ def test_table_processing():
# wrong span info
htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="abc">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "| a | b | |" in result
assert "| a | b | |" in result

# links: this gets through (for now)
htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a |"
assert result == "| a |"

# link: this is filtered out
htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
Expand All @@ -1299,7 +1299,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1317,7 +1317,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1333,7 +1333,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1349,7 +1349,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1365,7 +1365,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1381,7 +1381,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
Expand Down
2 changes: 1 addition & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_replace_element_text():
elem = Element("item")
elem.text = "Test text"
elem.tag = "item"
assert replace_element_text(elem, True) == "- Test text "
assert replace_element_text(elem, True) == "- Test text"

elem = Element("ref")
elem.text = "Link"
Expand Down
17 changes: 3 additions & 14 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
header += "---\n"
else:
header = ""
returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}"
returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
if document.commentsbody is not None:
returnstring = \
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip()
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)

Expand Down Expand Up @@ -141,7 +141,6 @@ def bare_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -173,7 +172,6 @@ def bare_extraction(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(present in XML format, converted to markdown otherwise).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -208,7 +206,6 @@ def bare_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down Expand Up @@ -365,7 +362,6 @@ def extract(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -399,7 +395,6 @@ def extract(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
Expand Down Expand Up @@ -433,7 +428,6 @@ def extract(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -463,7 +457,6 @@ def extract_with_metadata(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand Down Expand Up @@ -495,8 +488,7 @@ def extract_with_metadata(
include_images: Take images into account (experimental).
include_formatting: Keep structural elements related to formatting
(only valuable if output_format is set to XML).
preserve_space: Preserve space when formatting text.
include_links: Keep links along with their targets (experimental).
include_links: Keep links along with their targets (experimental).
deduplicate: Remove duplicate segments and documents.
date_extraction_params: Provide extraction parameters to htmldate as dict().
url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
Expand Down Expand Up @@ -524,7 +516,6 @@ def extract_with_metadata(
include_tables=include_tables,
include_images=include_images,
include_formatting=include_formatting,
preserve_space=preserve_space,
include_links=include_links,
deduplicate=deduplicate,
date_extraction_params=date_extraction_params,
Expand Down Expand Up @@ -574,7 +565,6 @@ def _internal_extraction(
include_tables: bool = True,
include_images: bool = False,
include_formatting: bool = False,
preserve_space: bool = False,
include_links: bool = False,
deduplicate: bool = False,
date_extraction_params: Optional[Dict[str, Any]] = None,
Expand All @@ -601,7 +591,6 @@ def _internal_extraction(
recall=favor_recall,
comments=include_comments,
formatting=include_formatting,
preserve_space=preserve_space,
links=include_links,
images=include_images,
tables=include_tables,
Expand Down
3 changes: 0 additions & 3 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ class Extractor:
"focus",
"comments",
"formatting",
"preserve_space",
"links",
"images",
"tables",
Expand Down Expand Up @@ -109,7 +108,6 @@ def __init__(
recall: bool = False,
comments: bool = True,
formatting: bool = False,
preserve_space: bool = False,
links: bool = False,
images: bool = False,
tables: bool = True,
Expand All @@ -133,7 +131,6 @@ def __init__(
)
self.comments: bool = comments
self.formatting: bool = formatting or self.format == "markdown"
self.preserve_space: bool = preserve_space
self.links: bool = links
self.images: bool = images
self.tables: bool = tables
Expand Down
Loading

0 comments on commit 4cb750e

Please sign in to comment.