From eed8c0bcfbfc04f9b2d067ab1a313d3d61094dd0 Mon Sep 17 00:00:00 2001 From: CodyInnowhere Date: Tue, 24 Dec 2024 20:15:11 +0800 Subject: [PATCH] add preserve_space option to keep original format in code blocks --- tests/unit_tests.py | 20 ++++++++++++++++++++ trafilatura/core.py | 16 ++++++++++++++-- trafilatura/settings.py | 3 +++ trafilatura/xml.py | 4 ++-- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index b43362c8..1c266d33 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -442,6 +442,26 @@ def test_formatting(): my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG) assert '1) The in Operator' in my_result and '

The easiest way to check if a Python string contains a substring is to use the in operator. The in operator is used to check data structures for membership in Python. It returns a Boolean (either True or False) and can be used as follows:

' in my_result + my_document = html.fromstring(""" +
python code below: + ```python +def test: + print('hello') + print('world') + ``` +
+ """) + my_result = extract(my_document, output_format='markdown', include_formatting=True) + assert "python code below:\n```python\ndef test:\nprint('hello')\nprint('world')\n```" == my_result + + my_result = extract(my_document, output_format='markdown', include_formatting=True, preserve_space=True) + assert """python code below: + ```python +def test: + print('hello') + print('world') + ```""" == my_result + def test_extract_with_metadata(): '''Test extract_with_metadata method''' diff --git a/trafilatura/core.py b/trafilatura/core.py index dc1a01fd..464f29bc 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -91,9 +91,10 @@ def determine_returnstring(document: Document, options: Extractor) -> str: header += "---\n" else: header = "" - returnstring = f"{header}{xmltotxt(document.body, options.formatting)}" + returnstring = f"{header}{xmltotxt(document.body, options.formatting, options.preserve_space)}" if document.commentsbody is not None: - returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip() + returnstring = \ + f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting, options.preserve_space)}".strip() # normalize Unicode format (defaults to NFC) return normalize_unicode(returnstring) @@ -140,6 +141,7 @@ def bare_extraction( include_tables: bool = True, include_images: bool = False, include_formatting: bool = False, + preserve_space: bool = False, include_links: bool = False, deduplicate: bool = False, date_extraction_params: Optional[Dict[str, Any]] = None, @@ -171,6 +173,7 @@ def bare_extraction( include_images: Take images into account (experimental). include_formatting: Keep structural elements related to formatting (present in XML format, converted to markdown otherwise). + preserve_space: Preserve space when formatting text. include_links: Keep links along with their targets (experimental). deduplicate: Remove duplicate segments and documents. date_extraction_params: Provide extraction parameters to htmldate as dict(). @@ -205,6 +208,7 @@ def bare_extraction( recall=favor_recall, comments=include_comments, formatting=include_formatting, + preserve_space=preserve_space, links=include_links, images=include_images, tables=include_tables, @@ -361,6 +365,7 @@ def extract( include_tables: bool = True, include_images: bool = False, include_formatting: bool = False, + preserve_space: bool = False, include_links: bool = False, deduplicate: bool = False, date_extraction_params: Optional[Dict[str, Any]] = None, @@ -394,6 +399,7 @@ def extract( include_images: Take images into account (experimental). include_formatting: Keep structural elements related to formatting (only valuable if output_format is set to XML). + preserve_space: Preserve space when formatting text. include_links: Keep links along with their targets (experimental). deduplicate: Remove duplicate segments and documents. date_extraction_params: Provide extraction parameters to htmldate as dict(). @@ -427,6 +433,7 @@ def extract( include_tables=include_tables, include_images=include_images, include_formatting=include_formatting, + preserve_space=preserve_space, include_links=include_links, deduplicate=deduplicate, date_extraction_params=date_extraction_params, @@ -456,6 +463,7 @@ def extract_with_metadata( include_tables: bool = True, include_images: bool = False, include_formatting: bool = False, + preserve_space: bool = False, include_links: bool = False, deduplicate: bool = False, date_extraction_params: Optional[Dict[str, Any]] = None, @@ -487,6 +495,7 @@ def extract_with_metadata( include_images: Take images into account (experimental). include_formatting: Keep structural elements related to formatting (only valuable if output_format is set to XML). + preserve_space: Preserve space when formatting text. include_links: Keep links along with their targets (experimental). deduplicate: Remove duplicate segments and documents. date_extraction_params: Provide extraction parameters to htmldate as dict(). @@ -515,6 +524,7 @@ def extract_with_metadata( include_tables=include_tables, include_images=include_images, include_formatting=include_formatting, + preserve_space=preserve_space, include_links=include_links, deduplicate=deduplicate, date_extraction_params=date_extraction_params, @@ -564,6 +574,7 @@ def _internal_extraction( include_tables: bool = True, include_images: bool = False, include_formatting: bool = False, + preserve_space: bool = False, include_links: bool = False, deduplicate: bool = False, date_extraction_params: Optional[Dict[str, Any]] = None, @@ -590,6 +601,7 @@ def _internal_extraction( recall=favor_recall, comments=include_comments, formatting=include_formatting, + preserve_space=preserve_space, links=include_links, images=include_images, tables=include_tables, diff --git a/trafilatura/settings.py b/trafilatura/settings.py index 778543ae..543e74b2 100644 --- a/trafilatura/settings.py +++ b/trafilatura/settings.py @@ -70,6 +70,7 @@ class Extractor: "focus", "comments", "formatting", + "preserve_space", "links", "images", "tables", @@ -108,6 +109,7 @@ def __init__( recall: bool = False, comments: bool = True, formatting: bool = False, + preserve_space: bool = False, links: bool = False, images: bool = False, tables: bool = True, @@ -131,6 +133,7 @@ def __init__( ) self.comments: bool = comments self.formatting: bool = formatting or self.format == "markdown" + self.preserve_space: bool = preserve_space self.links: bool = links self.images: bool = images self.tables: bool = tables diff --git a/trafilatura/xml.py b/trafilatura/xml.py index a31e70da..02cc886b 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -361,7 +361,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting returnlist.append(element.tail) -def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: +def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool, preserve_space: bool = False) -> str: "Convert to plain text format and optionally preserve formatting as markdown." if xmloutput is None: return "" @@ -370,7 +370,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: process_element(xmloutput, returnlist, include_formatting) - return unescape(sanitize("".join(returnlist)) or "") + return unescape(sanitize("".join(returnlist), preserve_space) or "") def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str: