From 6cc2c69d81593ef7d767fee485448a4678d384d1 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 7 Feb 2024 11:42:08 +0100 Subject: [PATCH] add code formatting in TXT/Markdown output (#498) * add code formatting in TXT/Markdown output * distinguish between inline code and code snippets * use f-strings --- tests/unit_tests.py | 15 +++++++++++++++ trafilatura/xml.py | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 9f5a536f..987aa694 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -293,6 +293,21 @@ def test_formatting(): my_document = html.fromstring('

<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>

') my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG) assert my_result == '### Title\n**This here is in bold font.**' + # code sections + my_document = html.fromstring('

<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura</code></article></body></html>

') + my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG) + assert my_result == """### Title +Here is a code sample: +`import trafilatura`""" + my_document = html.fromstring('

<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></article></body></html>

') + my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG) + assert my_result == """### Title +Here is a code sample: +``` +import trafilatura +trafilatura.extract("") +```""" + # nested my_document = html.fromstring('

<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>

') my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG) diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 5a1814e5..3b577bb0 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -230,6 +230,11 @@ def replace_element_text(element, include_formatting): rend = element.get('rend') if rend in HI_FORMATTING: element.text = f'{HI_FORMATTING[rend]}{element.text}{HI_FORMATTING[rend]}' + elif element.tag == 'code': + if '\n' in element.text: + element.text = f'```\n{element.text}\n```' + else: + element.text = f'`{element.text}`' # handle links if element.tag == 'ref': if element.text is not None: