From 6cc2c69d81593ef7d767fee485448a4678d384d1 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Wed, 7 Feb 2024 11:42:08 +0100
Subject: [PATCH] add code formatting in TXT/Markdown output (#498)

* add code formatting in TXT/Markdown output
* distinguish between inline code and code snippets
* use f-strings
---
tests/unit_tests.py | 15 +++++++++++++++
trafilatura/xml.py | 5 +++++
2 files changed, 20 insertions(+)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 9f5a536f..987aa694 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -293,6 +293,21 @@ def test_formatting():
my_document = html.fromstring('Title
This here is in bold font.
')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n**This here is in bold font.**'
+ # code sections
+ my_document = html.fromstring('Title
Here is a code sample:
import trafilatura
')
+ my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
+ assert my_result == """### Title
+Here is a code sample:
+`import trafilatura`"""
+ my_document = html.fromstring('Title
Here is a code sample:
import trafilatura\ntrafilatura.extract("")
')
+ my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
+ assert my_result == """### Title
+Here is a code sample:
+```
+import trafilatura
+trafilatura.extract("")
+```"""
+
# nested
my_document = html.fromstring('This here is in bold and italic font.
')
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 5a1814e5..3b577bb0 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -230,6 +230,11 @@ def replace_element_text(element, include_formatting):
rend = element.get('rend')
if rend in HI_FORMATTING:
element.text = f'{HI_FORMATTING[rend]}{element.text}{HI_FORMATTING[rend]}'
+ elif element.tag == 'code':
+ if '\n' in element.text:
+ element.text = f'```\n{element.text}\n```'
+ else:
+ element.text = f'`{element.text}`'
# handle links
if element.tag == 'ref':
if element.text is not None: