Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect code in pre-tag using indicators and fix code formatting issues #776

Merged
merged 8 commits into from
Feb 7, 2025
14 changes: 13 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,15 +354,27 @@ def test_formatting():
Here is a code sample:

`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></p></article></body></html>')
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
print(my_result)
assert my_result == """### Title

Here is a code sample:

```
import something
something.run("somewhere")
```
Sometimes code is wrapped using `pre` and `code`:

```
import trafilatura
trafilatura.extract("")
```
Less often code is wrapped using just `pre`:

```
trafilatura.extract("")
```"""

# nested
Expand Down
12 changes: 12 additions & 0 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

PRESERVE_IMG_CLEANING = {"figure", "picture", "source"}

# Substrings that _is_code_block() searches for in an element's text;
# finding any one of them is enough for the text to count as a code block.
CODE_INDICATORS = ["{", "(\"", "('", "\n "]


def tree_cleaning(tree: HtmlElement, options: Extractor) -> HtmlElement:
"Prune the tree by discarding unwanted elements."
Expand Down Expand Up @@ -315,8 +317,18 @@ def convert_quotes(elem: _Element) -> None:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
if _is_code_block(elem.text):
code_flag = True
elem.tag = "code" if code_flag else "quote"

def _is_code_block(text: Optional[str]) -> bool:
    "Tell whether the given text contains at least one known code indicator."
    # None and the empty string can never match an indicator.
    if text is None or text == "":
        return False
    return any(marker in text for marker in CODE_INDICATORS)

def convert_headings(elem: _Element) -> None:
"Add head tags and delete attributes."
Expand Down
14 changes: 9 additions & 5 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@

CONTROL_PARSER = XMLParser(remove_blank_text=True)

NEWLINE_ELEMS = {'code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}

Expand Down Expand Up @@ -251,8 +251,8 @@ def validate_tei(xmldoc: _Element) -> bool:


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"Determine element text based on just the text of the element. One must deal with the tail separately."
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag == "head":
Expand All @@ -268,8 +268,12 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if rend in HI_FORMATTING:
elem_text = f"{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}"
elif element.tag == "code":
if "\n" in element.text:
elem_text = f"```\n{elem_text}\n```"
if "\n" in elem_text or element.xpath(".//lb"): # Handle <br> inside <code>
# Convert <br> to \n within code blocks
for lb in element.xpath(".//lb"):
elem_text = f"{elem_text}\n{lb.tail}"
lb.getparent().remove(lb)
elem_text = f"```\n{elem_text}\n```\n"
else:
elem_text = f"`{elem_text}`"
# handle links
Expand Down
Loading