Skip to content

Commit

Permalink
Uses faster approach to detect code
Browse files Browse the repository at this point in the history
  • Loading branch information
steineggerroland committed Jan 21, 2025
1 parent 8309d65 commit cb70ec4
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,17 @@ def convert_quotes(elem: _Element) -> None:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
if elem.text and any([code_indicator in elem.text for code_indicator in CODE_INDICATORS]):
if _is_code_block(elem.text):
code_flag = True
elem.tag = "code" if code_flag else "quote"

def _is_code_block(text: str) -> bool:
    """Tell whether *text* contains at least one known code indicator.

    Falsy input (an empty string or None) is never treated as code.
    Otherwise the entries of CODE_INDICATORS are checked in order as
    substrings of *text*, stopping at the first match.
    """
    if text:
        # Generator keeps the check lazy: stops at the first indicator found.
        return any(marker in text for marker in CODE_INDICATORS)
    return False

def convert_headings(elem: _Element) -> None:
"Add head tags and delete attributes."
Expand Down

0 comments on commit cb70ec4

Please sign in to comment.