Skip to content

Commit

Permalink
Merge branch 'master' into better_xpaths
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Mar 28, 2024
2 parents 9f56ef8 + fa972ab commit fdf1f2f
Show file tree
Hide file tree
Showing 7 changed files with 1,964 additions and 148 deletions.
1,758 changes: 1,758 additions & 0 deletions tests/cache/pcgamer.com.skyrim.html

Large diffs are not rendered by default.

234 changes: 131 additions & 103 deletions tests/realworld_tests.py

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,13 @@ def test_formatting():
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
# TXT: newline problem here
assert my_result == 'bold, italics, tt,\ndeleted, underlined, link and additional text to bypass detection.'
assert my_result == 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.'

my_result = extract(copy(my_document), no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'

my_result = extract(copy(my_document), no_fallback=True, include_links=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, [link](test.html) and additional text to bypass detection.'

my_result = extract(copy(my_document), output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result
Expand Down
10 changes: 7 additions & 3 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import warnings
from copy import deepcopy

from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags
from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags, tostring
from lxml.html import tostring

# own
Expand Down Expand Up @@ -256,7 +256,7 @@ def handle_other_elements(element, potential_tags, options):
if element.tag == 'div':
# make a copy and prune it in case it contains sub-elements handled on their own?
# divcopy = deepcopy(element)
processed_element = handle_textnode(element, options, comments_fix=False)
processed_element = handle_textnode(element, options, comments_fix=False, preserve_spaces=True)
if processed_element is not None and text_chars_test(processed_element.text) is True:
processed_element.attrib.clear()
# small div-correction # could be moved elsewhere
Expand Down Expand Up @@ -1019,6 +1019,9 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
document.text = xmltotxt(postbody, include_formatting)
if include_comments is True:
document.comments = xmltotxt(commentsbody, include_formatting)
document.commentsbody = commentsbody
document.raw_text = document.text
document.body = postbody
else:
document.raw_text, document.body, document.commentsbody = temp_text, postbody, commentsbody
if as_dict is True:
Expand Down Expand Up @@ -1123,7 +1126,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
# add record ID to metadata
document.id = record_id
# calculate fingerprint
document.fingerprint = content_fingerprint(str(document.title) + " " + document.raw_text)
if document.raw_text is not None:
document.fingerprint = content_fingerprint(str(document.title) + " " + str(document.raw_text))

# return
return determine_returnstring(document, output_format, include_formatting, tei_validation)
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,8 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
return None
# lb bypass
if comments_fix is False and element.tag == 'lb':
element.tail = trim(element.tail)
if preserve_spaces is False:
element.tail = trim(element.tail)
# if textfilter(element) is True:
# return None
# duplicate_test(subelement)?
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,8 @@ def line_processing(line, preserve_space=False, trailing_space=False):
if all(map(str.isspace, new_line)):
new_line = None
elif trailing_space:
space_before = " " if line[0] == " " else ""
space_after = " " if line[-1] == " " else ""
space_before = " " if line[0].isspace() else ""
space_after = " " if line[-1].isspace() else ""
new_line = "".join([space_before, new_line, space_after])
return new_line

Expand Down
94 changes: 57 additions & 37 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
'item': '\n- ',
**{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']}
}
SPECIAL_FORMATTING = {'del', 'head', 'hi'}
SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
WITH_ATTRIBUTES = {'cell', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}

NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
Expand Down Expand Up @@ -211,7 +211,8 @@ def validate_tei(xmldoc): # , filename=""


def replace_element_text(element, include_formatting):
'''Determine element text based on text and tail'''
'''Determine element text based on **just the text** of the element. You must deal with the tail separately.'''
elem_text = element.text
# handle formatting: convert to markdown
if include_formatting is True and element.text is not None:
if element.tag in ('del', 'head'):
Expand All @@ -220,31 +221,31 @@ def replace_element_text(element, include_formatting):
number = int(element.get('rend')[1])
except (TypeError, ValueError):
number = 2
element.text = f'{"#" * number} {element.text}'
elem_text = f'{"#" * number} {elem_text}'
elif element.tag == 'del':
element.text = f'~~{element.text}~~'
elem_text = f'~~{elem_text}~~'
elif element.tag == 'hi':
rend = element.get('rend')
if rend in HI_FORMATTING:
element.text = f'{HI_FORMATTING[rend]}{element.text}{HI_FORMATTING[rend]}'
elem_text = f'{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}'
elif element.tag == 'code':
if '\n' in element.text:
element.text = f'```\n{element.text}\n```'
elem_text = f'```\n{elem_text}\n```'
else:
element.text = f'`{element.text}`'
elem_text = f'`{elem_text}`'
# handle links
if element.tag == 'ref':
if element.text is not None:
link_text = f'[{element.text}]'
if elem_text is not None:
link_text = f'[{elem_text}]'
if element.get('target') is not None:
element.text = f"{link_text}({element.get('target')})"
elem_text = f"{link_text}({element.get('target')})"
else:
LOGGER.warning('missing link attribute: %s %s', element.text, element.attrib)
element.text = link_text
LOGGER.warning('missing link attribute: %s %s', elem_text, element.attrib)
elem_text = link_text
else:
LOGGER.warning('empty link: %s %s', element.text, element.attrib)
LOGGER.warning('empty link: %s %s', elem_text, element.attrib)
# handle text
return (element.text or '') + (element.tail or '')
return (elem_text or '')


def merge_with_parent(element, include_formatting=False):
Expand All @@ -254,6 +255,8 @@ def merge_with_parent(element, include_formatting=False):
return

full_text = replace_element_text(element, include_formatting)
if element.tail is not None:
full_text = f'{full_text}{element.tail}'

previous = element.getprevious()
if previous is not None:
Expand All @@ -269,32 +272,49 @@ def merge_with_parent(element, include_formatting=False):
parent.remove(element)


def process_element(element, returnlist, include_formatting):
    '''Recursively flatten an element and its descendants into text fragments.

    Fragments are appended to ``returnlist`` in document order: the text of
    the element, everything produced by its children, the separator that
    belongs to its closing tag and finally its tail.

    Args:
        element: tree element exposing ``tag``, ``text``, ``tail``, ``get()``
            and iteration over its children.
        returnlist: list of strings, extended in place.
        include_formatting: passed on to ``replace_element_text``.

    Returns:
        None, the output is accumulated in ``returnlist``.
    '''
    # text located before the first child
    if element.text is not None:
        returnlist.append(replace_element_text(element, include_formatting))

    # children come before the end-tag logic of their parent
    for child in element:
        process_element(child, returnlist, include_formatting)

    # images: render them even if a tail follows, otherwise an image
    # directly followed by text would be missing from the output
    if element.tag == 'graphic' and element.text is None:
        # add source, default to ''
        description = f'{element.get("title", "")} {element.get("alt", "")}'
        returnlist.extend(['![', description.strip(), ']', '(', element.get('src', ''), ')'])

    # textless elements: only a newline for selected tags, nothing else to do
    if element.text is None and element.tail is None:
        if element.tag in ('graphic', 'row', 'table'):
            returnlist.append('\n')
        return

    # separator emitted at the position of the closing tag
    if element.tag in NEWLINE_ELEMS:
        returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
    elif element.tag == 'comments':
        returnlist.append('\n\n')
    elif element.tag not in SPECIAL_FORMATTING:
        LOGGER.debug('unprocessed element in output: %s', element.tag)
        returnlist.append(' ')

    # the tail comes after the closing tag, hence after any separator
    if element.tail is not None:
        returnlist.append(element.tail)


def xmltotxt(xmloutput, include_formatting):
'''Convert to plain text format and optionally preserve formatting as markdown.'''
returnlist = []
# strip_tags(xmloutput, 'div', 'main', 'span')
# iterate and convert to list of strings
for element in xmloutput.iter('*'):
if element.text is None and element.tail is None:
if element.tag == 'graphic':
# add source, default to ''
text = f'{element.get("title", "")} {element.get("alt", "")}'
returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
# newlines for textless elements
if element.tag in ('graphic', 'row', 'table'):
returnlist.append('\n')
continue
# process text
textelement = replace_element_text(element, include_formatting)
# common elements
if element.tag in NEWLINE_ELEMS:
returnlist.extend([NEWLINE_ELEMS[element.tag], textelement, '\n'])
elif element.tag == 'comments':
returnlist.append('\n\n')
else:
if element.tag not in SPECIAL_FORMATTING:
LOGGER.debug('unprocessed element in output: %s', element.tag)
returnlist.extend([textelement, ' '])

process_element(xmloutput, returnlist, include_formatting)

return unescape(sanitize(''.join(returnlist)))


Expand Down

0 comments on commit fdf1f2f

Please sign in to comment.