Merge branch 'master' into better_xpaths

adbar · Mar 28, 2024 · fdf1f2f · fdf1f2f
2 parents 9f56ef8 + fa972ab
commit fdf1f2f
Show file tree

Hide file tree

Showing 7 changed files with 1,964 additions and 148 deletions.
diff --git a/tests/cache/pcgamer.com.skyrim.html b/tests/cache/pcgamer.com.skyrim.html
diff --git a/tests/realworld_tests.py b/tests/realworld_tests.py
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -353,8 +353,13 @@ def test_formatting():
     # XML and Markdown formatting within <p>-tag
     my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
     my_result = extract(copy(my_document), no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
-    # TXT: newline problem here
-    assert my_result == 'bold, italics, tt,\ndeleted, underlined, link and additional text to bypass detection.'
+    assert my_result == 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.'
+
+    my_result = extract(copy(my_document), no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
+    assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
+
+    my_result = extract(copy(my_document), no_fallback=True, include_links=True, include_formatting=True, config=ZERO_CONFIG)
+    assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, [link](test.html) and additional text to bypass detection.'
 
     my_result = extract(copy(my_document), output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -8,7 +8,7 @@
 import warnings
 from copy import deepcopy
 
-from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags
+from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags, tostring
 from lxml.html import tostring
 
 # own
@@ -256,7 +256,7 @@ def handle_other_elements(element, potential_tags, options):
     if element.tag == 'div':
         # make a copy and prune it in case it contains sub-elements handled on their own?
         # divcopy = deepcopy(element)
-        processed_element = handle_textnode(element, options, comments_fix=False)
+        processed_element = handle_textnode(element, options, comments_fix=False, preserve_spaces=True)
         if processed_element is not None and text_chars_test(processed_element.text) is True:
             processed_element.attrib.clear()
             # small div-correction # could be moved elsewhere
@@ -1019,6 +1019,9 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         document.text = xmltotxt(postbody, include_formatting)
         if include_comments is True:
             document.comments = xmltotxt(commentsbody, include_formatting)
+            document.commentsbody = commentsbody
+        document.raw_text = document.text
+        document.body = postbody
     else:
         document.raw_text, document.body, document.commentsbody = temp_text, postbody, commentsbody
     if as_dict is True:
@@ -1123,7 +1126,8 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
         # add record ID to metadata
         document.id = record_id
         # calculate fingerprint
-        document.fingerprint = content_fingerprint(str(document.title) + " " + document.raw_text)
+        if document.raw_text is not None:
+            document.fingerprint = content_fingerprint(str(document.title) + " " + str(document.raw_text))
 
     # return
     return determine_returnstring(document, output_format, include_formatting, tei_validation)

diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
@@ -312,7 +312,8 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False):
         return None
     # lb bypass
     if comments_fix is False and element.tag == 'lb':
-        element.tail = trim(element.tail)
+        if preserve_spaces is False:
+            element.tail = trim(element.tail)
         # if textfilter(element) is True:
         #     return None
         # duplicate_test(subelement)?

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -268,8 +268,8 @@ def line_processing(line, preserve_space=False, trailing_space=False):
         if all(map(str.isspace, new_line)):
             new_line = None
         elif trailing_space:
-            space_before = " " if line[0] == " " else ""
-            space_after = " " if line[-1] == " " else ""
+            space_before = " " if line[0].isspace() else ""
+            space_after = " " if line[-1].isspace() else ""
             new_line = "".join([space_before, new_line, space_after])
     return new_line
 

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -44,7 +44,7 @@
     'item': '\n- ',
     **{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']}
 }
-SPECIAL_FORMATTING = {'del', 'head', 'hi'}
+SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
 WITH_ATTRIBUTES = {'cell', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
 
 NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
@@ -211,7 +211,8 @@ def validate_tei(xmldoc):  # , filename=""
 
 
 def replace_element_text(element, include_formatting):
-    '''Determine element text based on text and tail'''
+    '''Determine element text based on **just the text** of the element. You must deal with the tail separately.'''
+    elem_text = element.text
     # handle formatting: convert to markdown
     if include_formatting is True and element.text is not None:
         if element.tag in ('del', 'head'):
@@ -220,31 +221,31 @@ def replace_element_text(element, include_formatting):
                     number = int(element.get('rend')[1])
                 except (TypeError, ValueError):
                     number = 2
-                element.text = f'{"#" * number} {element.text}'
+                elem_text = f'{"#" * number} {elem_text}'
             elif element.tag == 'del':
-                element.text = f'~~{element.text}~~'
+                elem_text = f'~~{elem_text}~~'
         elif element.tag == 'hi':
             rend = element.get('rend')
             if rend in HI_FORMATTING:
-                element.text = f'{HI_FORMATTING[rend]}{element.text}{HI_FORMATTING[rend]}'
+                elem_text = f'{HI_FORMATTING[rend]}{elem_text}{HI_FORMATTING[rend]}'
         elif element.tag == 'code':
             if '\n' in element.text:
-                element.text = f'```\n{element.text}\n```'
+                elem_text = f'```\n{elem_text}\n```'
             else:
-                element.text = f'`{element.text}`'
+                elem_text = f'`{elem_text}`'
     # handle links
     if element.tag == 'ref':
-        if element.text is not None:
-            link_text = f'[{element.text}]'
+        if elem_text is not None:
+            link_text = f'[{elem_text}]'
             if element.get('target') is not None:
-                element.text = f"{link_text}({element.get('target')})"
+                elem_text = f"{link_text}({element.get('target')})"
             else:
-                LOGGER.warning('missing link attribute: %s %s', element.text, element.attrib)
-                element.text = link_text
+                LOGGER.warning('missing link attribute: %s %s', elem_text, element.attrib)
+                elem_text = link_text
         else:
-            LOGGER.warning('empty link: %s %s', element.text, element.attrib)
+            LOGGER.warning('empty link: %s %s', elem_text, element.attrib)
     # handle text
-    return (element.text or '') + (element.tail or '')
+    return (elem_text or '')
 
 
 def merge_with_parent(element, include_formatting=False):
@@ -254,6 +255,8 @@ def merge_with_parent(element, include_formatting=False):
         return
 
     full_text = replace_element_text(element, include_formatting)
+    if element.tail is not None:
+        full_text = f'{full_text}{element.tail}'
 
     previous = element.getprevious()
     if previous is not None:
@@ -269,32 +272,49 @@ def merge_with_parent(element, include_formatting=False):
     parent.remove(element)
 
 
+def process_element(element, returnlist, include_formatting):
+    '''Recursively append the text of an element, of its children and its own tail to returnlist (depth-first).'''
+    if element.text is not None:
+        # this is the text that comes before the first child (formatted by replace_element_text)
+        textelement = replace_element_text(element, include_formatting)
+        returnlist.append(textelement)
+
+    for child in element:
+        process_element(child, returnlist, include_formatting)
+
+    if element.text is None and element.tail is None:
+        if element.tag == 'graphic':
+            # image in markdown syntax: title and alt form the label, src the target (all default to '')
+            text = f'{element.get("title", "")} {element.get("alt", "")}'
+            returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
+        # newlines for textless elements
+        if element.tag in ('graphic', 'row', 'table'):
+            returnlist.append('\n')
+        return  # Nothing more to do with textless elements
+
+    # From here on the element has text and/or a tail: append what follows its closing tag
+
+    # Common elements: mapped separator plus newline, appended after the element's own text and children
+    if element.tag in NEWLINE_ELEMS:
+        returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
+    elif element.tag == 'comments':
+        returnlist.append('\n\n')
+    else:
+        if element.tag not in SPECIAL_FORMATTING:
+            LOGGER.debug('unprocessed element in output: %s', element.tag)
+            returnlist.extend([' '])
+
+    # the tail is the text that comes after the closing tag, so it has to follow any NEWLINE_ELEMS separator
+    if element.tail is not None:
+        returnlist.append(element.tail)
+
+
 def xmltotxt(xmloutput, include_formatting):
     '''Convert to plain text format and optionally preserve formatting as markdown.'''
     returnlist = []
-    # strip_tags(xmloutput, 'div', 'main', 'span')
-    # iterate and convert to list of strings
-    for element in xmloutput.iter('*'):
-        if element.text is None and element.tail is None:
-            if element.tag == 'graphic':
-                # add source, default to ''
-                text = f'{element.get("title", "")} {element.get("alt", "")}'
-                returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
-            # newlines for textless elements
-            if element.tag in ('graphic', 'row', 'table'):
-                returnlist.append('\n')
-            continue
-        # process text
-        textelement = replace_element_text(element, include_formatting)
-        # common elements
-        if element.tag in NEWLINE_ELEMS:
-            returnlist.extend([NEWLINE_ELEMS[element.tag], textelement, '\n'])
-        elif element.tag == 'comments':
-            returnlist.append('\n\n')
-        else:
-            if element.tag not in SPECIAL_FORMATTING:
-                LOGGER.debug('unprocessed element in output: %s', element.tag)
-            returnlist.extend([textelement, ' '])
+    # recursive walk from the root: process_element() fills returnlist with the text chunks of the whole tree
+    process_element(xmloutput, returnlist, include_formatting)
+
     return unescape(sanitize(''.join(returnlist)))