diff --git a/setup.py b/setup.py index 9149ca7d..54e93ffa 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def get_long_description(): "justext >= 3.0.0", # see tests on Github Actions "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'", - "lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'", + "lxml >= 4.9.4, < 6; platform_system != 'Darwin' or python_version > '3.8'", "urllib3 >= 1.26, < 2; python_version < '3.7'", "urllib3 >= 1.26, < 3; python_version >= '3.7'", ], diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 8a6d6b3b..9847afc0 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -114,13 +114,27 @@ def test_trim(): def test_input(): '''test if loaded strings/trees are handled properly''' - assert utils.is_dubious_html('This is a string.') is True - htmlstring = "\n" + assert utils.is_dubious_html("This is a string.") is True + + htmlstring = "\n" beginning = htmlstring[:50].lower() - assert utils.strip_faulty_doctypes(htmlstring, beginning) == "\n" + assert utils.repair_faulty_html(htmlstring, beginning) == "\n" + htmlstring = "\n" beginning = htmlstring[:50].lower() - assert utils.strip_faulty_doctypes(htmlstring, beginning) == htmlstring + assert utils.repair_faulty_html(htmlstring, beginning) == htmlstring + + htmlstring = "\n" + beginning = htmlstring[:50].lower() + assert utils.repair_faulty_html(htmlstring, beginning) == "\n" + + htmlstring = '\n\n\n\n' + beginning = htmlstring[:50].lower() + assert ( + utils.repair_faulty_html(htmlstring, beginning) + == '\n\n\n\n' + ) + with pytest.raises(TypeError) as err: assert utils.load_html(123) is None assert 'incompatible' in str(err.value) diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py index 57c32166..1efdc592 100644 --- a/tests/xml_tei_tests.py +++ b/tests/xml_tei_tests.py @@ -316,9 +316,9 @@ def test_ab_with_p_parent_resolved(): """ ) cleaned = check_tei(xml_doc, "fake_url") - result = [(elem.tag, elem.text, elem.tail) for 
elem in xml_doc.iter(["p", "ab"])] + result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])] assert result == [ - ("p", "text1", None), + ("p", "text1", ""), ("ab", "text2", None), ("p", "text3", None), ("ab", "text4", None), @@ -339,10 +339,10 @@ def test_ab_with_p_parent_resolved(): """ ) cleaned = check_tei(xml_doc, "fake_url") - result = [(elem.tag, elem.text, elem.tail) for elem in xml_doc.iter(["p", "ab"])] + result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])] assert result == [ - ("p", "text0", None), - ("ab", "text1", None), + ("p", "text0", ""), + ("ab", "text1", ""), ("p", None, None), ("ab", "text3", None), ("p", "text4", None), diff --git a/trafilatura/utils.py b/trafilatura/utils.py index 58b007ef..f0c451b8 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -39,6 +39,7 @@ UNICODE_ALIASES = {'utf-8', 'utf_8'} DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I) +FAULTY_HTML = re.compile(r"(", re.I) # note: htmldate could use HTML comments # huge_tree=True, remove_blank_text=True @@ -168,12 +169,19 @@ def is_dubious_html(beginning: str) -> bool: return "html" not in beginning -def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str: - "Repair faulty doctype strings to make then palatable for libxml2." +def repair_faulty_html(htmlstring: str, beginning: str) -> str: + "Repair faulty HTML strings to make them palatable for libxml2." 
# libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915 if "doctype" in beginning: firstline, _, rest = htmlstring.partition("\n") - return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest + htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest + # other issue with malformed documents: check first three lines + for i, line in enumerate(iter(htmlstring.splitlines())): + if ""): + htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1) + break + if i > 2: + break return htmlstring @@ -181,9 +189,9 @@ def fromstring_bytes(htmlobject): "Try to pass bytes to LXML parser." tree = None try: - tree = fromstring(htmlobject.encode('utf8', 'surrogatepass'), parser=HTML_PARSER) + tree = fromstring(htmlobject.encode("utf8", "surrogatepass"), parser=HTML_PARSER) except Exception as err: - LOGGER.error('lxml parser bytestring %s', err) + LOGGER.error("lxml parser bytestring %s", err) return tree @@ -195,11 +203,11 @@ def load_html(htmlobject): if isinstance(htmlobject, HtmlElement): return htmlobject # use trafilatura or urllib3 responses directly - if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, 'data'): + if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, "data"): htmlobject = htmlobject.data # do not accept any other type after this point if not isinstance(htmlobject, (bytes, str)): - raise TypeError('incompatible input type', type(htmlobject)) + raise TypeError("incompatible input type", type(htmlobject)) # start processing tree = None # try to guess encoding and decode file: if None then keep original @@ -208,7 +216,7 @@ def load_html(htmlobject): beginning = htmlobject[:50].lower() check_flag = is_dubious_html(beginning) # repair first - htmlobject = strip_faulty_doctypes(htmlobject, beginning) + htmlobject = repair_faulty_html(htmlobject, beginning) # first pass: use Unicode string fallback_parse = False try: @@ -217,15 +225,17 @@ def load_html(htmlobject): # "Unicode strings with encoding declaration are not 
supported." tree = fromstring_bytes(htmlobject) fallback_parse = True - except Exception as err: - LOGGER.error('lxml parsing failed: %s', err) + except Exception as err: # pragma: no cover + LOGGER.error("lxml parsing failed: %s", err) # second pass: try passing bytes to LXML if (tree is None or len(tree) < 1) and not fallback_parse: tree = fromstring_bytes(htmlobject) # rejection test: is it (well-formed) HTML at all? # log parsing errors if tree is not None and check_flag is True and len(tree) < 2: - LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree)) + LOGGER.error( + "parsed tree length: %s, wrong data type or not valid HTML", len(tree) + ) tree = None return tree diff --git a/trafilatura/xml.py b/trafilatura/xml.py index e3448edc..29750b7d 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -246,7 +246,7 @@ def replace_element_text(element, include_formatting): def merge_with_parent(element, include_formatting=False): '''Merge element with its parent and convert formatting to markdown.''' parent = element.getparent() - if not parent: + if parent is None: return full_text = replace_element_text(element, include_formatting) @@ -491,16 +491,28 @@ def _wrap_unwanted_siblings_of_div(div_element): def _move_element_one_level_up(element): + """ + Fix TEI compatibility issues by moving certain p-elems up in the XML tree. 
+ There is always an n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p + """ parent = element.getparent() + grand_parent = parent.getparent() + new_elem = Element("p") new_elem.extend(sibling for sibling in element.itersiblings()) - parent.addnext(element) + grand_parent.insert(grand_parent.index(parent) + 1, element) - if element.tail is not None and element.tail.strip(): + if element.tail and element.tail.strip(): new_elem.text = element.tail.strip() element.tail = None - if len(new_elem) != 0 or new_elem.text: - element.addnext(new_elem) + + if parent.tail and parent.tail.strip(): + new_elem.tail = parent.tail.strip() + parent.tail = None + + if len(new_elem) != 0 or new_elem.text or new_elem.tail: + grand_parent.insert(grand_parent.index(element) + 1, new_elem) + if len(parent) == 0 and parent.text is None: - parent.getparent().remove(parent) + grand_parent.remove(parent)