refactoring: add type hints #723

Merged
merged 10 commits on Oct 22, 2024
3 changes: 2 additions & 1 deletion tests/metadata_tests.py
@@ -271,7 +271,8 @@ def test_meta():
assert metadata.title == 'Title'

# catch errors
-assert extract_metadata('') is None
+metadata = extract_metadata('')
+assert all(getattr(metadata, a) is None for a in metadata.__slots__)
metadata = extract_metadata('<html><title></title></html>')
assert metadata.sitename is None
metadata = extract_metadata('<html><head><title>' + 'AAA'*10000 + '</title></head></html>')
1 change: 1 addition & 0 deletions tests/sitemaps_tests.py
@@ -82,6 +82,7 @@ def test_extraction():
#sitemap.handle_link(url) # (url, '0')

# safety belts
+assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', None) is False
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is False
assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml', 'ABC') is False
assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
5 changes: 5 additions & 0 deletions tests/unit_tests.py
@@ -741,6 +741,8 @@ def test_tei():

def test_htmlprocessing():
'''test html-related functions'''
+assert xml.xmltotxt(None, include_formatting=False) == ""

options = DEFAULT_OPTIONS
options.tables = True
assert trafilatura.htmlprocessing.tree_cleaning(etree.Element('html'), options) is not None
@@ -819,6 +821,7 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=False, output_format='xml', config=ZERO_CONFIG) is not None
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
+assert extract(my_html, target_language='de', no_fallback=True, config=ZERO_CONFIG) is None
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
# assert extract(my_html) is None

@@ -1383,6 +1386,8 @@ def test_is_probably_readerable():
"""
Test is_probably_readerable function.
"""
+assert not is_probably_readerable("ABC")

very_small_str = "hello there"
small_str = "hello there " * 11
large_str = "hello there " * 12
14 changes: 8 additions & 6 deletions trafilatura/baseline.py
@@ -8,13 +8,14 @@
from typing import Any, Tuple

from lxml.etree import _Element, Element, SubElement
+from lxml.html import HtmlElement

from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim
from .xml import delete_element


-def basic_cleaning(tree: _Element) -> _Element:
+def basic_cleaning(tree: HtmlElement) -> HtmlElement:
"Remove a few section types from the document."
for elem in BASIC_CLEAN_XPATH(tree):
delete_element(elem)
@@ -62,7 +63,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
# scrape from article tag
temp_text = ""
for article_elem in tree.iterfind('.//article'):
-text = trim(article_elem.text_content())
+text = trim(article_elem.text_content()) or ""
if len(text) > 100:
SubElement(postbody, 'p').text = text
temp_text += " " + text if temp_text else text
@@ -75,7 +76,7 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
temp_text = ""
# postbody = Element('body')
for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
-entry = trim(element.text_content())
+entry = trim(element.text_content()) or ""
if entry not in results:
SubElement(postbody, 'p').text = entry
temp_text += " " + entry if temp_text else entry
@@ -88,10 +89,11 @@ def baseline(filecontent: Any) -> Tuple[_Element, str, int]:
postbody = Element('body')
body_elem = tree.find('.//body')
if body_elem is not None:
-elem = SubElement(postbody, 'p')
+p_elem = SubElement(postbody, 'p')
# todo: sanitize?
-elem.text = '\n'.join([trim(e) for e in body_elem.itertext()])
-return postbody, elem.text, len(elem.text)
+text_elems = [trim(e) for e in body_elem.itertext()]
+p_elem.text = '\n'.join([e for e in text_elems if e])
+return postbody, p_elem.text, len(p_elem.text)

# new fallback
text = html2txt(tree, clean=False)
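
The edits in this file follow one defensive pattern: trim() is treated as possibly returning None, so its result is coerced to an empty string or filtered out before length checks and string joins. A small self-contained sketch of that pattern, using a stand-in trim() purely for illustration:

from typing import Optional

def trim(text: Optional[str]) -> Optional[str]:
    # hypothetical stand-in for trafilatura.utils.trim, treated here as possibly returning None
    return text.strip() if text and text.strip() else None

# coerce a possible None to an empty string before measuring length, as in baseline()
text = trim(None) or ""
assert len(text) == 0

# filter out falsy entries so '\n'.join never receives None
parts = [trim(p) for p in ("  a  ", None, "b")]
assert "\n".join(p for p in parts if p) == "a\nb"
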
5 changes: 3 additions & 2 deletions trafilatura/cli.py
@@ -16,10 +16,11 @@
url_processing_pipeline, write_result)
from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI


# fix output encoding on some systems
-if sys.stdout.encoding != 'UTF-8':
+if sys.stdout.encoding != 'UTF-8' and hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
-if sys.stderr.encoding != 'UTF-8':
+if sys.stderr.encoding != 'UTF-8' and hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
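
The added hasattr checks guard against streams that do not support reconfigure(): sys.stdout and sys.stderr can be replaced, for instance by an io.StringIO when output is captured, and such objects would raise AttributeError on the call. A quick illustration:

import io
import sys

# reconfigure() exists on regular text streams (io.TextIOWrapper, Python 3.7+),
# but not on replacement streams such as io.StringIO, hence the guard
print(hasattr(sys.stdout, "reconfigure"))     # usually True for a real console stream
print(hasattr(io.StringIO(), "reconfigure"))  # False
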


6 changes: 3 additions & 3 deletions trafilatura/cli_utils.py
@@ -29,7 +29,7 @@
from .baseline import html2txt
from .core import extract
from .deduplication import generate_bow_hash
-from .downloads import add_to_compressed_dict, buffered_downloads, load_download_buffer
+from .downloads import Response, add_to_compressed_dict, buffered_downloads, load_download_buffer
from .feeds import find_feed_urls
from .meta import reset_caches
from .settings import (
@@ -272,7 +272,7 @@ def download_queue_processing(
bufferlist, args.parallel, options=options
):
# handle result
-if result:
+if result and isinstance(result, str):
options.url = url
counter = process_result(result, args, counter, options)
else:
@@ -380,7 +380,7 @@ def cli_crawler(
for url, result in buffered_downloads(
bufferlist, args.parallel, decode=False, options=options
):
-if result is not None:
+if result and isinstance(result, Response):
spider.process_response(result, param_dict[get_base_url(url)])
# early exit if maximum count is reached
if any(c >= n for c in spider.URL_STORE.get_all_counts()):
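
For context, both call sites consume results from buffered_downloads, which appears to yield decoded text by default and raw Response objects when called with decode=False; the new isinstance checks narrow the result type before routing it. A minimal sketch of that narrowing, with a placeholder Response class standing in for trafilatura.downloads.Response:

from typing import Union

class Response:
    # placeholder for trafilatura.downloads.Response, used for illustration only
    data: bytes = b""

def route(result: Union[str, Response, None]) -> str:
    # mirrors the isinstance narrowing added in download_queue_processing and cli_crawler
    if result and isinstance(result, str):
        return "decoded HTML: pass to process_result()"
    if result and isinstance(result, Response):
        return "raw response: pass to spider.process_response()"
    return "no usable result: log an error"

assert route("<html/>").startswith("decoded")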