diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ca82b852..e990ae57 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
-        python-version: ["3.9", "3.11"]  # "3.13", "3.14-dev"
+        python-version: ["3.9", "3.11", "3.13"]  # "3.14-dev"
         env:
           - MINIMAL: "true"
             PROXY_TEST: "false"
@@ -57,7 +57,7 @@ jobs:
           python-version: ${{ matrix.python-version }}

       - name: Upgrade pip
-        run: python -m pip install --upgrade pip setuptools wheel
+        run: python -m pip install --upgrade pip

       - name: Get pip cache dir
         id: pip-cache
@@ -75,35 +75,32 @@ jobs:
       # package setup
       - uses: actions/checkout@v4

-      # only where prebuilt wheels do not exist
-      # - name: Install LXML dependencies
-      #   if: ${{ matrix.python-version == '3.13-dev' }}
-      #   run: |
-      #     sudo apt-get update
-      #     sudo apt-get install libxml2-dev libxslt-dev
-
       - name: Install dependencies
         run: python -m pip install -e ".[dev]"

+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
       # pycurl installation fix
       - name: Install packages required by pycurl
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-gnutls-dev libgnutls28-dev
           # alternatively: sudo apt-get install libcurl4-openssl-dev libssl-dev

       - name: Install full dependencies
-        if: ${{ matrix.env.MINIMAL == 'false'}}
+        if: ${{ matrix.env.MINIMAL == 'false' }}
         run: python -m pip install -e ".[all]"

-      # tests
-      - name: Lint with flake8
+      - name: Type checking
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         run: |
-          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          mypy -p trafilatura

       - name: Test with pytest
         run: |
@@ -113,7 +110,7 @@ jobs:

       # coverage
       - name: Upload coverage to Codecov
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
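The reordered workflow lints right after the minimal install, so syntax-level breakage fails fast before the heavier dependencies are set up, and it adds a separate mypy gate restricted to the full install on the newest interpreter in the matrix. The two gates reject different classes of errors; a hypothetical snippet (not from the codebase) illustrating what each one catches:

```python
def fetch_page(url: str) -> bytes:
    return respons  # F821 undefined name: rejected by the strict flake8 pass (E9, F63, F7, F82)


def decode_page(data: bytes) -> str:
    return data  # rejected by `mypy -p trafilatura`: returns "bytes" where "str" is declared
```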
diff --git a/MANIFEST.in b/MANIFEST.in
index c4c8c161..91ba57d5 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 include CITATION.cff CONTRIBUTING.md HISTORY.md README.rst LICENSE
 graft trafilatura/data/
 include trafilatura/settings.cfg
+include trafilatura/py.typed

 include tests/__init__.py
 include tests/*test*.py
diff --git a/pyproject.toml b/pyproject.toml
index a08a5943..0d352adc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,8 +94,11 @@ trafilatura = "trafilatura.cli:main"
 [project.optional-dependencies]
 dev = [
     "flake8",
+    "mypy",
     "pytest",
     "pytest-cov",
+    "types-lxml",
+    "types-urllib3",
 ]
 all = [
     "brotli",
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 2bae7fe7..9475b0d7 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -15,7 +15,6 @@
 from time import sleep
 from typing import (
     Any,
-    ByteString,
     Callable,
     Dict,
     Generator,
@@ -73,7 +72,7 @@ def create_pool(**args: Any) -> Union[urllib3.PoolManager, Any]:
     return manager_class(**manager_args, **args)  # type: ignore[arg-type]


-DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
+DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)  # type: ignore[no-untyped-call]
 USER_AGENT = (
     "trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
 )
@@ -106,7 +105,7 @@ class Response:
     "Store information gathered in a HTTP response object."
     __slots__ = ["data", "headers", "html", "status", "url"]

-    def __init__(self, data: ByteString, status: int, url: str) -> None:
+    def __init__(self, data: bytes, status: int, url: str) -> None:
         self.data = data
         self.headers: Optional[Dict[str, str]] = None
         self.html: Optional[str] = None
@@ -332,14 +331,14 @@ def _pycurl_is_live_page(url: str) -> bool:
     curl.setopt(pycurl.SSL_VERIFYPEER, 0)
     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
     # Set option to avoid getting the response body
-    curl.setopt(curl.NOBODY, True)  # type: ignore[attr-defined]
+    curl.setopt(curl.NOBODY, True)
     if PROXY_URL:
         curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
     # Perform the request
     try:
         curl.perform()
         # Get the response code
-        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400  # type: ignore[attr-defined]
+        page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
     except pycurl.error as err:
         LOGGER.debug("pycurl HEAD error: %s %s", url, err)
         page_exists = False
@@ -503,7 +502,7 @@ def _send_pycurl_request(
     # ip_info = curl.getinfo(curl.PRIMARY_IP)

     resp = Response(
-        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)  # type: ignore[attr-defined]
+        bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL)
     )
     curl.close()
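Two related changes run through this part of the patch: `typing.ByteString` is deprecated and removed in Python 3.14, so plain `bytes` is the forward-compatible annotation, and the new empty `py.typed` marker (PEP 561), shipped via MANIFEST.in, tells downstream type checkers to trust the package's inline annotations. A minimal sketch of what a consumer can then get checked, reusing the `Response` signature from the hunk above:

```python
from trafilatura.downloads import Response

ok = Response(b"<html></html>", 200, "https://example.org")  # data is bytes: accepted
bad = Response("<html></html>", 200, "https://example.org")  # flagged by mypy: argument 1 has incompatible type "str"
```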
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 72c45741..49801869 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -103,7 +103,7 @@ def compare_extraction(tree: HtmlElement, backup_tree: HtmlElement, body: _Element,

     # post-processing: remove unwanted sections
     if use_readability and not jt_result:
-        body, text, len_text = sanitize_tree(body, options)
+        body, text, len_text = sanitize_tree(body, options)  # type: ignore[arg-type]

     return body, text, len_text
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 34b228c7..af855ee2 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
     for subelem in elem.iter("dd", "dt", "li"):
         # keep track of dd/dt items
         if subelem.tag in ("dd", "dt"):
-            subelem.set("rend", f"{subelem.tag}-{i}")
+            subelem.set("rend", f"{str(subelem.tag)}-{i}")
             # increment counter after
elements if nothing found or text too short
     # todo: test precision and recall settings here
-    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:
+    if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:  # type: ignore[attr-defined]
         result_body = recover_wild_text(backup_tree, result_body, options, potential_tags)
         temp_text = ' '.join(result_body.itertext()).strip()
     # filter output
@@ -632,7 +632,7 @@ def extract_content(cleaned_tree: HtmlElement, options: Any) -> Tuple[_Element, str, int]:
     return result_body, temp_text, len(temp_text)


-def process_comments_node(elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
+def process_comments_node(elem: _Element, potential_tags: Set[str], options: Extractor) -> Optional[_Element]:
     '''Process comment node and determine how to deal with its content'''
     if elem.tag in potential_tags:
         # print(elem.tag, elem.text_content())
@@ -646,7 +646,7 @@ def process_comments_node(elem: _Element, potential_tags: Any, options: Any) -> Optional[_Element]:
     return None


-def extract_comments(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int, HtmlElement]:
+def extract_comments(tree: HtmlElement, options: Extractor) -> Tuple[_Element, str, int, HtmlElement]:
     "Try to extract comments out of potential sections in the HTML."
     comments_body = Element("body")
     # define iteration strategy
@@ -668,7 +668,7 @@ def extract_comments(tree: HtmlElement, options: Any) -> Tuple[_Element, str, int, HtmlElement]:
             #     comments_body.append(processed_elem)
             # processed_elems = (process_comments_node(elem, potential_tags, options) for elem in
             #                    subtree.xpath('.//*'))
-            comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))
+            comments_body.extend(filter(lambda x: x is not None, (process_comments_node(e, potential_tags, options) for e in subtree.xpath(".//*"))))  # type: ignore[arg-type]
             # control
             if len(comments_body) > 0:  # if it has children
                 LOGGER.debug(expr)
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index 09dbd2dd..f6fe6d8c 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -38,6 +38,7 @@
     TITLE_XPATHS,
 )

+__all__ = ["Document"]

 LOGGER = logging.getLogger(__name__)
 logging.getLogger("htmldate").setLevel(logging.WARNING)
@@ -309,7 +310,8 @@ def examine_meta(tree: HtmlElement) -> Document:
     # backups
     metadata.sitename = metadata.sitename or backup_sitename
     # copy
-    metadata.set_attributes(tags=tags)
+    metadata.tags = tags
+    # metadata.set_attributes(tags=tags)

     return metadata
diff --git a/trafilatura/py.typed b/trafilatura/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 5ccfb9f4..96742bd0 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float:

     def score_node(self, elem: HtmlElement) -> Candidate:
         score = self.class_weight(elem)
-        name = elem.tag.lower()
+        tag = str(elem.tag)
+        name = tag.lower()
         if name in DIV_SCORES:
             score += 5
         elif name in BLOCK_SCORES:
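The `str(elem.tag)` wrappers here and in htmlprocessing.py exist because lxml cannot promise that `.tag` is a plain string: comment and processing-instruction nodes report a factory function as their tag, which is how the type stubs annotate the attribute. Wrapping in `str()` satisfies mypy and is a no-op for regular elements; a quick runtime illustration:

```python
from lxml import html

tree = html.fromstring("<div><!-- ad --><p>text</p></div>")
for elem in tree.iter():
    # regular elements print "div" / "p"; the comment node's tag is a function, not a str
    print(type(elem.tag).__name__, str(elem.tag))
```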
        doc = cls()
         for key, value in data.items():
             setattr(doc, key, value)
         return doc

-    def set_attributes(self, **kwargs: Optional[Dict[str, Any]]) -> None:
-        "Helper function to (re-)set a series of attributes."
-        for key, value in kwargs.items():
-            if value:
-                setattr(self, key, value)
-
     def clean_and_trim(self) -> None:
         "Limit text length and trim the attributes."
         for slot in self.__slots__:
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index fce29955..aae37d7f 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -32,7 +32,7 @@
     HAS_BROTLI = False

 try:
-    import zstandard  # type: ignore
+    import zstandard
     HAS_ZSTD = True
 except ImportError:
     HAS_ZSTD = False
@@ -114,7 +114,7 @@ def handle_compressed_file(filecontent: bytes) -> bytes:
     # try brotli
     if HAS_BROTLI:
         try:
-            return brotli.decompress(filecontent)
+            return brotli.decompress(filecontent)  # type: ignore[no-any-return]
         except brotli.error:
             pass  # logging.debug('invalid Brotli file')
     # try zlib/deflate
@@ -408,7 +408,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
     else:
         LOGGER.warning('Language detector not installed, skipping detection')
         result = None
-    return result
+    return result  # type: ignore[no-any-return]


 def language_filter(temp_text: str, temp_comments: str, target_language: str, docmeta: Any) -> Tuple[bool, Any]:
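With `set_attributes` removed (plain assignment in metadata.py replaces it), `Document.from_dict` is the one remaining bulk setter, and it now returns a concrete `Document` instead of `Any`, so attribute access after the call is type-checked. A short usage sketch; `title` and `tags` are assumed to be among the `Document` slots, as the metadata code above suggests:

```python
from trafilatura.settings import Document

doc = Document.from_dict({"title": "Example", "tags": ["news"]})  # typed as "Document", not "Any"
doc.tags = ["updated"]  # the removed helper becomes a plain assignment, as in examine_meta()
print(doc.title, doc.tags)
```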