From 36caa71fc07308b90ef42c403da0067cd3a288ba Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Wed, 20 Nov 2024 17:06:35 +0100 Subject: [PATCH] fix errors and update setup --- .github/workflows/tests.yml | 16 ++++++++-------- trafilatura/htmlprocessing.py | 15 ++++++++------- trafilatura/main_extractor.py | 4 ++-- trafilatura/readability_lxml.py | 3 ++- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f01bc9f8..f6713267 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,7 +18,7 @@ jobs: matrix: os: [ubuntu-latest] # https://github.com/actions/python-versions/blob/main/versions-manifest.json - python-version: ["3.9", "3.11"] # "3.13", "3.14-dev" + python-version: ["3.9", "3.11", "3.13"] # "3.13", "3.14-dev" env: - MINIMAL: "true" PROXY_TEST: "false" @@ -57,7 +57,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Upgrade pip - run: python -m pip install --upgrade pip setuptools wheel + run: python -m pip install --upgrade pip - name: Get pip cache dir id: pip-cache @@ -76,11 +76,11 @@ jobs: - uses: actions/checkout@v4 # only where prebuilt wheels do not exist - # - name: Install LXML dependencies - # if: ${{ matrix.python-version == '3.13-dev' }} - # run: | - # sudo apt-get update - # sudo apt-get install libxml2-dev libxslt-dev + - name: Install LXML dependencies + if: ${{ matrix.python-version == '3.13' }} + run: | + sudo apt-get update + sudo apt-get install libxml2-dev libxslt-dev - name: Install dependencies run: python -m pip install -e ".[dev]" @@ -105,7 +105,7 @@ jobs: run: python -m pip install -e ".[all]" - name: Type checking - if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }} + if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }} run: | mypy -p trafilatura diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 34b228c7..af855ee2 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None: for subelem in elem.iter("dd", "dt", "li"): # keep track of dd/dt items if subelem.tag in ("dd", "dt"): - subelem.set("rend", f"{subelem.tag}-{i}") + subelem.set("rend", f"{str(subelem.tag)}-{i}") # increment counter after
in description list if subelem.tag == "dd": i += 1 @@ -397,7 +397,7 @@ def convert_tags( convert_link(elem, base_url) if options.formatting: - for elem in tree.iter(REND_TAG_MAPPING.keys()): # type: ignore[call-overload] + for elem in tree.iter(REND_TAG_MAPPING.keys()): elem.attrib.clear() elem.set("rend", REND_TAG_MAPPING[elem.tag]) elem.tag = "hi" @@ -405,7 +405,7 @@ def convert_tags( strip_tags(tree, *REND_TAG_MAPPING.keys()) # iterate over all concerned elements - for elem in tree.iter(CONVERSIONS.keys()): # type: ignore[call-overload] + for elem in tree.iter(CONVERSIONS.keys()): CONVERSIONS[elem.tag](elem) # images if options.images: @@ -430,12 +430,13 @@ def convert_tags( def convert_to_html(tree: _Element) -> _Element: "Convert XML to simplified HTML." - for elem in tree.iter(HTML_CONVERSIONS.keys()): # type: ignore[call-overload] + for elem in tree.iter(HTML_CONVERSIONS.keys()): + conversion = HTML_CONVERSIONS[str(elem.tag)] # apply function or straight conversion - if callable(HTML_CONVERSIONS[elem.tag]): - elem.tag = HTML_CONVERSIONS[elem.tag](elem) # type: ignore[operator] + if callable(conversion): + elem.tag = conversion(elem) else: - elem.tag = HTML_CONVERSIONS[elem.tag] + elem.tag = conversion # type: ignore[assignment] # handle attributes if elem.tag == "a": elem.set("href", elem.attrib.pop("target", "")) diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py index 2bd8d60b..eb50338e 100644 --- a/trafilatura/main_extractor.py +++ b/trafilatura/main_extractor.py @@ -35,7 +35,7 @@ NOT_AT_THE_END = {'head', 'ref'} -def _log_event(msg: str, tag: str, text: Optional[Union[bytes, str]]) -> None: +def _log_event(msg: str, tag: Any, text: Optional[Union[bytes, str]]) -> None: "Format extraction event for debugging purposes." LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None") @@ -365,7 +365,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac # calculate maximum number of columns per row, includin colspan max_cols = 0 for tr in table_elem.iter('tr'): - max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS))) # type: ignore + max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS))) # explore sub-elements seen_header_row = False diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py index 5ccfb9f4..96742bd0 100644 --- a/trafilatura/readability_lxml.py +++ b/trafilatura/readability_lxml.py @@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float: def score_node(self, elem: HtmlElement) -> Candidate: score = self.class_weight(elem) - name = elem.tag.lower() + tag = str(elem.tag) + name = tag.lower() if name in DIV_SCORES: score += 5 elif name in BLOCK_SCORES: