From 36caa71fc07308b90ef42c403da0067cd3a288ba Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <barbaresi@bbaw.de>
Date: Wed, 20 Nov 2024 17:06:35 +0100
Subject: [PATCH] fix errors and update setup

---
 .github/workflows/tests.yml     | 16 ++++++++--------
 trafilatura/htmlprocessing.py   | 15 ++++++++-------
 trafilatura/main_extractor.py   |  4 ++--
 trafilatura/readability_lxml.py |  3 ++-
 4 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f01bc9f8..f6713267 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
-        python-version: ["3.9", "3.11"]  # "3.13", "3.14-dev"
+        python-version: ["3.9", "3.11", "3.13"]  # "3.14-dev"
         env:
           - MINIMAL: "true"
             PROXY_TEST: "false"
@@ -57,7 +57,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
 
     - name: Upgrade pip
-      run: python -m pip install --upgrade pip setuptools wheel
+      run: python -m pip install --upgrade pip
 
     - name: Get pip cache dir
       id: pip-cache
@@ -76,11 +76,11 @@ jobs:
     - uses: actions/checkout@v4
 
     # only where prebuilt wheels do not exist
-    # - name: Install LXML dependencies
-    #   if: ${{ matrix.python-version == '3.13-dev' }}
-    #   run: |
-    #     sudo apt-get update
-    #     sudo apt-get install libxml2-dev libxslt-dev
+    - name: Install LXML dependencies
+      if: ${{ matrix.python-version == '3.13' }}
+      run: |
+        sudo apt-get update
+        sudo apt-get install libxml2-dev libxslt-dev
 
     - name: Install dependencies
       run: python -m pip install -e ".[dev]"
@@ -105,7 +105,7 @@ jobs:
       run: python -m pip install -e ".[all]"
 
     - name: Type checking
-      if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
+      if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
       run: |
         mypy -p trafilatura
 
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index 34b228c7..af855ee2 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
     for subelem in elem.iter("dd", "dt", "li"):
         # keep track of dd/dt items
         if subelem.tag in ("dd", "dt"):
-            subelem.set("rend", f"{subelem.tag}-{i}")
+            subelem.set("rend", f"{str(subelem.tag)}-{i}")
             # increment counter after <dd> in description list
             if subelem.tag == "dd":
                 i += 1
@@ -397,7 +397,7 @@ def convert_tags(
             convert_link(elem, base_url)
 
     if options.formatting:
-        for elem in tree.iter(REND_TAG_MAPPING.keys()):  # type: ignore[call-overload]
+        for elem in tree.iter(REND_TAG_MAPPING.keys()):
             elem.attrib.clear()
             elem.set("rend", REND_TAG_MAPPING[elem.tag])
             elem.tag = "hi"
@@ -405,7 +405,7 @@ def convert_tags(
         strip_tags(tree, *REND_TAG_MAPPING.keys())
 
     # iterate over all concerned elements
-    for elem in tree.iter(CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(CONVERSIONS.keys()):
         CONVERSIONS[elem.tag](elem)
     # images
     if options.images:
@@ -430,12 +430,13 @@ def convert_tags(
 
 def convert_to_html(tree: _Element) -> _Element:
     "Convert XML to simplified HTML."
-    for elem in tree.iter(HTML_CONVERSIONS.keys()):  # type: ignore[call-overload]
+    for elem in tree.iter(HTML_CONVERSIONS.keys()):
+        conversion = HTML_CONVERSIONS[str(elem.tag)]
         # apply function or straight conversion
-        if callable(HTML_CONVERSIONS[elem.tag]):
-            elem.tag = HTML_CONVERSIONS[elem.tag](elem)  # type: ignore[operator]
+        if callable(conversion):
+            elem.tag = conversion(elem)
         else:
-            elem.tag = HTML_CONVERSIONS[elem.tag]
+            elem.tag = conversion  # type: ignore[assignment]
         # handle attributes
         if elem.tag == "a":
             elem.set("href", elem.attrib.pop("target", ""))
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index 2bd8d60b..eb50338e 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -35,7 +35,7 @@
 NOT_AT_THE_END = {'head', 'ref'}
 
 
-def _log_event(msg: str, tag: str, text: Optional[Union[bytes, str]]) -> None:
+def _log_event(msg: str, tag: Any, text: Optional[Union[bytes, str]]) -> None:
     "Format extraction event for debugging purposes."
     LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None")
 
@@ -365,7 +365,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
     # calculate maximum number of columns per row, includin colspan
     max_cols = 0
     for tr in table_elem.iter('tr'):
-        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))  # type: ignore
+        max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))
 
     # explore sub-elements
     seen_header_row = False
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 5ccfb9f4..96742bd0 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float:
 
     def score_node(self, elem: HtmlElement) -> Candidate:
         score = self.class_weight(elem)
-        name = elem.tag.lower()
+        tag = str(elem.tag)
+        name = tag.lower()
         if name in DIV_SCORES:
             score += 5
         elif name in BLOCK_SCORES: