Skip to content

Commit

Permalink
fix errors and update setup
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Nov 20, 2024
1 parent 1cb1ed2 commit 36caa71
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 18 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
matrix:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"] # "3.13", "3.14-dev"
python-version: ["3.9", "3.11", "3.13"] # "3.13", "3.14-dev"
env:
- MINIMAL: "true"
PROXY_TEST: "false"
Expand Down Expand Up @@ -57,7 +57,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Upgrade pip
run: python -m pip install --upgrade pip setuptools wheel
run: python -m pip install --upgrade pip

- name: Get pip cache dir
id: pip-cache
Expand All @@ -76,11 +76,11 @@ jobs:
- uses: actions/checkout@v4

# only where prebuilt wheels do not exist
# - name: Install LXML dependencies
# if: ${{ matrix.python-version == '3.13-dev' }}
# run: |
# sudo apt-get update
# sudo apt-get install libxml2-dev libxslt-dev
- name: Install LXML dependencies
if: ${{ matrix.python-version == '3.13' }}
run: |
sudo apt-get update
sudo apt-get install libxml2-dev libxslt-dev
- name: Install dependencies
run: python -m pip install -e ".[dev]"
Expand All @@ -105,7 +105,7 @@ jobs:
run: python -m pip install -e ".[all]"

- name: Type checking
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.11' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
run: |
mypy -p trafilatura
Expand Down
15 changes: 8 additions & 7 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def convert_lists(elem: _Element) -> None:
for subelem in elem.iter("dd", "dt", "li"):
# keep track of dd/dt items
if subelem.tag in ("dd", "dt"):
subelem.set("rend", f"{subelem.tag}-{i}")
subelem.set("rend", f"{str(subelem.tag)}-{i}")
# increment counter after <dd> in description list
if subelem.tag == "dd":
i += 1
Expand Down Expand Up @@ -397,15 +397,15 @@ def convert_tags(
convert_link(elem, base_url)

if options.formatting:
for elem in tree.iter(REND_TAG_MAPPING.keys()): # type: ignore[call-overload]
for elem in tree.iter(REND_TAG_MAPPING.keys()):
elem.attrib.clear()
elem.set("rend", REND_TAG_MAPPING[elem.tag])
elem.tag = "hi"
else:
strip_tags(tree, *REND_TAG_MAPPING.keys())

# iterate over all concerned elements
for elem in tree.iter(CONVERSIONS.keys()): # type: ignore[call-overload]
for elem in tree.iter(CONVERSIONS.keys()):
CONVERSIONS[elem.tag](elem)
# images
if options.images:
Expand All @@ -430,12 +430,13 @@ def convert_tags(

def convert_to_html(tree: _Element) -> _Element:
"Convert XML to simplified HTML."
for elem in tree.iter(HTML_CONVERSIONS.keys()): # type: ignore[call-overload]
for elem in tree.iter(HTML_CONVERSIONS.keys()):
conversion = HTML_CONVERSIONS[str(elem.tag)]
# apply function or straight conversion
if callable(HTML_CONVERSIONS[elem.tag]):
elem.tag = HTML_CONVERSIONS[elem.tag](elem) # type: ignore[operator]
if callable(conversion):
elem.tag = conversion(elem)
else:
elem.tag = HTML_CONVERSIONS[elem.tag]
elem.tag = conversion # type: ignore[assignment]
# handle attributes
if elem.tag == "a":
elem.set("href", elem.attrib.pop("target", ""))
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
NOT_AT_THE_END = {'head', 'ref'}


def _log_event(msg: str, tag: str, text: Optional[Union[bytes, str]]) -> None:
def _log_event(msg: str, tag: Any, text: Optional[Union[bytes, str]]) -> None:
"Format extraction event for debugging purposes."
LOGGER.debug("%s: %s %s", msg, tag, trim(text or "") or "None")

Expand Down Expand Up @@ -365,7 +365,7 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
# calculate maximum number of columns per row, including colspan
max_cols = 0
for tr in table_elem.iter('tr'):
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS))) # type: ignore
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))

# explore sub-elements
seen_header_row = False
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ def class_weight(self, elem: HtmlElement) -> float:

def score_node(self, elem: HtmlElement) -> Candidate:
score = self.class_weight(elem)
name = elem.tag.lower()
tag = str(elem.tag)
name = tag.lower()
if name in DIV_SCORES:
score += 5
elif name in BLOCK_SCORES:
Expand Down

0 comments on commit 36caa71

Please sign in to comment.