From 2639b2417c6db8e4df1d4f3b42f454076f7fa140 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 30 Mar 2023 16:39:45 +0200 Subject: [PATCH] setup: prepare version 1.5.0 (#317) * prepare version 1.5.0 * complete changelog --- HISTORY.md | 18 ++++++++++++++++++ setup.py | 4 ++-- tests/eval-requirements.txt | 10 +++++----- trafilatura/__init__.py | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 69660711..2ab54191 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,24 @@ ## History / Changelog +### 1.5.0 + + +Extraction: +- fixes for metadata extraction with @felipehertzer (#295, #296), @andremacola (#282, #310), and @edkrueger (#303) +- pagetype and image urls added to metadata by @andremacola (#282, #310) +- add as_dict method to Document class with @edkrueger in #306 +- XML output fix with @knit-bee in #315 +- various smaller fixes: lists (#309), XPaths, metadata hardening + +Navigation: +- transfer URL management to courlan.UrlStore (#232, #312) +- fixes for spider module + +Maintenance: +- simplify code and extend tests +- underlying packages htmldate and courlan, update setup and docs + ### 1.4.1 diff --git a/setup.py b/setup.py index 8e0e2f48..e8877280 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def get_long_description(): "brotli", "cchardet >= 2.1.7; python_version < '3.11'", # build issue "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build - "htmldate[speed] >= 1.4.1", + "htmldate[speed] >= 1.4.2", "py3langid >= 0.2.2", "pycurl >= 7.45.2", ], @@ -110,7 +110,7 @@ def get_long_description(): "charset_normalizer >= 3.0.1; python_version < '3.7'", "charset_normalizer >= 3.1.0; python_version >= '3.7'", "courlan >= 0.9.0", - "htmldate >= 1.4.1", + "htmldate >= 1.4.2", "justext >= 3.0.0", "lxml >= 4.9.2", "urllib3 >= 1.26, < 2", diff --git a/tests/eval-requirements.txt b/tests/eval-requirements.txt index b098d426..0b35508e 100644 --- a/tests/eval-requirements.txt +++ b/tests/eval-requirements.txt @@ -1,19 +1,19 @@ -trafilatura==1.4.0 +trafilatura==1.5.0 # alternatives -beautifulsoup4==4.11.1 +beautifulsoup4==4.12.1 boilerpy3==1.0.6 #dragnet==2.0.4 # unmaintained! -goose3==3.1.12 +goose3==3.1.13 html2text==2020.1.16 html-text==0.5.2 -inscriptis==2.3.1 +inscriptis==2.3.2 justext==3.0.0 newspaper3k==0.2.8 news-please==1.5.22 readabilipy==0.2.0 readability-lxml==0.8.1 -resiliparse==0.13.7 +resiliparse==0.14.3 # additional data #jparser==0.0.20 diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 4d8059a3..1d6a47a8 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = 'GNU GPL v3+' __copyright__ = 'Copyright 2019-2023, Adrien Barbaresi' -__version__ = '1.4.1' +__version__ = '1.5.0' import logging