From 60647e5eeff77df9ef4b7432a936f580736b3075 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 27 Jun 2024 15:59:56 +0200 Subject: [PATCH] prepare v1.11.0 (#631) * prepare v1.11.0 * update docstrings and docs * explain change * add last fix --- HISTORY.md | 23 +++++++++++++++++++++++ README.md | 9 ++++----- docs/index.rst | 9 ++++----- docs/usage-cli.rst | 9 +++++---- docs/usage-python.rst | 2 +- trafilatura/__init__.py | 2 +- trafilatura/cli.py | 6 +++--- trafilatura/core.py | 8 ++++---- 8 files changed, 45 insertions(+), 23 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6cae0a34..1cc8be5d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,29 @@ ## History / Changelog +### 1.11.0 + +Breaking change: +- metadata now skipped by default (#613), to trigger inclusion in all output formats: + - `with_metadata=True` (Python) + - `--with-metadata` (CLI) + +Extraction: +- add HTML as output format (#614) +- better and faster baseline extraction (#619) +- better handling of HTML/XML elements (#628) +- XPath rules added with @felipehertzer (#540) +- fix: avoid faulty readability_lxml content (#635) + +Evaluation: +- new scripts and data with @LydiaKoerber (#606, #615) +- additional data with @swetepete (#197) + +Maintenance: +- docs extended and updated, added page on deduplication (#618) +- review code, add tests and types in part of the submodules (#620, #623, #624, #625) + + ### 1.10.0 Breaking changes: diff --git a/README.md b/README.md index 3904b268..8c7dc89b 100644 --- a/README.md +++ b/README.md @@ -60,11 +60,10 @@ search engine optimization, and information security). 
- Optional elements: comments, links, images, tables - Multiple output formats: - - Text - - Markdown (with formatting) - - CSV (with metadata) - - JSON (with metadata) - - XML or [XML-TEI](https://tei-c.org/) (with metadata, text formatting and page structure) + - TXT and Markdown + - CSV + - JSON + - HTML, XML and [XML-TEI](https://tei-c.org/) - Optional add-ons: - Language detection on extracted content diff --git a/docs/index.rst b/docs/index.rst index 0374924d..8199245f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,11 +60,10 @@ Features - Formatting and structure: paragraphs, titles, lists, quotes, code, line breaks, in-line text formatting - Optional elements: comments, links, images, tables - Multiple output formats: - - Text - - Markdown (with formatting) - - CSV (with metadata) - - JSON (with metadata) - - XML or `XML-TEI <https://tei-c.org/>`_ (with metadata, text formatting and page structure) + - TXT and Markdown + - CSV + - JSON + - HTML, XML and `XML-TEI <https://tei-c.org/>`_ - Optional add-ons: - Language detection on extracted content - Graphical user interface (GUI) diff --git a/docs/usage-cli.rst b/docs/usage-cli.rst index a61b368e..e918c959 100644 --- a/docs/usage-cli.rst +++ b/docs/usage-cli.rst @@ -283,8 +283,8 @@ For all usage instructions see ``trafilatura -h``: [--no-tables] [--only-with-metadata] [--target-language TARGET_LANGUAGE] [--deduplicate] [--config-file CONFIG_FILE] [--precision] [--recall] - [-out {txt,csv,json,markdown,xml,xmltei} | --csv | --json | - --markdown | --xml | --xmltei] + [-out {txt,csv,html,json,markdown,xml,xmltei} | --csv | --html | + --json | --markdown | --xml | --xmltei] [--validate-tei] [-v] [--version] @@ -343,7 +343,7 @@ Extraction: --no-comments don't output any comments --no-tables don't output any table elements --only-with-metadata only output those documents with title, URL and date - (for formats supporting metadata) + --with-metadata extract and add metadata to the output --target-language TARGET_LANGUAGE select 
a target language (ISO 639-1 codes) --deduplicate filter out duplicate documents and sections @@ -360,8 +360,9 @@ Format: .. code-block:: bash - -out {txt,csv,json,markdown,xml,xmltei}, --output-format {txt,csv,json,markdown,xml,xmltei} determine output format + -out {txt,csv,html,json,markdown,xml,xmltei}, --output-format {txt,csv,html,json,markdown,xml,xmltei} --csv shorthand for CSV output + --html shorthand for HTML output --json shorthand for JSON output --markdown shorthand for MD output --xml shorthand for XML output diff --git a/docs/usage-python.rst b/docs/usage-python.rst index d63641b4..a9b2380e 100644 --- a/docs/usage-python.rst +++ b/docs/usage-python.rst @@ -33,7 +33,7 @@ For the basics see `quickstart documentation page <quickstart.html>`_. Default output is set to TXT (bare text) without metadata. -The following formats are available: bare text, Markdown (from version 1.9 onwards), CSV, JSON, XML, and XML following the guidelines of the Text Encoding Initiative (TEI). +The following formats are available: bare text, Markdown (from version 1.9 onwards), HTML (from version 1.11 onwards), CSV, JSON, XML, and XML following the guidelines of the Text Encoding Initiative (TEI). .. 
hint:: diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index defa0e2e..a0245c6c 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Adrien Barbaresi and contributors' __license__ = "Apache-2.0" __copyright__ = 'Copyright 2019-2024, Adrien Barbaresi' -__version__ = '1.10.0' +__version__ = '1.11.0' import logging diff --git a/trafilatura/cli.py b/trafilatura/cli.py index 151d7e6d..fd8f3775 100644 --- a/trafilatura/cli.py +++ b/trafilatura/cli.py @@ -138,11 +138,11 @@ def add_args(parser): help="don't output any table elements", action="store_false") # false = no tables group4.add_argument("--only-with-metadata", - help="only output those documents with title, URL and date (for formats supporting metadata)", + help="only output those documents with title, URL and date", action="store_true") group4.add_argument("--with-metadata", - help=argparse.SUPPRESS, - action="store_true") # will be deprecated + help="extract and add metadata to the output", + action="store_true") group4.add_argument("--target-language", help="select a target language (ISO 639-1 codes)", type=str) diff --git a/trafilatura/core.py b/trafilatura/core.py index 2dfd974c..3839d12a 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -89,7 +89,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, include_comments: Extract comments along with the main text. output_format: Define an output format, Python being the default and the interest of this internal function. - Other values: "csv", "json", "markdown", "txt", "xml", and "xmltei". + Other values: "csv", "html", "json", "markdown", "txt", "xml", and "xmltei". target_language: Define a language to discard invalid documents (ISO 639-1 format). include_tables: Take into account information within the HTML <table> element. include_images: Take images into account (experimental). 
@@ -98,7 +98,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, include_links: Keep links along with their targets (experimental). deduplicate: Remove duplicate segments and documents. date_extraction_params: Provide extraction parameters to htmldate as dict(). - with_metadata: Extract metadata fields and add them to the output (available soon). + with_metadata: Extract metadata fields and add them to the output. only_with_metadata: Only keep documents featuring all essential metadata (date, title, url). max_tree_size: Discard documents with too many elements. @@ -278,7 +278,7 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False, favor_recall: when unsure, prefer more text. include_comments: Extract comments along with the main text. output_format: Define an output format: - "csv", "json", "markdown", "txt", "xml", and "xmltei". + "csv", "html", "json", "markdown", "txt", "xml", and "xmltei". tei_validation: Validate the XML-TEI output with respect to the TEI standard. target_language: Define a language to discard invalid documents (ISO 639-1 format). include_tables: Take into account information within the HTML
<table> element. @@ -288,7 +288,7 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False, include_links: Keep links along with their targets (experimental). deduplicate: Remove duplicate segments and documents. date_extraction_params: Provide extraction parameters to htmldate as dict(). - with_metadata: Extract metadata fields and add them to the output (available soon). + with_metadata: Extract metadata fields and add them to the output. only_with_metadata: Only keep documents featuring all essential metadata (date, title, url). max_tree_size: Discard documents with too many elements.