From 0f05e0b007a98b63f8b89edfc54127c32774e06f Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 22 Nov 2024 13:55:09 +0100
Subject: [PATCH] CLI: add 126 exit code for high error ratio (#747)

* CLI: add 126 exit code for high error ratio

* improve readability and debugging

* add tests
---
 tests/cli_tests.py       |  6 +++++-
 trafilatura/cli.py       | 19 +++++++------------
 trafilatura/cli_utils.py | 29 ++++++++++++++++++++++-------
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 42bd2167..aeba580a 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -240,6 +240,10 @@ def test_sysoutput():
 
 def test_download():
     """test page download and command-line interface"""
+    assert cli_utils._define_exit_code([], 0) == 0
+    assert cli_utils._define_exit_code(["a"], 1) == 126
+    assert cli_utils._define_exit_code(["a"], 2) == 1
+
     testargs = ["", "-v"]
     with patch.object(sys, "argv", testargs):
         args = cli.parse_args(testargs)
@@ -264,7 +268,7 @@ def test_download():
         args = cli.parse_args(testargs)
     with pytest.raises(SystemExit) as e:
         cli.process_args(args)
-    assert e.type == SystemExit and e.value.code == 1
+    assert e.type == SystemExit and e.value.code == 126
 
 
 # @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
diff --git a/trafilatura/cli.py b/trafilatura/cli.py
index 4fed7c4f..e69a6d5b 100644
--- a/trafilatura/cli.py
+++ b/trafilatura/cli.py
@@ -197,7 +197,7 @@ def main() -> None:
 
 def process_args(args: Any) -> None:
     """Perform the actual processing according to the arguments"""
-    error_caught = False
+    exit_code = 0
 
     if args.verbose == 1:
         logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
@@ -211,7 +211,7 @@ def process_args(args: Any) -> None:
 
     # fetch urls from a feed or a sitemap
     if args.explore or args.feed or args.sitemap:
-        cli_discovery(args)
+        exit_code = cli_discovery(args)
 
     # activate crawler/spider
     elif args.crawl:
@@ -225,15 +225,10 @@ def process_args(args: Any) -> None:
     elif args.input_dir:
         file_processing_pipeline(args)
 
-    # read url list from input file
-    elif args.input_file:
+    # read url list from input file or process input URL
+    elif args.input_file or args.URL:
         url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)
-
-    # process input URL
-    elif args.URL:
-        url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)  # process single url
+        exit_code = url_processing_pipeline(args, url_store)
 
     # read input on STDIN directly
     else:
@@ -241,8 +236,8 @@ def process_args(args: Any) -> None:
         write_result(result, args)
 
     # change exit code if there are errors
-    if error_caught:
-        sys.exit(1)
+    if exit_code != 0:
+        sys.exit(exit_code)
 
 
 if __name__ == '__main__':
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index e01337bb..7fe46f4d 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -288,7 +288,7 @@ def download_queue_processing(
     return errors, counter
 
 
-def cli_discovery(args: Any) -> None:
+def cli_discovery(args: Any) -> int:
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
@@ -320,7 +320,7 @@
         reset_caches()
 
     # process the (rest of the) links found
-    error_caught = url_processing_pipeline(args, url_store)
+    exit_code = url_processing_pipeline(args, url_store)
 
     # activate site explorer
     if args.explore:
@@ -328,6 +328,8 @@
         control_dict = build_exploration_dict(url_store, input_urls, args)
         cli_crawler(args, url_store=control_dict, options=options)
 
+    return exit_code
+
 
 def build_exploration_dict(
     url_store: UrlStore, input_urls: List[str], args: Any
@@ -417,18 +419,31 @@ def probe_homepage(args: Any) -> None:
             print(url, flush=True)
 
 
-def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
+def _define_exit_code(errors: List[str], total: int) -> int:
+    """Compute exit code based on the number of errors:
+    0 if there are no errors, 126 if there are too many, 1 otherwise."""
+    ratio = len(errors) / total if total > 0 else 0
+
+    if ratio > 0.99:
+        return 126
+    if errors:
+        return 1
+    return 0
+
+
+def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
     "Aggregated functions to show a list and download and process an input list."
     if args.list:
         url_store.print_unvisited_urls()  # and not write_result()
         return False  # and not sys.exit(0)
 
     options = args_to_extractor(args)
-    counter = 0 if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY else -1
+    url_count = url_store.total_url_number()
+    counter = 0 if url_count > MAX_FILES_PER_DIRECTORY else -1
 
     # download strategy
     errors, counter = download_queue_processing(url_store, args, counter, options)
-    LOGGER.debug("%s URLs could not be found", len(errors))
+    LOGGER.debug("%s / %s URLs could not be found", len(errors), url_count)
 
     if args.archived is True:
         url_store = UrlStore()
@@ -443,9 +458,9 @@ def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
             len(errors),
         )
        # pass information along if URLs are missing
-        return bool(archived_errors)
+        return _define_exit_code(archived_errors, url_store.total_url_number())
 
-    return bool(errors)
+    return _define_exit_code(errors, url_count)
 
 
 def file_processing_pipeline(args: Any) -> None:
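
Editor's note: the sketch below is not part of the patch. It mirrors the decision rule introduced by _define_exit_code and reproduces the three outcomes exercised in the new test assertions; the function name and the sample URL lists here are illustrative only.

    from typing import List

    def define_exit_code(errors: List[str], total: int) -> int:
        # same rule as the patched helper: 0 = clean run, 1 = some errors,
        # 126 = (nearly) every URL in the batch failed
        ratio = len(errors) / total if total > 0 else 0
        if ratio > 0.99:
            return 126
        if errors:
            return 1
        return 0

    # expected values match the assertions added to tests/cli_tests.py
    print(define_exit_code([], 0))                       # 0: nothing failed
    print(define_exit_code(["https://one.example"], 1))  # 126: every URL failed
    print(define_exit_code(["https://one.example"], 2))  # 1: partial failure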