CLI: add 126 exit code for high error ratio (#747)
* CLI: add 126 exit code for high error ratio

* improve readability and debugging

* add tests
adbar authored Nov 22, 2024
1 parent 34ae955 commit 0f05e0b
Showing 3 changed files with 34 additions and 20 deletions.
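In practical terms, the command line now distinguishes three outcomes: exit code 0 when no download errors occur, 1 when some URLs fail, and 126 when the error ratio exceeds 99%. Below is a minimal sketch (not part of this commit) of how a wrapper script could react to these codes; it assumes the trafilatura console script is installed on PATH and that urls.txt is a hypothetical file with one URL per line.

import subprocess
import sys

# Run the CLI on a (hypothetical) URL list; -i/--input-file reads one URL per line.
proc = subprocess.run(["trafilatura", "-i", "urls.txt"])

if proc.returncode == 0:
    print("all URLs downloaded and processed")
elif proc.returncode == 126:
    # error ratio above 99%: the whole batch is most likely unusable
    sys.exit("aborting: nearly all downloads failed")
else:
    # exit code 1: partial failures, results may still be worth keeping
    print("finished with some download errors", file=sys.stderr)
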
6 changes: 5 additions & 1 deletion tests/cli_tests.py
@@ -240,6 +240,10 @@ def test_sysoutput():
 
 def test_download():
     """test page download and command-line interface"""
+    assert cli_utils._define_exit_code([], 0) == 0
+    assert cli_utils._define_exit_code(["a"], 1) == 126
+    assert cli_utils._define_exit_code(["a"], 2) == 1
+
     testargs = ["", "-v"]
     with patch.object(sys, "argv", testargs):
         args = cli.parse_args(testargs)
@@ -264,7 +268,7 @@ def test_download():
         args = cli.parse_args(testargs)
     with pytest.raises(SystemExit) as e:
         cli.process_args(args)
-    assert e.type == SystemExit and e.value.code == 1
+    assert e.type == SystemExit and e.value.code == 126
 
 
 # @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
19 changes: 7 additions & 12 deletions trafilatura/cli.py
@@ -197,7 +197,7 @@ def main() -> None:
 
 def process_args(args: Any) -> None:
     """Perform the actual processing according to the arguments"""
-    error_caught = False
+    exit_code = 0
 
     if args.verbose == 1:
         logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
@@ -211,7 +211,7 @@ def process_args(args: Any) -> None:
 
     # fetch urls from a feed or a sitemap
     if args.explore or args.feed or args.sitemap:
-        cli_discovery(args)
+        exit_code = cli_discovery(args)
 
     # activate crawler/spider
     elif args.crawl:
@@ -225,24 +225,19 @@ def process_args(args: Any) -> None:
     elif args.input_dir:
         file_processing_pipeline(args)
 
-    # read url list from input file
-    elif args.input_file:
+    # read url list from input file or process input URL
+    elif args.input_file or args.URL:
         url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)
-
-    # process input URL
-    elif args.URL:
-        url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)  # process single url
+        exit_code = url_processing_pipeline(args, url_store)
 
     # read input on STDIN directly
     else:
         result = examine(sys.stdin.buffer.read(), args, url=args.URL)
         write_result(result, args)
 
     # change exit code if there are errors
-    if error_caught:
-        sys.exit(1)
+    if exit_code != 0:
+        sys.exit(exit_code)
 
 
 if __name__ == '__main__':
29 changes: 22 additions & 7 deletions trafilatura/cli_utils.py
@@ -288,7 +288,7 @@ def download_queue_processing(
     return errors, counter
 
 
-def cli_discovery(args: Any) -> None:
+def cli_discovery(args: Any) -> int:
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
@@ -320,14 +320,16 @@ def cli_discovery(args: Any) -> None:
     reset_caches()
 
     # process the (rest of the) links found
-    error_caught = url_processing_pipeline(args, url_store)
+    exit_code = url_processing_pipeline(args, url_store)
 
     # activate site explorer
     if args.explore:
         # add to compressed dict and crawl the remaining websites
         control_dict = build_exploration_dict(url_store, input_urls, args)
         cli_crawler(args, url_store=control_dict, options=options)
 
+    return exit_code
+
 
 def build_exploration_dict(
     url_store: UrlStore, input_urls: List[str], args: Any
@@ -417,18 +419,31 @@ def probe_homepage(args: Any) -> None:
     print(url, flush=True)
 
 
-def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
+def _define_exit_code(errors: List[str], total: int) -> int:
+    """Compute exit code based on the number of errors:
+       0 if there are no errors, 126 if there are too many, 1 otherwise."""
+    ratio = len(errors) / total if total > 0 else 0
+
+    if ratio > 0.99:
+        return 126
+    if errors:
+        return 1
+    return 0
+
+
+def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
     "Aggregated functions to show a list and download and process an input list."
     if args.list:
         url_store.print_unvisited_urls()  # and not write_result()
         return False  # and not sys.exit(0)
 
     options = args_to_extractor(args)
-    counter = 0 if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY else -1
+    url_count = url_store.total_url_number()
+    counter = 0 if url_count > MAX_FILES_PER_DIRECTORY else -1
 
     # download strategy
     errors, counter = download_queue_processing(url_store, args, counter, options)
-    LOGGER.debug("%s URLs could not be found", len(errors))
+    LOGGER.debug("%s / %s URLs could not be found", len(errors), url_count)
 
     if args.archived is True:
         url_store = UrlStore()
@@ -443,9 +458,9 @@ def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
             len(errors),
         )
         # pass information along if URLs are missing
-        return bool(archived_errors)
+        return _define_exit_code(archived_errors, url_store.total_url_number())
 
-    return bool(errors)
+    return _define_exit_code(errors, url_count)
 
 
 def file_processing_pipeline(args: Any) -> None:
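As a quick reference for the threshold in _define_exit_code above: 126 is only returned when the error ratio is strictly greater than 0.99, so 99 failures out of 100 still map to the ordinary error code 1. A minimal sketch, assuming the helper is imported from trafilatura.cli_utils as introduced in this commit:

from trafilatura.cli_utils import _define_exit_code

print(_define_exit_code([], 100))           # 0: no errors
print(_define_exit_code(["e"] * 40, 100))   # 1: some errors (ratio 0.4)
print(_define_exit_code(["e"] * 99, 100))   # 1: ratio 0.99 is not above the threshold
print(_define_exit_code(["e"] * 100, 100))  # 126: every URL failed
print(_define_exit_code([], 0))             # 0: empty batch, ratio defaults to 0
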
