Skip to content

Commit

Permalink
Handle when child process is killed by OOM killer
Browse files Browse the repository at this point in the history
  • Loading branch information
johandahlberg committed Jun 4, 2024
1 parent 206c806 commit beb7058
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

* Fix a bug where `a_pixels_per_b_pixel` summary statistics were equal to the `b_pixels_per_a_pixel` statistics.
* `collapse` will return exit code 137 when one of the child processes is killed by the system (e.g. because it is
using too much memory). This allows e.g. Nextflow to retry the process with more memory automatically.

## [0.17.1] - 2024-05-27

Expand Down
39 changes: 36 additions & 3 deletions src/pixelator/cli/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Copyright © 2022 Pixelgen Technologies AB.
"""

import sys
from collections import defaultdict
from concurrent import futures
from pathlib import Path
Expand All @@ -25,6 +26,38 @@
)


def _handle_errors(jobs, executor):
    """Check finished jobs for failures and pick a suitable way to fail.

    The first job that carries an exception decides the outcome. The exit
    codes of the executor's worker processes are inspected; if one of them
    has an absolute exit code of 9 the program exits with status 137,
    otherwise the job's original exception is re-raised. When no job carries
    an exception the function returns normally.

    :param jobs: finished futures; each must provide an ``exception()`` method
    :param executor: the process pool executor that ran the jobs
    :raises SystemExit: with code 137 when a worker has an absolute exit
        code of 9
    :raises Exception: the original job exception when no such worker is found
    """
    for job in jobs:
        exception = job.exception()
        if exception is None:
            continue

        logger.error(
            "Found an issue in the process pool. Trying to determine what went wrong and set the correct exit code. Exception was: %s",
            exception,
        )
        # `_processes` is a private attribute of the executor and may be
        # missing or None (e.g. once the pool has been shut down). Fall back
        # to an empty mapping so the original exception is still raised
        # below instead of being masked by an AttributeError.
        process_map = getattr(executor, "_processes", None) or {}
        # Iterate over a snapshot: the executor may still be updating the
        # mapping in the background while we inspect it.
        for process in list(process_map.values()):
            exit_code = process.exitcode
            if exit_code is None or exit_code == 0:
                continue
            logger.error(
                "The child process in the process pool returned a non-zero exit code: %s.",
                exit_code,
            )
            # If we have an out of memory exception, make sure we exit with that.
            # multiprocessing reports termination by signal N as exit code -N,
            # hence the comparison on the absolute value.
            if abs(exit_code) == 9:
                logger.error(
                    "One of the child processes was killed (exit code: 9). Usually this is caused "
                    "by a child process using to much memory. We will return an exit code of 137 "
                    "to indicate that the process was killed by the out of memory killer."
                )
                sys.exit(137)
        logger.error(
            "Was unable to determine what when wrong in process pool. Will raise original exception."
        )
        raise exception

Check warning on line 58 in src/pixelator/cli/collapse.py

View check run for this annotation

Codecov / codecov/patch

src/pixelator/cli/collapse.py#L58

Added line #L58 was not covered by tests


@click.command(
"collapse",
short_help=(
Expand Down Expand Up @@ -238,12 +271,12 @@ def collapse(
min_count=min_count,
)
)
jobs = list(futures.as_completed(jobs))
_handle_errors(jobs, executor)

Check warning on line 275 in src/pixelator/cli/collapse.py

View check run for this annotation

Codecov / codecov/patch

src/pixelator/cli/collapse.py#L274-L275

Added lines #L274 - L275 were not covered by tests

total_input_reads = 0
tmp_files = []
for job in futures.as_completed(jobs):
if job.exception() is not None:
raise job.exception()
for job in jobs:

Check warning on line 279 in src/pixelator/cli/collapse.py

View check run for this annotation

Codecov / codecov/patch

src/pixelator/cli/collapse.py#L279

Added line #L279 was not covered by tests
# the worker returns a path to a file (temp antibody edge list)
tmp_file, input_reads_count = job.result()
if tmp_file is not None:
Expand Down
5 changes: 5 additions & 0 deletions src/pixelator/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ def log_exception(exc_type, exc_value, traceback_obj):
# and thus we ignore them here.
return False

if issubclass(exc_type, SystemExit):
# SystemExit is raised when the application has been explicitly
# directed to exit, so we don't want a trace dumped for that.
return False

Check warning on line 220 in src/pixelator/logging.py

View check run for this annotation

Codecov / codecov/patch

src/pixelator/logging.py#L220

Added line #L220 was not covered by tests

self._root_logger.critical(
"Unhandled exception of type: {}".format(exc_type.__name__)
)
Expand Down

0 comments on commit beb7058

Please sign in to comment.