From 5f5155712d647e1d52dd1cde648b674e1bb04ea7 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 21 Jan 2025 12:25:11 -0700 Subject: [PATCH] feat: use pybwa --- README.md | 4 +- ...stallation-and-developers-documentation.md | 89 +++++++ poetry.lock | 36 ++- prymer.yml | 1 - prymer/model.py | 2 +- prymer/offtarget/__init__.py | 4 +- prymer/offtarget/bwa.py | 183 ++++--------- prymer/offtarget/offtarget_detector.py | 14 +- prymer/primer3/primer3_parameters.py | 2 + pyproject.toml | 1 + tests/offtarget/test_bwa.py | 240 ++++++++---------- tests/offtarget/test_offtarget.py | 2 - 12 files changed, 289 insertions(+), 289 deletions(-) create mode 100644 docs/installation-and-developers-documentation.md diff --git a/README.md b/README.md index 46b531a..800ec3c 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,7 @@ ## Recommended Installation -The package `prymer` requires installation of [interactive `bwa`](https://github.com/fulcrumgenomics/bwa-aln-interactive). - -To satisfy these requirements, it is recommended to install using [bioconda](https://bioconda.github.io/): +It is recommended to install using [bioconda](https://bioconda.github.io/): ```console mamba install -c bioconda prymer diff --git a/docs/installation-and-developers-documentation.md b/docs/installation-and-developers-documentation.md new file mode 100644 index 0000000..309949d --- /dev/null +++ b/docs/installation-and-developers-documentation.md @@ -0,0 +1,89 @@ +# Installation and Developer's Documentation + +## Recommended Installation + +It is recommended to install using [bioconda](https://bioconda.github.io/): + +```console +mamba install -c bioconda prymer +``` + +## Installation for Development and Release + +1. Install the environment manager [`mamba`](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html) +2. Install the Python build tool [`poetry`](https://python-poetry.org/docs/#installing-with-the-official-installer) + + ```console + mamba env create -y -f prymer.yml + ``` + +3. Activate the environment: + + ```console + mamba activate prymer + ``` + +4. Configure `poetry` to install into pre-existing virtual environments: + + ```console + poetry config virtualenvs.create false + ``` + +5. Install `prymer` into the virtual environment: + + ```console + poetry install + ``` + +## Checking the Build + +Use `poetry` to test your code. + +```console +poetry run pytest +``` + +Note that `poetry run pytest` will run `mypy` checks, `ruff` checks, `pytest` unit tests, and will provide a unit test coverage report. +However, `pytest` will neither run the ruff formatter nor apply `ruff`'s automatic lint fixes, which can be done by calling `ruff` directly. + +```console +poetry run ruff format && poetry run ruff check --fix +``` + +## Building the Documentation + +Use `mkdocs` to build and serve the documentation. + +```console +poetry run mkdocs build && poetry run mkdocs serve +``` + +## Creating a Release on PyPi + +1. Clone the repository recursively and ensure you are on the `main` (un-dirty) branch +2. Checkout a new branch to prepare the library for release +3. Bump the version of the library to the desired SemVer with `poetry version #.#.#` +4. Commit the version bump changes with a Git commit message like `chore(release): bump to #.#.#` +5. Push the commit to the upstream remote, open a PR, ensure tests pass, and seek reviews +6. Squash merge the PR +7. Tag the new commit on the main branch of the origin repository with the new SemVer + +> [!NOTE] +> This project follows [Semantic Versioning](https://semver.org/). +> In brief: +> +> * `MAJOR` version when you make incompatible API changes +> * `MINOR` version when you add functionality in a backwards compatible manner +> * `PATCH` version when you make backwards compatible bug fixes + +GitHub Actions will take care of the remainder of the deployment and release process with: + +1. Unit tests will be run for safety-sake +2. A source distribution will be built +3. Multi-arch multi-Python binary distributions will be built +4. Assets will be deployed to PyPi with the new SemVer +5. A [Conventional Commit](https://www.conventionalcommits.org/en/v1.0.0/)-aware changelog will be drafted +6. A GitHub release will be created with the new SemVer and the drafted changelog + +> [!IMPORTANT] +> Consider editing the changelog if there are any errors or necessary enhancements. diff --git a/poetry.lock b/poetry.lock index 100fa2b..8b592cf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1001,13 +1001,13 @@ mkdocs = ">=1.0.3" [[package]] name = "mkdocs-material" -version = "9.5.49" +version = "9.5.50" description = "Documentation that simply works" optional = false python-versions = ">=3.8" files = [ - {file = "mkdocs_material-9.5.49-py3-none-any.whl", hash = "sha256:c3c2d8176b18198435d3a3e119011922f3e11424074645c24019c2dcf08a360e"}, - {file = "mkdocs_material-9.5.49.tar.gz", hash = "sha256:3671bb282b4f53a1c72e08adbe04d2481a98f85fed392530051f80ff94a9621d"}, + {file = "mkdocs_material-9.5.50-py3-none-any.whl", hash = "sha256:f24100f234741f4d423a9d672a909d859668a4f404796be3cf035f10d6050385"}, + {file = "mkdocs_material-9.5.50.tar.gz", hash = "sha256:ae5fe16f3d7c9ccd05bb6916a7da7420cf99a9ce5e33debd9d40403a090d5825"}, ] [package.dependencies] @@ -1024,7 +1024,7 @@ regex = ">=2022.4" requests = ">=2.26,<3.0" [package.extras] -git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] +git = ["mkdocs-git-committers-plugin-2 (>=1.1,<3)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"] recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"] @@ -1617,6 +1617,32 @@ attrs = ">=23.0.0,<24.0.0" [package.extras] docs = ["sphinx (>=7.0.0,<8.0.0)"] +[[package]] +name = "pybwa" +version = "1.1.3" +description = "Python bindings for BWA" +optional = false +python-versions = "<4.0,>=3.9.0" +files = [ + {file = "pybwa-1.1.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:a570dcb82750477b325bd2cb34bcf4af751a914e760b6276aed584f3426ae569"}, + {file = "pybwa-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17f6da8926cab908dccd76dac1dabbdcb78da9d787885754a6ff77d952535b08"}, + {file = "pybwa-1.1.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:907e0da885a131a20ffa686de96a558c766a12d3021c62ae8b585ea65f514a5a"}, + {file = "pybwa-1.1.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:7fe8ac8145ff209e3d1edc76d778b2d286cfd78b4e553fbaaea3cd06466b326b"}, + {file = "pybwa-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b836ac54567bbd4b7e5af0b7b648911751fc0fb2ac1d1bc947b7434e365ac1e3"}, + {file = "pybwa-1.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1dab5b8ea2b856ee25249356dddbfe042208c4beaba551b399033827423a9bf8"}, + {file = "pybwa-1.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ebbc23367d9890b50c2a7549a2cae0b40cf7e41d3502d70a4c94b3d17529f97a"}, + {file = "pybwa-1.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9a6af374f841e1cdf0b5346455b85855f608b94a4d9c7d31608809cbc11b705"}, + {file = "pybwa-1.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f231848c2e182ed6a95a82a52ec65f93ec4243a880418c520df4e61428910eb4"}, + {file = "pybwa-1.1.3-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:144ecd087837e5cb02884ecb89447f9f7327dd32ddcfb948204e117147da36dc"}, + {file = "pybwa-1.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11c71898af4a1c1417d37f250ddd803d98e9e952b606f7a3d886ebf8cbfc2bf0"}, + {file = "pybwa-1.1.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f8d40da48489edacf2c64a3b7bde2d214991c37060f67408287aedb14b94b387"}, + {file = "pybwa-1.1.3.tar.gz", hash = "sha256:be393a4e7daac3dd74d475fe1baea7ce4eb346c08d8f7f16bc1e8db0b838299c"}, +] + +[package.dependencies] +fgpyo = ">=0.7.0" +pysam = ">=0.22.1" + [[package]] name = "pycparser" version = "2.22" @@ -2432,4 +2458,4 @@ test = ["pytest"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "c401a1d5773fb02e2c3603ff3aa18d7509bd4da6b31e5a67386feaf12607f1db" +content-hash = "79043c3dfda5909453207569e0ffd5b2d942f0a5f14d893f10e7c096cdb75809" diff --git a/prymer.yml b/prymer.yml index 5b9584a..a012113 100644 --- a/prymer.yml +++ b/prymer.yml @@ -3,5 +3,4 @@ channels: - bioconda - conda-forge dependencies: - - bioconda::bwa-aln-interactive=0.7.18 - conda-forge::python>=3.11.* diff --git a/prymer/model.py b/prymer/model.py index 292fe2e..0c75a8d 100644 --- a/prymer/model.py +++ b/prymer/model.py @@ -212,7 +212,7 @@ def from_string(cls, line: str) -> "Span": strand = Strand(strand_symbol[0]) except ValueError as e: raise ValueError( - "Did not find valid strand information; " f"received {strand_symbol}" + f"Did not find valid strand information; received {strand_symbol}" ) from e elif len(parts) == 2: refname, range_ = parts diff --git a/prymer/offtarget/__init__.py b/prymer/offtarget/__init__.py index 2500200..30108d1 100644 --- a/prymer/offtarget/__init__.py +++ b/prymer/offtarget/__init__.py @@ -1,4 +1,4 @@ -from prymer.offtarget.bwa import BwaAlnInteractive +from prymer.offtarget.bwa import Bwa from prymer.offtarget.bwa import BwaHit from prymer.offtarget.bwa import BwaResult from prymer.offtarget.bwa import Query @@ -6,7 +6,7 @@ from prymer.offtarget.offtarget_detector import OffTargetResult __all__ = [ - "BwaAlnInteractive", + "Bwa", "Query", "BwaHit", "BwaResult", diff --git a/prymer/offtarget/bwa.py b/prymer/offtarget/bwa.py index 563ae81..c15b116 100644 --- a/prymer/offtarget/bwa.py +++ b/prymer/offtarget/bwa.py @@ -27,7 +27,7 @@ >>> from pathlib import Path >>> ref_fasta = Path("./tests/offtarget/data/miniref.fa") >>> query = Query(bases="TCTACTAAAAATACAAAAAATTAGCTGGGCATGATGGCATGCACCTGTAATCCCGCTACT", id="NA") ->>> bwa = BwaAlnInteractive(ref=ref_fasta, max_hits=1) +>>> bwa = Bwa(ref=ref_fasta, max_hits=1) >>> result = bwa.map_one(query=query.bases, id=query.id) >>> result.hit_count 1 @@ -38,18 +38,13 @@ >>> query = Query(bases="AAAAAA", id="NA") >>> bwa.map_all(queries=[query]) [BwaResult(query=Query(id='NA', bases='AAAAAA'), hit_count=3968, hits=[])] ->>> bwa.close() -True ``` """ # noqa: E501 -import logging import os -import subprocess from dataclasses import dataclass from pathlib import Path -from threading import Thread from typing import ClassVar from typing import Optional from typing import cast @@ -57,15 +52,14 @@ import pysam from fgpyo import sequence from fgpyo.sam import Cigar +from pybwa import BwaAln +from pybwa import BwaAlnOptions from pysam import AlignedSegment -from pysam import AlignmentHeader -from typing_extensions import override +from pysam import AlignmentFile +from pysam.libcalignmentfile import AlignmentHeader +from pysam.libcfaidx import FastxRecord from prymer.api import coordmath -from prymer.util.executable_runner import ExecutableRunner - -BWA_EXECUTABLE_NAME: str = "bwa-aln-interactive" -"""The executable name for the interactive build of bwa aln.""" @dataclass(init=True, frozen=True) @@ -204,19 +198,10 @@ class BwaResult: """The file extensions for BWA index files""" -class BwaAlnInteractive(ExecutableRunner): - """Wrapper around a novel mode of 'bwa aln' that allows for "interactive" use of bwa to keep - the process running and be able to send it chunks of reads periodically and get alignments - back without waiting for a full batch of reads to be sent. - - See: - - - [https://github.com/fulcrumgenomics/bwa-aln-interactive](https://github.com/fulcrumgenomics/bwa-aln-interactive) - - [https://bioconda.github.io/recipes/bwa-aln-interactive/README.html](https://bioconda.github.io/recipes/bwa-aln-interactive/README.html) +class Bwa: + """Wrapper around `bwa aln` via `pybwa`. Attributes: - max_hits: the maximum number of hits to report - if more than this number of seed hits - are found, report only the count and not each hit. reverse_complement: reverse complement each query sequence before alignment. include_alt_hits: if True include hits to references with names ending in _alt, otherwise do not include them. @@ -227,7 +212,6 @@ def __init__( self, ref: Path, max_hits: int, - executable: str | Path = BWA_EXECUTABLE_NAME, max_mismatches: int = 3, max_mismatches_in_seed: int = 3, max_gap_opens: int = 0, @@ -243,7 +227,6 @@ def __init__( ref: the path to the reference FASTA, which must be indexed with bwa. max_hits: the maximum number of hits to report - if more than this number of seed hits are found, report only the count and not each hit. - executable: string or Path representation of the `bwa-aln-interactive` executable path max_mismatches: the maximum number of mismatches allowed in the full query sequence max_mismatches_in_seed: the maximum number of mismatches allowed in the seed region max_gap_opens: the maximum number of gap opens allowed in the full query sequence @@ -257,12 +240,6 @@ def __init__( otherwise do not include them. threads: the number of threads to use. If `None`, use all available CPUs. """ - threads = os.cpu_count() if threads is None else threads - executable_path = ExecutableRunner.validate_executable_path(executable=executable) - self.reverse_complement: bool = reverse_complement - self.include_alt_hits: bool = include_alt_hits - self.max_hits: int = max_hits - missing_aux_paths = [] for aux_ext in BWA_AUX_EXTENSIONS: aux_path = Path(f"{ref}{aux_ext}") @@ -275,81 +252,27 @@ def __init__( else: message = "BWA index file does not exist:\n\t" message += "\t\n".join(f"{p}" for p in missing_aux_paths) - raise FileNotFoundError(f"{message}\nIndex with: `{executable_path} index {ref}`") - - # -N = non-iterative mode: search for all n-difference hits (slooow) - # -S = output SAM (run samse) - # -Z = interactive mode (no input buffer and force processing with empty lines between recs) - command: list[str] = [ - f"{executable_path}", - "aln", - "-t", - f"{threads}", - "-n", - f"{max_mismatches}", - "-o", - f"{max_gap_opens}", - "-e", - f"{max_gap_extensions}", - "-i", - f"{min_indel_to_end_distance}", - "-l", - f"{seed_length}", - "-k", - f"{max_mismatches_in_seed}", - "-X", - f"{max_hits}", - "-N", - "-S", - "-Z", - "-D", - f"{ref}", - "/dev/stdin", - ] + raise FileNotFoundError(f"{message}\nIndex with: `bwa index {ref}`") - super().__init__(command=command, stderr=subprocess.PIPE) - - header = [] - for line in self._subprocess.stdout: - if line.startswith("@"): - header.append(line) - if line.startswith("@PG"): - break - - self.header = AlignmentHeader.from_text("".join(header)) - - # NB: ExecutableRunner will default to redirecting stderr to /dev/null. However, we would - # like to preserve stderr messages from bwa for potential debugging. To do this, we create - # a single thread to continuously read from stderr and redirect text lines to a debug - # logger. The close() method of this class will additionally join the stderr logging thread. - self._logger = logging.getLogger(self.__class__.__qualname__) - self._stderr_thread = Thread( - daemon=True, - target=self._stream_to_sink, - args=(self._subprocess.stderr, self._logger.debug), - ) - self._stderr_thread.start() - - def __signal_bwa(self) -> None: - """Signals BWA to process the queries.""" - self._subprocess.stdin.flush() - # NB: the executable compiled on different platforms requires a different number of newlines - # NB: it is not understood why, but 16 newlines seems to work for all platforms tested - self._subprocess.stdin.write("\n" * 16) - self._subprocess.stdin.flush() - - @override - def close(self) -> bool: - """ - Gracefully terminates the underlying subprocess if it is still running. + seq_dict = ref.with_suffix(".dict") + with seq_dict.open("r") as fh: + with AlignmentFile(fh) as reader: + self.header: AlignmentHeader = reader.header - Returns: - True: if the subprocess was terminated successfully - False: if the subprocess failed to terminate or was not already running - """ - safely_closed: bool = super().close() - self._stderr_thread.join() - return safely_closed + threads = os.cpu_count() if threads is None else threads + self.opt = BwaAlnOptions( + max_hits=max_hits, + max_mismatches=max_mismatches, + max_mismatches_in_seed=max_mismatches_in_seed, + max_gap_opens=max_gap_opens, + max_gap_extensions=max_gap_extensions, + min_indel_to_end_distance=min_indel_to_end_distance, + seed_length=seed_length, + threads=threads, + ) + self.bwa = BwaAln(prefix=ref) + self.reverse_complement: bool = reverse_complement + self.include_alt_hits: bool = include_alt_hits def map_one(self, query: str, id: str = "unknown") -> BwaResult: """Maps a single query to the genome and returns the result. @@ -375,22 +298,21 @@ def map_all(self, queries: list[Query]) -> list[BwaResult]: if len(queries) == 0: return [] - # Send the reads to BWA - for query in queries: - fastq_str = query.to_fastq(reverse_complement=self.reverse_complement) - self._subprocess.stdin.write(fastq_str) - - # Force the input to be sent to the underlying process. - self.__signal_bwa() - - # Read back the results - results: list[BwaResult] = [] - for query in queries: - # get the next alignment and convert to a result - line: str = next(self._subprocess.stdout).strip() - assert not line.startswith("@"), f"SAM record must not start with '@'! {line}" - alignment = AlignedSegment.fromstring(line, self.header) - results.append(self._to_result(query=query, rec=alignment)) + # Map the reads + fastxs = [ + FastxRecord( + name=q.id, + sequence=sequence.reverse_complement(q.bases) + if self.reverse_complement + else q.bases, + ) + for q in queries + ] # + records = self.bwa.align(queries=fastxs, opt=self.opt) + results: list[BwaResult] = [ + self._to_result(query=query, rec=record) + for query, record in zip(queries, records, strict=True) + ] return results @@ -404,21 +326,20 @@ def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult: """ if query.id != rec.query_name: raise ValueError( - "Query and Results are out of order" f"Query=${query.id}, Result=${rec.query_name}" + f"Query and Results are out of orderQuery={query.id}, Result={rec.query_name}" ) - num_hits: int = int(rec.get_tag("HN")) if rec.has_tag("HN") else 0 - # `to_hits()` removes artifactual hits which span the boundary between concatenated - # reference sequences. If we are reporting a list of hits, the number of hits should match - # the size of this list. Otherwise, we either have zero hits, or more than the maximum - # number of hits. In both of the latter cases, we have to rely on the count reported in the - # `HN` tag. - hits: list[BwaHit] - if 0 < num_hits <= self.max_hits: + # FIXME: use the HN tag to get the # of hits, as the XA tag may not report all hits + # requires: https://github.com/fulcrumgenomics/pybwa/issues/17 + # which requires: https://github.com/lh3/bwa/pull/438 + hits: list[BwaHit] = [] + num_hits = 0 + if rec.has_tag("XA") or rec.is_mapped: hits = self.to_hits(rec=rec) num_hits = len(hits) - else: - hits = [] + if num_hits > self.opt.max_hits: + hits = [] + num_hits = 0 return BwaResult(query=query, hit_count=num_hits, hits=hits) diff --git a/prymer/offtarget/offtarget_detector.py b/prymer/offtarget/offtarget_detector.py index 44ae46f..5f14564 100644 --- a/prymer/offtarget/offtarget_detector.py +++ b/prymer/offtarget/offtarget_detector.py @@ -18,11 +18,9 @@ >>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=204, max_primer_pair_hits=1, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250) >>> len(detector.filter(primers=[left_primer, right_primer])) # keep all 2 ->>> detector.close() >>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=203, max_primer_pair_hits=1, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250) >>> len(detector.filter(primers=[left_primer, right_primer])) # keep none 0 ->>> detector.close() ``` @@ -93,8 +91,7 @@ from prymer.model import PrimerPair from prymer.model import Span from prymer.model import Strand -from prymer.offtarget.bwa import BWA_EXECUTABLE_NAME -from prymer.offtarget.bwa import BwaAlnInteractive +from prymer.offtarget.bwa import Bwa from prymer.offtarget.bwa import BwaHit from prymer.offtarget.bwa import BwaResult from prymer.offtarget.bwa import Query @@ -176,7 +173,6 @@ def __init__( # noqa: C901 threads: Optional[int] = None, keep_spans: bool = True, keep_primer_spans: bool = True, - executable: str | Path = BWA_EXECUTABLE_NAME, ) -> None: """ Initialize an [[OffTargetDetector]]. @@ -231,7 +227,6 @@ def __init__( # noqa: C901 populated, otherwise not keep_primer_spans: if True, [[OffTargetResult]] objects will be reported with left and right primer spans - executable: string or Path representation of the `bwa` executable path Raises: ValueError: If `max_amplicon_size` is not greater than 0. @@ -294,8 +289,7 @@ def __init__( # noqa: C901 self._primer_cache: dict[str, BwaResult] = {} self._primer_pair_cache: dict[PrimerPair, OffTargetResult] = {} - self._bwa = BwaAlnInteractive( - executable=executable, + self._bwa = Bwa( ref=ref, reverse_complement=True, threads=threads, @@ -636,9 +630,6 @@ def _hit_to_span(hit: BwaHit) -> Span: """Converts a Bwa Hit object to a Span.""" return Span(refname=hit.refname, start=hit.start, end=hit.end) - def close(self) -> None: - self._bwa.close() - def __enter__(self) -> Self: return self @@ -650,4 +641,3 @@ def __exit__( ) -> None: """Gracefully terminates any running subprocesses.""" super().__exit__(exc_type, exc_value, traceback) - self.close() diff --git a/prymer/primer3/primer3_parameters.py b/prymer/primer3/primer3_parameters.py index bad5e1c..b9b5e62 100644 --- a/prymer/primer3/primer3_parameters.py +++ b/prymer/primer3/primer3_parameters.py @@ -94,6 +94,8 @@ class stores user input for internal probe design and maps it to the correct Pri class Primer3Parameters(ABC): + """Base class for all primer3 parameters""" + @property @abstractmethod def max_dinuc_bases(self) -> int: diff --git a/pyproject.toml b/pyproject.toml index f55bf51..c6da796 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ fgpyo = "^0.8.0" pysam = "^0.22.1" ordered-set = "^4.1.0" primer3-py = "^2.0.3" +pybwa = "^1.1.3" [tool.poetry.group.dev.dependencies] poetry = "^1.8.2" diff --git a/tests/offtarget/test_bwa.py b/tests/offtarget/test_bwa.py index 4461032..8de5621 100644 --- a/tests/offtarget/test_bwa.py +++ b/tests/offtarget/test_bwa.py @@ -1,17 +1,15 @@ -import logging import shutil from dataclasses import replace from pathlib import Path from tempfile import NamedTemporaryFile from typing import TypeAlias -from typing import cast import pytest from fgpyo.sam import Cigar from fgpyo.sam.builder import SamBuilder from prymer.offtarget.bwa import BWA_AUX_EXTENSIONS -from prymer.offtarget.bwa import BwaAlnInteractive +from prymer.offtarget.bwa import Bwa from prymer.offtarget.bwa import BwaHit from prymer.offtarget.bwa import Query @@ -48,7 +46,7 @@ def test_missing_index_files(genome_ref: Path) -> None: # no files exist with pytest.raises(FileNotFoundError, match="BWA index files do not exist"): - BwaAlnInteractive(ref=ref_fasta, max_hits=1) + Bwa(ref=ref_fasta, max_hits=1) # all but one missing for aux_ext in BWA_AUX_EXTENSIONS[1:]: @@ -56,7 +54,7 @@ def test_missing_index_files(genome_ref: Path) -> None: with aux_path.open("w"): pass with pytest.raises(FileNotFoundError, match="BWA index file does not exist"): - BwaAlnInteractive(ref=ref_fasta, max_hits=1) + Bwa(ref=ref_fasta, max_hits=1) def test_hit_build_rc() -> None: @@ -76,101 +74,79 @@ def test_hit_build_rc() -> None: def test_header_is_properly_constructed(ref_fasta: Path) -> None: """Tests that bwa will return a properly constructed header.""" - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - header: SamHeaderType = bwa.header.to_dict() - assert set(header.keys()) == {"HD", "SQ", "PG"} - assert header["HD"] == {"GO": "query", "SO": "unsorted", "VN": "1.5"} - assert header["SQ"] == [{"LN": 10001, "SN": "chr1"}] - assert len(header["PG"]) == 1 - program_group: dict[str, str] = cast(list[dict[str, str]], header["PG"])[0] - assert program_group["ID"] == "bwa" - assert program_group["PN"] == "bwa" + bwa = Bwa(ref=ref_fasta, max_hits=1) + header: SamHeaderType = bwa.header.to_dict() + assert set(header.keys()) == {"HD", "SQ"} + assert header["HD"] == {"VN": "1.5"} + assert header["SQ"][0]["LN"] == 10001 # type: ignore + assert header["SQ"][0]["SN"] == "chr1" # type: ignore def test_map_one_uniquely_mapped(ref_fasta: Path) -> None: """Tests that bwa maps one hit when a query uniquely maps.""" query = Query(bases="TCTACTAAAAATACAAAAAATTAGCTGGGCATGATGGCATGCACCTGTAATCCCGCTACT", id="NA") - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 1 - assert result.hits[0].refname == "chr1" - assert result.hits[0].start == 61 - assert result.hits[0].negative is False - assert f"{result.hits[0].cigar}" == "60M" - assert result.query == query - - -def test_stderr_redirected_to_logger(ref_fasta: Path, caplog: pytest.LogCaptureFixture) -> None: - """Tests that we redirect the stderr of the bwa executable to a logger..""" - caplog.set_level(logging.DEBUG) - query = Query(bases="TCTACTAAAAATACAAAAAATTAGCTGGGCATGATGGCATGCACCTGTAATCCCGCTACT", id="NA") - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 1 - assert result.hits[0].refname == "chr1" - assert result.hits[0].start == 61 - assert result.hits[0].negative is False - assert f"{result.hits[0].cigar}" == "60M" - assert result.query == query - assert "[bwa_aln_core] calculate SA coordinate..." in caplog.text - assert "[bwa_aln_core] convert to sequence coordinate..." in caplog.text - assert "[bwa_aln_core] refine gapped alignments..." in caplog.text - assert "[bwa_aln_core] print alignments..." in caplog.text - assert "[bwa_aln_core] 1 sequences have been processed" in caplog.text + bwa = Bwa(ref=ref_fasta, max_hits=1) + result = bwa.map_one(query=query.bases, id=query.id) + assert result.hit_count == 1 + assert result.hits[0].refname == "chr1" + assert result.hits[0].start == 61 + assert result.hits[0].negative is False + assert f"{result.hits[0].cigar}" == "60M" + assert result.query == query def test_map_one_unmapped(ref_fasta: Path) -> None: """Tests that bwa returns an unmapped alignment. The hit count should be zero and the list of hits empty.""" - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - query = Query(bases="A" * 50, id="NA") - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 0 - assert len(result.hits) == 0 - assert result.query == query + bwa = Bwa(ref=ref_fasta, max_hits=1) + query = Query(bases="A" * 50, id="NA") + result = bwa.map_one(query=query.bases, id=query.id) + assert result.hit_count == 0 + assert len(result.hits) == 0 + assert result.query == query def test_map_one_multi_mapped_max_hits_one(ref_fasta: Path) -> None: """Tests that a query that returns too many hits (>max_hits) returns the number of hits but not the list of hits themselves.""" - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - query = Query(bases="A" * 5, id="NA") - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 7508 - assert len(result.hits) == 0 # hit_count > max_hits - assert result.query == query + bwa = Bwa(ref=ref_fasta, max_hits=1) + query = Query(bases="A" * 5, id="NA") + result = bwa.map_one(query=query.bases, id=query.id) + assert result.hit_count == 7508 + assert len(result.hits) == 0 # hit_count > max_hits + assert result.query == query def test_map_one_multi_mapped_max_hits_many(ref_fasta: Path) -> None: """Tests a query that aligns to many locations, but fewer than max_hits, returns the number of hits and the hits themselves""" - with BwaAlnInteractive(ref=ref_fasta, max_hits=10000) as bwa: - query = Query(bases="A" * 5, id="NA") - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 7504 - assert len(result.hits) == 7504 # hit_count <= max_hits - assert result.query == query + bwa = Bwa(ref=ref_fasta, max_hits=10000) + query = Query(bases="A" * 5, id="NA") + result = bwa.map_one(query=query.bases, id=query.id) + assert result.hit_count == 7504 + assert len(result.hits) == 7504 # hit_count <= max_hits + assert result.query == query def test_map_all(ref_fasta: Path) -> None: """Tests aligning multiple queries.""" - with BwaAlnInteractive(ref=ref_fasta, max_hits=10000) as bwa: - # empty queries - assert bwa.map_all([]) == [] - - # many queries - queries = [] - for length in range(5, 101): - queries.append(Query(bases="A" * length, id="NA")) - results = bwa.map_all(queries) - assert len(results) == len(queries) - assert all(results[i].query == queries[i] for i in range(len(results))) - assert results[0].hit_count == 7504 - assert len(results[0].hits) == 7504 # hit_count <= max_hits - assert results[0].query == queries[0] - assert results[-1].hit_count == 0 - assert len(results[-1].hits) == 0 - assert results[-1].query == queries[-1] + bwa = Bwa(ref=ref_fasta, max_hits=10000) + # empty queries + assert bwa.map_all([]) == [] + + # many queries + queries = [] + for length in range(5, 101): + queries.append(Query(bases="A" * length, id="NA")) + results = bwa.map_all(queries) + assert len(results) == len(queries) + assert all(results[i].query == queries[i] for i in range(len(results))) + assert results[0].hit_count == 7504 + assert len(results[0].hits) == 7504 # hit_count <= max_hits + assert results[0].query == queries[0] + assert results[-1].hit_count == 0 + assert len(results[-1].hits) == 0 + assert results[-1].query == queries[-1] _PERFECT_BASES: str = "AGTGATGCTAAGGGTCAAATAAGTCACCAGCAAATACACAGCACACATCTCATGATGTGC" @@ -217,26 +193,26 @@ def test_map_single_hit( ) -> None: """Tests that bwa maps one hit when a query uniquely maps. Checks for the hit's edit and mismatches properties.""" - with BwaAlnInteractive( + bwa = Bwa( ref=ref_fasta, max_hits=1, max_mismatches=5, max_gap_opens=1, min_indel_to_end_distance=5, reverse_complement=reverse_complement, - ) as bwa: - query = Query(bases=bases, id=f"{hit}") - result = bwa.map_one(query=query.bases, id=query.id) - assert result.hit_count == 1, "hit_count" - assert result.query == query, "query" - assert result.hits[0].refname == hit.refname, "chr" - assert result.hits[0].start == hit.start, "start" - assert result.hits[0].negative == hit.negative, "negative" - assert result.hits[0].cigar == hit.cigar, "cigar" - assert result.hits[0].edits == hit.edits, "edits" - assert result.hits[0].end == end, "end" - assert result.hits[0].mismatches == mismatches, "mismatches" - assert result.hits[0].mismatches == hit.mismatches, "hit.mismatches" + ) + query = Query(bases=bases, id=f"{hit}") + result = bwa.map_one(query=query.bases, id=query.id) + assert result.hit_count == 1, "hit_count" + assert result.query == query, "query" + assert result.hits[0].refname == hit.refname, "chr" + assert result.hits[0].start == hit.start, "start" + assert result.hits[0].negative == hit.negative, "negative" + assert result.hits[0].cigar == hit.cigar, "cigar" + assert result.hits[0].edits == hit.edits, "edits" + assert result.hits[0].end == end, "end" + assert result.hits[0].mismatches == mismatches, "mismatches" + assert result.hits[0].mismatches == hit.mismatches, "hit.mismatches" @pytest.mark.parametrize("reverse_complement", [True, False]) @@ -248,56 +224,56 @@ def test_map_multi_hit(ref_fasta: Path, reverse_complement: bool) -> None: BwaHit.build("chr1", 8701, False, "31M", 0), ] bases: str = "AAGTGCTGGGATTACAGGCATGAGCCACCAC" - with BwaAlnInteractive( + bwa = Bwa( ref=ref_fasta, max_hits=len(expected_hits), max_mismatches=mismatches, max_gap_opens=0, reverse_complement=reverse_complement, - ) as bwa: - query = Query(bases=bases, id="test") - actual = bwa.map_one(query=query.bases, id=query.id) - assert actual.hit_count == 3, "hit_count" - assert actual.query == query, "query" - actual_hits = sorted(actual.hits, key=lambda hit: (hit.refname, hit.start)) - assert len(actual_hits) == len(expected_hits) - for actual_hit, expected_hit in zip(actual_hits, expected_hits, strict=True): - assert actual_hit.refname == expected_hit.refname, "chr" - assert actual_hit.start == expected_hit.start, "start" - assert actual_hit.negative == expected_hit.negative, "negative" - assert actual_hit.cigar == expected_hit.cigar, "cigar" - assert actual_hit.edits == expected_hit.edits, "edits" - assert actual_hit.end == expected_hit.start + 30, "end" - assert actual_hit.mismatches == mismatches, "mismatches" - assert actual_hit.mismatches == expected_hit.mismatches, "hit.mismatches" + ) + query = Query(bases=bases, id="test") + actual = bwa.map_one(query=query.bases, id=query.id) + assert actual.hit_count == 3, "hit_count" + assert actual.query == query, "query" + actual_hits = sorted(actual.hits, key=lambda hit: (hit.refname, hit.start)) + assert len(actual_hits) == len(expected_hits) + for actual_hit, expected_hit in zip(actual_hits, expected_hits, strict=True): + assert actual_hit.refname == expected_hit.refname, "chr" + assert actual_hit.start == expected_hit.start, "start" + assert actual_hit.negative == expected_hit.negative, "negative" + assert actual_hit.cigar == expected_hit.cigar, "cigar" + assert actual_hit.edits == expected_hit.edits, "edits" + assert actual_hit.end == expected_hit.start + 30, "end" + assert actual_hit.mismatches == mismatches, "mismatches" + assert actual_hit.mismatches == expected_hit.mismatches, "hit.mismatches" def test_to_result_out_of_order(ref_fasta: Path) -> None: - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - query = Query(bases="GATTACA", id="foo") - rec = SamBuilder().add_single(name="bar") - with pytest.raises(ValueError, match="Query and Results are out of order"): - bwa._to_result(query=query, rec=rec) + bwa = Bwa(ref=ref_fasta, max_hits=1) + query = Query(bases="GATTACA", id="foo") + rec = SamBuilder().add_single(name="bar") + with pytest.raises(ValueError, match="Query and Results are out of order"): + bwa._to_result(query=query, rec=rec) def test_to_result_num_hits_on_unmapped(ref_fasta: Path) -> None: - with BwaAlnInteractive(ref=ref_fasta, max_hits=1) as bwa: - query = Query(bases="GATTACA", id="foo") - - # HN *can* be non-zero - rec = SamBuilder().add_single(name=query.id, attrs={"HN": 42}) - result = bwa._to_result(query=query, rec=rec) - assert result.hit_count == 42 - assert result.hits == [] - - # OK: HN tag is zero - rec = SamBuilder().add_single(name=query.id, attrs={"HN": 0}) - result = bwa._to_result(query=query, rec=rec) - assert result.hit_count == 0 - assert result.hits == [] - - # OK: no HN tag - rec = SamBuilder().add_single(name=query.id) - result = bwa._to_result(query=query, rec=rec) - assert result.hit_count == 0 - assert result.hits == [] + bwa = Bwa(ref=ref_fasta, max_hits=1) + query = Query(bases="GATTACA", id="foo") + + # HN *can* be non-zero + rec = SamBuilder().add_single(name=query.id, attrs={"HN": 42}) + result = bwa._to_result(query=query, rec=rec) + assert result.hit_count == 42 + assert result.hits == [] + + # OK: HN tag is zero + rec = SamBuilder().add_single(name=query.id, attrs={"HN": 0}) + result = bwa._to_result(query=query, rec=rec) + assert result.hit_count == 0 + assert result.hits == [] + + # OK: no HN tag + rec = SamBuilder().add_single(name=query.id) + result = bwa._to_result(query=query, rec=rec) + assert result.hit_count == 0 + assert result.hits == [] diff --git a/tests/offtarget/test_offtarget.py b/tests/offtarget/test_offtarget.py index 99cc68a..a6b8206 100644 --- a/tests/offtarget/test_offtarget.py +++ b/tests/offtarget/test_offtarget.py @@ -9,7 +9,6 @@ from prymer import PrimerPair from prymer import Span from prymer import Strand -from prymer.offtarget.bwa import BWA_EXECUTABLE_NAME from prymer.offtarget.bwa import BwaHit from prymer.offtarget.bwa import BwaResult from prymer.offtarget.bwa import Query @@ -39,7 +38,6 @@ def _build_detector( cache_results=cache_results, keep_spans=True, keep_primer_spans=True, - executable=BWA_EXECUTABLE_NAME, )