From c0d9e00bf0f0220048eb1c9dc2bbc613c8318997 Mon Sep 17 00:00:00 2001 From: Ted Brookings Date: Tue, 14 Nov 2023 16:50:02 -0500 Subject: [PATCH] Update for PR * Rebased onto current main * Update argument names to avoid shadowing "input" * Allow samtools index for multiple paths --- ci/check.sh | 1 + fgpyo/io/tests/test_io.py | 8 +- fgpyo/sam/__init__.py | 2 + fgpyo/sam/builder.py | 7 +- fgpyo/samtools.py | 52 ++++++------ fgpyo/tests/test_samtools.py | 155 ++++++++++++++++++----------------- fgpyo/util/inspect.py | 5 +- pyproject.toml | 1 - 8 files changed, 120 insertions(+), 111 deletions(-) diff --git a/ci/check.sh b/ci/check.sh index dca833f7..456b8588 100755 --- a/ci/check.sh +++ b/ci/check.sh @@ -44,6 +44,7 @@ if [[ "$1" == "--check" ]]; then fi banner "Executing in conda environment ${CONDA_DEFAULT_ENV} in directory ${root}" +run "Checking poetry" "poetry check" run "Unit Tests" "python -m pytest -vv -r sx fgpyo" run "Import Sorting" "isort --force-single-line-imports --profile black fgpyo" run "Style Checking" "black --line-length 99 $black_extra_args fgpyo" diff --git a/fgpyo/io/tests/test_io.py b/fgpyo/io/tests/test_io.py index 1e4bc559..11fe86df 100644 --- a/fgpyo/io/tests/test_io.py +++ b/fgpyo/io/tests/test_io.py @@ -86,8 +86,8 @@ def test_assert_path_is_writeable_pass() -> None: @pytest.mark.parametrize( "suffix, expected", [ - (".gz", io._io.TextIOWrapper), - (".fa", io._io.TextIOWrapper), + (".gz", io.TextIOWrapper), + (".fa", io.TextIOWrapper), ], ) def test_reader( @@ -103,8 +103,8 @@ def test_reader( @pytest.mark.parametrize( "suffix, expected", [ - (".gz", io._io.TextIOWrapper), - (".fa", io._io.TextIOWrapper), + (".gz", io.TextIOWrapper), + (".fa", io.TextIOWrapper), ], ) def test_writer( diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index f77b75a0..64e5ec57 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -912,3 +912,5 @@ class SamOrder(enum.Enum): Coordinate = "coordinate" #: coordinate sorted QueryName = "queryname" #: queryname sorted Unknown = "unknown" # Unknown SAM / BAM / CRAM sort order + # Sort by template-coordinate, SO is unsorted, GO is query, SS is template-coordinate: + TemplateCoordinate = "unsorted" diff --git a/fgpyo/sam/builder.py b/fgpyo/sam/builder.py index b93f9b34..39c6e1a2 100755 --- a/fgpyo/sam/builder.py +++ b/fgpyo/sam/builder.py @@ -137,6 +137,11 @@ def __init__( "SQ": (sd if sd is not None else SamBuilder.default_sd()), "RG": [(rg if rg is not None else SamBuilder.default_rg())], } + if sort_order == SamOrder.TemplateCoordinate: + self._header["HD"] = { + **self._header["HD"], + **{"GO": "query", "SS": "template-coordinate"}, + } if extra_header is not None: self._header = {**self._header, **extra_header} self._samheader = AlignmentHeader.from_dict(self._header) @@ -592,7 +597,7 @@ def to_path( file_handle.close() if self._sort_order not in {SamOrder.Unsorted, SamOrder.Unknown}: samtools.sort( - input=Path(fp.name), + alignment_file=Path(fp.name), output=path, index_output=index, sort_order=self._sort_order, diff --git a/fgpyo/samtools.py b/fgpyo/samtools.py index 189dbc5b..3188e74b 100644 --- a/fgpyo/samtools.py +++ b/fgpyo/samtools.py @@ -11,9 +11,9 @@ >>> from fgpyo import samtools >>> Path("example1.sorted.bam.bai").exists() False - >>> samtools.sort(input=Path("example1.bam"), output=Path("example1.sorted.bam")) + >>> samtools.sort(alignment_file=Path("example1.bam"), output=Path("example1.sorted.bam")) '' - >>> samtools.index(input=Path("example1.sorted.bam")) + >>> samtools.index(alignment_file=Path("example1.sorted.bam")) '' >>> Path("example1.sorted.bam.bai").exists() True @@ -35,7 +35,9 @@ from typing import TYPE_CHECKING from typing import List from typing import Optional +from typing import Sequence from typing import Tuple +from typing import Union from fgpyo.sam import SamFileType from fgpyo.sam import SamOrder @@ -61,7 +63,7 @@ def pysam_fn(*args: str) -> str: def dict( - input: Path, + fasta_file: Path, output: Path, no_header: bool = False, alias: bool = False, @@ -72,6 +74,7 @@ def dict( ) -> str: """ Calls the `samtools` function `dict` using the pysam samtools dispatcher. + Create a dictionary file from an input FASTA file. Arguments will be formatted into the appropriate command-line call which will be invoked using the pysam dispatcher. @@ -79,7 +82,7 @@ def dict( Returns the stdout of the resulting call to pysam.dict() Args: - input: Path to the FASTA files to generate a sequence dictionary for. + fasta_file: Path to the FASTA files to generate a sequence dictionary for. output: Path to the file to write output to. no_header: If true will not print the @HD header line. assembly: The assembly for the AS tag. @@ -114,7 +117,7 @@ def dict( if uri is not None: args.extend(["--uri", uri]) - args.append(str(input)) + args.append(str(fasta_file)) return _pysam_dict(*args) @@ -128,7 +131,7 @@ class FaidxMarkStrand(enum.Enum): def faidx( - input: Path, + sequence_file: Path, output: Optional[Path] = None, length: int = 60, regions: Optional[List[str]] = None, @@ -143,6 +146,7 @@ def faidx( ) -> str: """ Calls the `samtools` function `faidx` using the pysam samtools dispatcher. + Index an input FASTA or FASTQ file. Arguments will be formatted into the appropriate command-line call which will be invoked using the pysam dispatcher. @@ -150,8 +154,8 @@ def faidx( Returns the stdout of the resulting call to pysam.faidx() Args: - input: Path to the FAIDX files to index / read from. - output: Path to the file to write FASTA output. + sequence_file: Path to the FASTA/FASTQ file to index. + output: Path to the file to write output index file. length: Length of FASTA sequence line. regions: regions to extract from the FASTA file in samtools region format (:<1-based start>-<1-based end>) @@ -159,7 +163,7 @@ def faidx( region format (:<1-based start>-<1-based end>). 1 region descriptor per line. fai_idx: Read/Write to specified FAI index file. gzi_idx: Read/Write to specified compressed file index (used with .gz files). - continue_if_non_existent: If true continue qorking if a non-existent region is requested. + continue_if_non_existent: If true continue working if a non-existent region is requested. fastq: Read FASTQ files and output extracted sequences in FASTQ format. Same as using samtools fqidx. reverse_complement: Output the sequence as the reverse complement. When this option is @@ -215,7 +219,7 @@ def faidx( if region_file is not None: args.extend(["--region-file", str(region_file)]) - args.append(str(input)) + args.append(str(sequence_file)) if regions is not None: args.extend(regions) @@ -230,9 +234,7 @@ class SamIndexType(enum.Enum): def index( - input: Path, - # See https://github.com/pysam-developers/pysam/issues/1155 - # inputs: List[Path], Cant currently accept multiple + alignment_file: Union[Path, Sequence[Path]], output: Optional[Path] = None, threads: int = 0, index_type: SamIndexType = SamIndexType.BAI, @@ -247,7 +249,7 @@ def index( Returns the stdout of the resulting call to pysam.index() Args: - input: Path to the SAM/BAM/CRAM file to index. + alignment_file: Path to the SAM/BAM/CRAM file to index. output: Path to the file to write output. (Currently may only be used when exactly one alignment file is being indexed.) threads: Number of input/output compression threads to use in addition to main thread. @@ -255,16 +257,12 @@ def index( csi_index_size: Sets the minimum interval size of CSI indices to 2^INT. """ - # assert len(inputs) >= 1, "Must provide at least one input to samtools index." + if isinstance(alignment_file, Path): + alignment_file = [alignment_file] + if output is not None and len(alignment_file) != 1: + raise ValueError("Only one output may be specified, for indexing only one alignment file") - # if len(inputs) != 1: - # assert ( - # output is None - # ), "Output currently can only be used if there is exactly one input file being indexed" - # args = ["-M"] - # else: - # args = [] - args = [] + args = [] if len(alignment_file) == 1 else ["-M"] if index_type != SamIndexType.BAI: args.append(index_type.value) @@ -272,7 +270,7 @@ def index( if index_type == SamIndexType.CSI: args.extend(["-m", str(csi_index_size)]) args.extend(["-@", str(threads)]) - args.append(str(input)) + args.extend(f"{input_path}" for input_path in alignment_file) if output is not None: args.extend(["-o", str(output)]) @@ -280,7 +278,7 @@ def index( def sort( - input: Path, + alignment_file: Path, output: Path, index_output: bool = True, sort_unmapped_reads: bool = False, @@ -303,7 +301,7 @@ def sort( Returns the stdout of the resulting call to pysam.sort() Args: - input: Path to the SAM/BAM/CRAM file to sort. + alignment_file: Path to the SAM/BAM/CRAM file to sort. output: Path to the file to write output. index_output: If true, creates an index for the output file. sort_unmapped_reads: If true, sort unmapped reads by their sequence minimizer, reverse @@ -372,6 +370,6 @@ def sort( if no_pg: args.append("--no-PG") - args.extend(["-o", output_string, str(input)]) + args.extend(["-o", output_string, str(alignment_file)]) return _pysam_sort(*args) diff --git a/fgpyo/tests/test_samtools.py b/fgpyo/tests/test_samtools.py index ee0f97a9..d678f491 100644 --- a/fgpyo/tests/test_samtools.py +++ b/fgpyo/tests/test_samtools.py @@ -2,7 +2,7 @@ from typing import List from typing import Optional -import bgzip +import pysam import pytest from pysam.utils import SamtoolsError @@ -65,7 +65,7 @@ def test_dict_produces_sequence_dict( ) -> None: output_access = tmp_path / "example.dict" assert not output_access.exists() - samtools.dict(input=example_dict_fasta, output=output_access, uri="consistent_location") + samtools.dict(fasta_file=example_dict_fasta, output=output_access, uri="consistent_location") assert output_access.exists() output_contents: str @@ -82,7 +82,10 @@ def test_dict_no_header_works( output_access = tmp_path / "example.dict" assert not output_access.exists() samtools.dict( - input=example_dict_fasta, output=output_access, no_header=True, uri="consistent_location" + fasta_file=example_dict_fasta, + output=output_access, + no_header=True, + uri="consistent_location", ) assert output_access.exists() @@ -117,7 +120,7 @@ def test_dict_other_tags_work( output_access = tmp_path / "example.dict" assert not output_access.exists() samtools.dict( - input=example_dict_fasta, + fasta_file=example_dict_fasta, output=output_access, alias=True, assembly="test1", @@ -177,13 +180,13 @@ def test_faidx_produces_functional_index( # Make sure we're producing the index assert not output_index_expected.exists() - samtools.faidx(input=example_faidx_fasta) + samtools.faidx(sequence_file=example_faidx_fasta) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, output=output_access, regions=["chr1:1-7", "chr2:8-13"] + sequence_file=example_faidx_fasta, output=output_access, regions=["chr1:1-7", "chr2:8-13"] ) output_contents: str @@ -202,13 +205,15 @@ def test_faidx_fails_if_non_existent_region_requested( # Make sure we're producing the index assert not output_index_expected.exists() - samtools.faidx(input=example_faidx_fasta) + samtools.faidx(sequence_file=example_faidx_fasta) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, output=output_access, regions=["chr3:1-4", "chr1:1-7"] + sequence_file=example_faidx_fasta, + output=output_access, + regions=["chr3:1-4", "chr1:1-7"], ) @@ -219,12 +224,12 @@ def test_faidx_passes_if_non_existent_region_requested_when_continue_passed( output_index_expected = Path(f"{example_faidx_fasta}.fai") assert not output_index_expected.exists() - samtools.faidx(input=example_faidx_fasta) + samtools.faidx(sequence_file=example_faidx_fasta) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, continue_if_non_existent=True, regions=["chr3:1-4", "chr1:1-7"], @@ -239,7 +244,7 @@ def test_faidx_regions_and_regions_file_result_in_same_thing( # Make sure we're producing the index assert not output_index_expected.exists() - samtools.faidx(input=example_faidx_fasta) + samtools.faidx(sequence_file=example_faidx_fasta) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" @@ -251,13 +256,15 @@ def test_faidx_regions_and_regions_file_result_in_same_thing( with region_file.open("w") as region_fh: region_fh.writelines([f"{region}\n" for region in regions]) - samtools.faidx(input=example_faidx_fasta, output=output_access, regions=regions) + samtools.faidx(sequence_file=example_faidx_fasta, output=output_access, regions=regions) manually_passed_output_contents: str with output_access.open("r") as subset_fasta: manually_passed_output_contents = subset_fasta.read() - samtools.faidx(input=example_faidx_fasta, output=output_access, region_file=region_file) + samtools.faidx( + sequence_file=example_faidx_fasta, output=output_access, region_file=region_file + ) file_passed_output_contents: str with output_access.open("r") as subset_fasta: @@ -275,14 +282,14 @@ def test_length_parameter( # Make sure we're producing the index assert not output_index_expected.exists() samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, ) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, regions=["chr1:1-7", "chr2:8-13"], length=5, @@ -312,14 +319,14 @@ def test_rc_parameter( # Make sure we're producing the index assert not output_index_expected.exists() samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, ) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, reverse_complement=True, regions=["chr1:1-7", "chr2:8-13"], @@ -355,14 +362,14 @@ def test_mark_strand_parameters( # Make sure we're producing the index assert not output_index_expected.exists() samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, ) assert output_index_expected.exists() output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, reverse_complement=False, mark_strand=mark_strand, @@ -376,7 +383,7 @@ def test_mark_strand_parameters( assert output_contents == SUBSET_FASTA_TEMPLATE.format(mark_strand=expected_fwd_mark_strand) samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, reverse_complement=True, mark_strand=mark_strand, @@ -432,7 +439,7 @@ def test_fastq_parameter( # Make sure we're producing the index assert not output_index_expected.exists() samtools.faidx( - input=example_fastq, + sequence_file=example_fastq, fastq=True, ) assert output_index_expected.exists() @@ -440,7 +447,7 @@ def test_fastq_parameter( output_access = tmp_path / "output_subset.fq" # Make sure the index is functional samtools.faidx( - input=example_fastq, + sequence_file=example_fastq, output=output_access, regions=["chr1:1-7", "chr2:8-13"], fastq=True, @@ -457,9 +464,8 @@ def test_fastq_parameter( def example_faidx_fasta_gz(tmp_path: Path) -> Path: outfile = tmp_path / "example.fa.gz" - with outfile.open(mode="wb") as out_fh: - with bgzip.BGZipWriter(out_fh) as fh: - fh.write(bytes(EXAMPLE_FAIDX_FASTA, "Utf-8")) + with pysam.BGZFile(f"{outfile}", mode="wb", index=None) as fh: + fh.write(bytes(EXAMPLE_FAIDX_FASTA, "Utf-8")) return outfile @@ -474,7 +480,7 @@ def test_index_outputs( # Make sure we're producing the index assert not example_fai.exists() samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, fai_idx=example_fai, ) assert example_fai.exists() @@ -482,7 +488,7 @@ def test_index_outputs( output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, regions=["chr1:1-7", "chr2:8-13"], fai_idx=example_fai, @@ -496,7 +502,7 @@ def test_index_outputs( example_gzi = Path(f"{example_faidx_fasta_gz}.gzi") assert not example_gzi.exists() samtools.faidx( - input=example_faidx_fasta_gz, + sequence_file=example_faidx_fasta_gz, gzi_idx=example_gzi, ) assert example_gzi.exists() @@ -504,7 +510,7 @@ def test_index_outputs( output_access = tmp_path / "output_subset.fa" # Make sure the index is functional samtools.faidx( - input=example_faidx_fasta, + sequence_file=example_faidx_fasta, output=output_access, regions=["chr1:1-7", "chr2:8-13"], gzi_idx=example_gzi, @@ -537,7 +543,7 @@ def test_index_works_with_one_input( builder.add_pair(name="test3", chrom="chr2", start1=4000, start2=4300) builder.add_pair(name="test4", chrom="chr5", start1=4000, start2=4300) - # At the moment sam builder doesnt support generating CRAM and SAM formats, so for now we're + # At the moment sam builder doesn't support generating CRAM and SAM formats, so for now we're # only testing on BAMs input_file = tmp_path / "test_input.bam" builder.to_path(input_file) @@ -547,53 +553,51 @@ def test_index_works_with_one_input( # Make sure we're producing the index assert not output_index_expected.exists() # samtools.index(inputs=[input_file], index_type=index_type) - samtools.index(input=input_file, index_type=index_type) + samtools.index(alignment_file=input_file, index_type=index_type) assert output_index_expected.exists() -# Can't accept multiple inputs at the moment. -# See https://github.com/pysam-developers/pysam/issues/1155 -# @pytest.mark.parametrize( -# argnames=["index_type"], -# argvalues=[ -# (SamIndexType.BAI,), -# (SamIndexType.CSI,), -# ], -# ids=["BAI", "CSI"], -# ) -# def test_index_works_with_multiple_inputs( -# tmp_path: Path, -# index_type: SamIndexType, -# ) -> None: -# builder = SamBuilder(sort_order=SamOrder.Coordinate) -# builder.add_pair(name="test1", chrom="chr1", start1=4000, start2=4300) -# builder.add_pair( -# name="test2", chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+" -# ) -# builder.add_pair(name="test3", chrom="chr2", start1=4000, start2=4300) -# builder.add_pair(name="test4", chrom="chr5", start1=4000, start2=4300) - -# # At the moment sam builder doesnt support generating CRAM and SAM formats, so for now we're -# # only testing on BAMs -# input_file1 = tmp_path / "test_input1.bam" -# builder.to_path(input_file1) -# input_file2 = tmp_path / "test_input2.bam" -# builder.to_path(input_file2) - -# inputs = [ -# input_file1, -# input_file2, -# ] - -# # Make sure we're producing the indices -# for input in inputs: -# output_index_expected = Path(f"{input}.{index_type._name_.lower()}") -# assert not output_index_expected.exists() - -# samtools.index(inputs=inputs, index_type=index_type) -# for input in inputs: -# output_index_expected = Path(f"{input}.{index_type._name_.lower()}") -# assert output_index_expected.exists() +@pytest.mark.parametrize( + argnames=["index_type"], + argvalues=[ + (SamIndexType.BAI,), + (SamIndexType.CSI,), + ], + ids=["BAI", "CSI"], +) +def test_index_works_with_multiple_inputs( + tmp_path: Path, + index_type: SamIndexType, +) -> None: + builder = SamBuilder(sort_order=SamOrder.Coordinate) + builder.add_pair(name="test1", chrom="chr1", start1=4000, start2=4300) + builder.add_pair( + name="test2", chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+" + ) + builder.add_pair(name="test3", chrom="chr2", start1=4000, start2=4300) + builder.add_pair(name="test4", chrom="chr5", start1=4000, start2=4300) + + # At the moment sam builder doesn't support generating CRAM and SAM formats, so for now we're + # only testing on BAMs + input_file1 = tmp_path / "test_input1.bam" + builder.to_path(input_file1) + input_file2 = tmp_path / "test_input2.bam" + builder.to_path(input_file2) + + inputs = [ + input_file1, + input_file2, + ] + + # Make sure we're producing the indices + for alignment_file in inputs: + output_index_expected = Path(f"{alignment_file}.{index_type._name_.lower()}") + assert not output_index_expected.exists() + + samtools.index(alignment_file=inputs, index_type=index_type) + for alignment_file in inputs: + output_index_expected = Path(f"{alignment_file}.{index_type._name_.lower()}") + assert output_index_expected.exists() @pytest.mark.parametrize( @@ -629,7 +633,6 @@ def test_sort_types( sort_order: Optional[SamOrder], expected_name_order: List[str], ) -> None: - builder = SamBuilder(sort_order=SamOrder.Unsorted) builder.add_pair( name="test3", chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+" @@ -644,7 +647,7 @@ def test_sort_types( builder.to_path(input_file) samtools.sort( - input=input_file, + alignment_file=input_file, output=output_file, index_output=index_output, sort_order=sort_order, diff --git a/fgpyo/util/inspect.py b/fgpyo/util/inspect.py index 6a69a6b5..1a7a32d4 100644 --- a/fgpyo/util/inspect.py +++ b/fgpyo/util/inspect.py @@ -1,3 +1,4 @@ +import sys from typing import Any from typing import Dict from typing import List @@ -5,9 +6,9 @@ from typing import Type from typing import Union -try: # py>=38 +if sys.version_info >= (3, 8): from typing import Literal -except ImportError: # py<38 +else: from typing_extensions import Literal import functools diff --git a/pyproject.toml b/pyproject.toml index 1cea5b74..783be7c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ flake8 = [ { version = ">=6.1.0", python = ">=3.12.0" }, ] black = ">=19.10b0" -bgzip = ">=0.4.0" pytest-cov = ">=2.8.1" isort = ">=5.10.1"