-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding initial batch of samtools type hinted wrapper functions
- Loading branch information
Nathan Roach
committed
Dec 30, 2022
1 parent
299cc7c
commit d4c66a7
Showing
14 changed files
with
1,175 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
def pysam_fn(*args: str) -> None: | ||
""" | ||
Type hinted import of an example function from the pysam samtools dispatcher. | ||
""" | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from fgpyo.pysam_dispatch.samtools.dict import dict | ||
from fgpyo.pysam_dispatch.samtools.faidx import faidx | ||
from fgpyo.pysam_dispatch.samtools.index import index | ||
from fgpyo.pysam_dispatch.samtools.sort import sort | ||
|
||
__all__ = ["dict", "faidx", "index", "sort"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
""" | ||
Type-Hinted Wrapper around pysam samtools dict dispatch | ||
------------------------------------------------------- | ||
""" | ||
|
||
from pathlib import Path | ||
from typing import TYPE_CHECKING | ||
from typing import List | ||
from typing import Optional | ||
|
||
if TYPE_CHECKING: | ||
from fgpyo.pysam_dispatch import pysam_fn as _pysam_dict | ||
else: | ||
from pysam import dict as _pysam_dict | ||
|
||
|
||
def dict( | ||
input: Path, | ||
output: Path, | ||
no_header: bool = False, | ||
alias: bool = False, | ||
assembly: Optional[str] = None, | ||
alt: Optional[Path] = None, | ||
species: Optional[str] = None, | ||
uri: Optional[str] = None, | ||
) -> List[str]: | ||
""" | ||
Calls the `samtools` function `dict` using the pysam samtools dispatcher. | ||
Arguments will be formatted into the appropriate command-line call which will be invoked | ||
using the pysam dispatcher. | ||
Returns the list of arguments passed to pysam.dict() | ||
Args: | ||
input: Path to the FASTA files to generate a sequence dictionary for. | ||
output: Path to the file to write output to. | ||
no_header: If true will not print the @HD header line. | ||
assembly: The assembly for the AS tag. | ||
alias: Adds an AN tag with the same value as the SN tag, except that a 'chr' prefix is | ||
removed if SN has one or added if it does not. For mitochondria (i.e., when SN is “M” | ||
or “MT”, with or without a “chr” prefix), also adds the remaining combinations of | ||
“chr/M/MT” to the AN tag. | ||
alt: Add an AH tag to each sequence listed in the specified bwa-style .alt file. These | ||
files use SAM records to represent alternate locus sequences (as named in the QNAME | ||
field) and their mappings to the primary assembly. | ||
species: Specify the species for the SP tag. | ||
uri: Specify the URI for the UR tag. Defaults to the absolute path of ref.fasta. | ||
""" | ||
|
||
args: List[str] = ["--output", str(output)] | ||
|
||
if no_header: | ||
args.append("--no-header") | ||
|
||
if alias: | ||
args.append("--alias") | ||
|
||
if assembly is not None: | ||
args.extend(["--assembly", assembly]) | ||
|
||
if alt is not None: | ||
args.extend(["--alt", str(alt)]) | ||
|
||
if species is not None: | ||
args.extend(["--species", species]) | ||
|
||
if uri is not None: | ||
args.extend(["--uri", uri]) | ||
|
||
args.append(str(input)) | ||
|
||
_pysam_dict(*args) | ||
|
||
return args |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
""" | ||
Type-Hinted Wrapper around pysam samtools faidx dispatch | ||
-------------------------------------------------------- | ||
""" | ||
|
||
import enum | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING | ||
from typing import List | ||
from typing import Optional | ||
from typing import Tuple | ||
|
||
if TYPE_CHECKING: | ||
from fgpyo.pysam_dispatch import pysam_fn as _pysam_faidx | ||
else: | ||
from pysam import faidx as _pysam_faidx | ||
|
||
|
||
@enum.unique | ||
class FaidxMarkStrand(enum.Enum): | ||
RevComp = "rc" | ||
No = "no" | ||
Sign = "sign" | ||
Custom = "custom" | ||
|
||
|
||
def faidx( | ||
input: Path, | ||
output: Optional[Path] = None, | ||
length: int = 60, | ||
regions: Optional[List[str]] = None, | ||
region_file: Optional[Path] = None, | ||
fai_idx: Optional[Path] = None, | ||
gzi_idx: Optional[Path] = None, | ||
continue_if_non_existent: bool = False, | ||
fastq: bool = False, | ||
reverse_complement: bool = False, | ||
mark_strand: FaidxMarkStrand = FaidxMarkStrand.RevComp, | ||
custom_mark_strand: Optional[Tuple[str, str]] = None, | ||
) -> List[str]: | ||
""" | ||
Calls the `samtools` function `faidx` using the pysam samtools dispatcher. | ||
Arguments will be formatted into the appropriate command-line call which will be invoked | ||
using the pysam dispatcher. | ||
Returns the list of arguments passed to pysam.faidx() | ||
Args: | ||
input: Path to the FAIDX files to index / read from. | ||
output: Path to the file to write FASTA output. | ||
length: Length of FASTA sequence line. | ||
regions: regions to extract from the FASTA file in samtools region format | ||
(<chrom>:<1-based start>-<1-based end>) | ||
region_file: Path to file containing regions to extract from the FASTA file in samtools | ||
region format (<chrom>:<1-based start>-<1-based end>). 1 region descriptor per line. | ||
fai_idx: Read/Write to specified FAI index file. | ||
gzi_idx: Read/Write to specified compressed file index (used with .gz files). | ||
continue_if_non_existent: If true continue qorking if a non-existent region is requested. | ||
fastq: Read FASTQ files and output extracted sequences in FASTQ format. Same as using | ||
samtools fqidx. | ||
reverse_complement: Output the sequence as the reverse complement. When this option is | ||
used, “/rc” will be appended to the sequence names. To turn this off or change the | ||
string appended, use the mark_strand parameter. | ||
mark_strand: Append strand indicator to sequence name. Type to append can be one of: | ||
FaidxMarkStrand.RevComp - Append '/rc' when writing the reverse complement. This is the | ||
default. | ||
FaidxMarkStrand.No - Do not append anything. | ||
FaidxMarkStrand.Sign - Append '(+)' for forward strand or '(-)' for reverse | ||
complement. This matches the output of “bedtools getfasta -s”. | ||
FaidxMarkStrand.Custom - custom,<custom_mark_strand[0]>,<custom_mark_strand[1]> | ||
Append string <pos> to names when writing the forward strand and <neg> when writing the | ||
reverse strand. Spaces are preserved, so it is possible to move the indicator into the | ||
comment part of the description line by including a leading space in the strings | ||
<custom_mark_strand[0]> and <custom_mark_strand[1]>. | ||
custom_mark_strand: The custom strand indicators to use in in the Custom MarkStrand | ||
setting. The first value of the tuple will be used as the positive strand indicator, | ||
the second value will be used as the negative strand indicator. | ||
""" | ||
|
||
mark_strand_str: str | ||
if mark_strand == FaidxMarkStrand.Custom: | ||
assert custom_mark_strand is not None, ( | ||
"Cannot use custom mark strand without providing the custom strand indicators to " | ||
+ "`custom_mark_string`" | ||
) | ||
mark_strand_str = f"{mark_strand.value},{custom_mark_strand[0]},{custom_mark_strand[1]}" | ||
else: | ||
mark_strand_str = mark_strand.value | ||
|
||
args: List[str] = ["--length", str(length), "--mark-strand", mark_strand_str] | ||
|
||
if output is not None: | ||
args.extend(["--output", str(output)]) | ||
|
||
if continue_if_non_existent: | ||
args.append("--continue") | ||
if reverse_complement: | ||
args.append("--reverse-complement") | ||
if fastq: | ||
args.append("--fastq") | ||
|
||
if fai_idx is not None: | ||
args.extend(["--fai-idx", str(fai_idx)]) | ||
|
||
if gzi_idx is not None: | ||
args.extend(["--gzi-idx", str(gzi_idx)]) | ||
|
||
if region_file is not None: | ||
args.extend(["--region-file", str(region_file)]) | ||
|
||
args.append(str(input)) | ||
|
||
if regions is not None: | ||
args.extend(regions) | ||
|
||
print(args) | ||
_pysam_faidx(*args) | ||
|
||
return args |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
""" | ||
Type-Hinted Wrapper around pysam samtools index dispatch | ||
-------------------------------------------------------- | ||
""" | ||
|
||
import enum | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING | ||
from typing import List | ||
from typing import Optional | ||
|
||
if TYPE_CHECKING: | ||
from fgpyo.pysam_dispatch import pysam_fn as _pysam_index | ||
else: | ||
from pysam import index as _pysam_index | ||
|
||
|
||
@enum.unique | ||
class SamIndexType(enum.Enum): | ||
BAI = "-b" | ||
CSI = "-c" | ||
|
||
|
||
def index( | ||
inputs: List[Path], | ||
output: Optional[Path] = None, | ||
threads: int = 0, | ||
index_type: SamIndexType = SamIndexType.BAI, | ||
csi_index_size: int = 14, | ||
) -> List[str]: | ||
""" | ||
Calls the `samtools` function `index` using the pysam samtools dispatcher. | ||
Arguments will be formatted into the appropriate command-line call which will be invoked | ||
using the pysam dispatcher. | ||
Returns the list of arguments passed to pysam.index() | ||
Args: | ||
inputs: Paths to the SAM/BAM/CRAM files to index. | ||
output: Path to the file to write output. (Currently may only be used when exactly one | ||
alignment file is being indexed.) | ||
threads: Number of input/output compression threads to use in addition to main thread. | ||
index_type: The type of index file to produce when indexing. | ||
csi_index_size: Sets the minimum interval size of CSI indices to 2^INT. | ||
""" | ||
|
||
assert len(inputs) >= 1, "Must provide at least one input to samtools index." | ||
|
||
if len(inputs) != 1: | ||
assert ( | ||
output is None | ||
), "Output currently can only be used if there is exactly one input file being indexed" | ||
args = ["-M"] | ||
else: | ||
args = [] | ||
|
||
if index_type != SamIndexType.BAI: | ||
args.append(index_type.value) | ||
|
||
if index_type == SamIndexType.CSI: | ||
args.extend(["-m", str(csi_index_size)]) | ||
args.extend(["-@", str(threads)]) | ||
args.extend([str(x) for x in inputs]) | ||
if output is not None: | ||
args.extend(["-o", str(output)]) | ||
print(args) | ||
_pysam_index(*args) | ||
|
||
return args |
Oops, something went wrong.