From b40548ea78d327f51743c0589b03328cc0928071 Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 27 Nov 2024 11:03:05 +1300 Subject: [PATCH 1/3] Should help with #70 --- README.md | 154 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 140 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index dc0951f..12e7419 100644 --- a/README.md +++ b/README.md @@ -60,21 +60,18 @@ It's equivalent to running minimap2 -x map_ont -x short ... ### Customization [MapOpts](https://docs.rs/minimap2-sys/0.1.5/minimap2_sys/struct.mm_mapopt_t.html) and [IdxOpts](https://docs.rs/minimap2-sys/0.1.5/minimap2_sys/struct.mm_idxopt_t.html) can be customized with Rust's struct pattern, as well as applying mapping settings. Inspired by [bevy](https://bevyengine.org/). ```rust -Aligner { - mapopt: MapOpt { - seed: 42, - best_n: 1, - ..Default::default() - }, - idxopt: IdxOpt { - k: 21, - ..Default::default() - }, - ..map_ont() -} +let mut aligner: Aligner = Aligner::builder().map_ont(); +aligner.mapopt.seed = 42; +aligner.mapopt.best_n = 1; +aligner.idxopt.k = 21; +self.mapopt.flag |= MM_F_COPY_COMMENT as i64; // Setting a flag. If you do this frequently, open an [issue](https://github.com/jguhlin/minimap2-rs/issues/new) asking for an ergonomic function! +self.idxopt.flag |= MM_I_HPC as i32; ``` + +See [full list of options](#minimap2-mapping-and-indexing-options) below. + ### Working Example -There is a binary called "fakeminimap2" that I am using to test for memory leaks. You can follow the [source code](https://github.com/jguhlin/minimap2-rs/blob/main/fakeminimap2/src/main.rs) for an example. It also shows some helper functions for identifying compression types and FASTA vs FASTQ files. I used my own parsers as they are well fuzzed, but open to removing them or putting them behind a feature wall. +There is a binary called "fakeminimap2" which demonstrates basic usage and multithreading using mpsc channels. You can find it [in this repo](https://github.com/jguhlin/minimap2-rs/tree/main/fakeminimap2) for an example. Alignment functions return a [Mapping](https://docs.rs/minimap2/latest/minimap2/struct.Mapping.html) struct. The [Alignment](https://docs.rs/minimap2/latest/minimap2/struct.Alignment.html) struct is only returned when the [Aligner](https://docs.rs/minimap2/latest/minimap2/struct.Aligner.html) is created using [.with_cigar()](https://docs.rs/minimap2/latest/minimap2/struct.Aligner.html#method.with_cigar). @@ -131,7 +128,8 @@ The following crate features are available: Map-file is a *default* feature and enabled unless otherwise specified. -## Missing Features +## Missing Features +Create an [issue](https://github.com/jguhlin/minimap2-rs/issues/new) if you need any of the following: * setting mismatch penalty for base transitions [minimap 2.27 release notes](https://github.com/lh3/minimap2/releases/tag/v2.27) * Generate ds tags to indicate uncertainty in indels @@ -169,6 +167,7 @@ Minimap2 is tested on x86_64 and aarch64 (arm64). Other platforms may work, plea Please cite the appropriate minimap2 papers if you use this in your work, as well as this library. ## DOI for this library +... coming soon ... ## Minimap2 Papers @@ -180,6 +179,133 @@ and/or: > Li, H. (2021). New strategies to improve minimap2 alignment accuracy. > *Bioinformatics*, **37**:4572-4574. [doi:10.1093/bioinformatics/btab705][doi2] +# Minimap2 Mapping and Indexing Options + +## Mapping Options (`MapOpt` in rust, alias for `mm_mapopt_t`) + +| Field Name | Type | Description | +|-------------------|-------------------------|----------------------------------------------------------------------| +| `flag` | `i64` | Flags to control mapping behavior (bitwise flags). | +| `seed` | `c_int` | Random seed for mapping. | +| `sdust_thres` | `c_int` | Threshold for masking low-complexity regions using SDUST. | +| `max_qlen` | `c_int` | Maximum query length. | +| `bw` | `c_int` | Bandwidth for alignment of short reads. | +| `bw_long` | `c_int` | Bandwidth for alignment of long reads. | +| `max_gap` | `c_int` | Maximum gap allowed in mapping. | +| `max_gap_ref` | `c_int` | Maximum gap allowed on the reference. | +| `max_frag_len` | `c_int` | Maximum fragment length for paired-end reads. | +| `max_chain_skip` | `c_int` | Maximum number of seeds to skip in chaining. | +| `max_chain_iter` | `c_int` | Maximum number of chaining iterations. | +| `min_cnt` | `c_int` | Minimum number of seeds required for a chain. | +| `min_chain_score` | `c_int` | Minimum score for a chain to be considered. | +| `chain_gap_scale` | `f32` | Scaling factor for chain gap penalty. | +| `chain_skip_scale`| `f32` | Scaling factor for chain skipping. | +| `rmq_size_cap` | `c_int` | Size cap for RMQ (Range Minimum Query). | +| `rmq_inner_dist` | `c_int` | Inner distance for RMQ rescue. | +| `rmq_rescue_size` | `c_int` | Size threshold for RMQ rescue. | +| `rmq_rescue_ratio`| `f32` | Rescue ratio for RMQ. | +| `mask_level` | `f32` | Level at which to mask repetitive seeds. | +| `mask_len` | `c_int` | Length of sequences to mask. | +| `pri_ratio` | `f32` | Ratio threshold for primary alignment selection. | +| `best_n` | `c_int` | Maximum number of best alignments to retain. | +| `alt_drop` | `f32` | Score drop ratio for alternative mappings. | +| `a` | `c_int` | Match score. | +| `b` | `c_int` | Mismatch penalty. | +| `q` | `c_int` | Gap open penalty. | +| `e` | `c_int` | Gap extension penalty. | +| `q2` | `c_int` | Gap open penalty for long gaps. | +| `e2` | `c_int` | Gap extension penalty for long gaps. | +| `transition` | `c_int` | Penalty for transitions in spliced alignment. | +| `sc_ambi` | `c_int` | Score for ambiguous bases. | +| `noncan` | `c_int` | Allow non-canonical splicing (boolean flag). | +| `junc_bonus` | `c_int` | Bonus score for junctions. | +| `zdrop` | `c_int` | Z-drop score for alignment extension stopping. | +| `zdrop_inv` | `c_int` | Inverse Z-drop score. | +| `end_bonus` | `c_int` | Bonus score for aligning to the ends of sequences. | +| `min_dp_max` | `c_int` | Minimum score to consider a DP alignment valid. | +| `min_ksw_len` | `c_int` | Minimum length for performing Smith-Waterman alignment. | +| `anchor_ext_len` | `c_int` | Length for anchor extension. | +| `anchor_ext_shift`| `c_int` | Shift for anchor extension. | +| `max_clip_ratio` | `f32` | Maximum allowed clipping ratio. | +| `rank_min_len` | `c_int` | Minimum length for rank filtering. | +| `rank_frac` | `f32` | Fraction for rank filtering. | +| `pe_ori` | `c_int` | Expected orientation of paired-end reads. | +| `pe_bonus` | `c_int` | Bonus score for proper paired-end alignment. | +| `mid_occ_frac` | `f32` | Fraction for mid-occurrence filtering. | +| `q_occ_frac` | `f32` | Fraction for query occurrence filtering. | +| `min_mid_occ` | `i32` | Minimum mid-occurrence threshold. | +| `max_mid_occ` | `i32` | Maximum mid-occurrence threshold. | +| `mid_occ` | `i32` | Mid-occurrence cutoff value. | +| `max_occ` | `i32` | Maximum occurrence cutoff value. | +| `max_max_occ` | `i32` | Maximum allowed occurrence value. | +| `occ_dist` | `i32` | Distribution of occurrences for filtering. | +| `mini_batch_size` | `i64` | Size of mini-batches for processing. | +| `max_sw_mat` | `i64` | Maximum size of Smith-Waterman matrices. | +| `cap_kalloc` | `i64` | Memory allocation cap for kalloc. | +| `split_prefix` | `*const c_char` | Prefix for splitting output files. | + +## Mapping Flags (`MM_F_*`) + +| Flag Constant | Value | Description | +|-------------------------|----------------|-----------------------------------------------------------------| +| `MM_F_NO_DIAG` | `1` | Skip seed pairs on the same diagonal. | +| `MM_F_NO_DUAL` | `2` | Do not compute reverse complement of seeds. | +| `MM_F_CIGAR` | `4` | Compute CIGAR string. | +| `MM_F_OUT_SAM` | `8` | Output alignments in SAM format. | +| `MM_F_NO_QUAL` | `16` | Do not output base quality in SAM. | +| `MM_F_OUT_CG` | `32` | Output CIGAR in CG format (Compact CIGAR). | +| `MM_F_OUT_CS` | `64` | Output cs tag (difference string) in SAM/PAF. | +| `MM_F_SPLICE` | `128` | Enable spliced alignment (for RNA-seq). | +| `MM_F_SPLICE_FOR` | `256` | Only consider the forward strand for spliced alignment. | +| `MM_F_SPLICE_REV` | `512` | Only consider the reverse strand for spliced alignment. | +| `MM_F_NO_LJOIN` | `1024` | Disable long join for gapped alignment. | +| `MM_F_OUT_CS_LONG` | `2048` | Output cs tag in long format. | +| `MM_F_SR` | `4096` | Perform split read alignment (for short reads). | +| `MM_F_FRAG_MODE` | `8192` | Fragment mode for paired-end reads. | +| `MM_F_NO_PRINT_2ND` | `16384` | Do not output secondary alignments. | +| `MM_F_2_IO_THREADS` | `32768` | Use two I/O threads during mapping. | +| `MM_F_LONG_CIGAR` | `65536` | Use long CIGAR (>65535 operations). | +| `MM_F_INDEPEND_SEG` | `131072` | Map segments independently in multiple mapping. | +| `MM_F_SPLICE_FLANK` | `262144` | Add flanking bases for spliced alignment. | +| `MM_F_SOFTCLIP` | `524288` | Perform soft clipping at ends. | +| `MM_F_FOR_ONLY` | `1048576` | Only map the forward strand of the query. | +| `MM_F_REV_ONLY` | `2097152` | Only map the reverse complement of the query. | +| `MM_F_HEAP_SORT` | `4194304` | Use heap sort for mapping. | +| `MM_F_ALL_CHAINS` | `8388608` | Output all chains (may include suboptimal chains). | +| `MM_F_OUT_MD` | `16777216` | Output MD tag in SAM. | +| `MM_F_COPY_COMMENT` | `33554432` | Copy comment from FASTA/Q to SAM output. | +| `MM_F_EQX` | `67108864` | Use =/X instead of M in CIGAR. | +| `MM_F_PAF_NO_HIT` | `134217728` | Output unmapped reads in PAF format. | +| `MM_F_NO_END_FLT` | `268435456` | Disable end flanking region filtering. | +| `MM_F_HARD_MLEVEL` | `536870912` | Hard mask low-complexity regions. | +| `MM_F_SAM_HIT_ONLY` | `1073741824` | Output only alignments in SAM (no headers). | +| `MM_F_RMQ` | `2147483648` | Use RMQ for read mapping quality estimation. | +| `MM_F_QSTRAND` | `4294967296` | Consider query strand in mapping. | +| `MM_F_NO_INV` | `8589934592` | Disable inversion in alignment. | +| `MM_F_NO_HASH_NAME` | `17179869184` | Do not hash read names (for reproducibility). | +| `MM_F_SPLICE_OLD` | `34359738368` | Use old splice alignment model. | +| `MM_F_SECONDARY_SEQ` | `68719476736` | Output sequence of secondary alignments. | +| `MM_F_OUT_DS` | `137438953472` | Output detailed alignment score (ds tag). | + +## Index Options (`IdxOpt` in rust, alias for `mm_idxopt_t`) + +| Field Name | Type | Description | +|-------------------|----------------|-----------------------------------------------------------------| +| `k` | `c_short` | K-mer size (mer length). | +| `w` | `c_short` | Minimizer window size. | +| `flag` | `c_short` | Flags to control indexing behavior (bitwise flags). | +| `bucket_bits` | `c_short` | Number of bits for the size of hash table buckets. | +| `mini_batch_size` | `i64` | Size of mini-batches for indexing (number of bases). | +| `batch_size` | `u64` | Total batch size for indexing (number of bases). | + +## Indexing Flags (`MM_I_*`) + +| Flag Constant | Value | Description | +|-------------------|--------|----------------------------------------------------------| +| `MM_I_HPC` | `1` | Use homopolymer-compressed k-mers for indexing. | +| `MM_I_NO_SEQ` | `2` | Do not store sequences in the index. | +| `MM_I_NO_NAME` | `4` | Do not store sequence names in the index. | + # Changelog ### 0.1.21 minimap2 2.28 Contributors to this release: @mbhall88 @rob-p @Sam-Sims @charlesgregory @PB-DB From eea4c385d63d1cafc7432a1217df4e02e5ad20a0 Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 27 Nov 2024 11:05:58 +1300 Subject: [PATCH 2/3] Minor note --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 12e7419..e00bcb0 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,8 @@ and/or: # Minimap2 Mapping and Indexing Options +See [customization](#customization) for how to use these. + ## Mapping Options (`MapOpt` in rust, alias for `mm_mapopt_t`) | Field Name | Type | Description | From 13b01ba78cf8f95c4a74f34388efe04a40fb9bba Mon Sep 17 00:00:00 2001 From: Joseph Guhlin Date: Wed, 27 Nov 2024 11:19:59 +1300 Subject: [PATCH 3/3] Provide gap open functions Closes #70 --- src/lib.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 7a3b917..71e198d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -605,6 +605,23 @@ where self } + + /// Sets the gap open penalty for minimap2. + /// + /// minimap2 -O 4 sets both the short and long gap open penalty to 4. + /// [minimap2 code](https://github.com/lh3/minimap2/blob/618d33515e5853c4576d5a3d126fdcda28f0e8a4/main.c#L315) + /// + /// To set the long gap open penalty, simply provide a value for `penalty_long`. + pub fn with_gap_open_penalty(mut self, penalty: i32, penalty_long: Option) -> Self { + self.mapopt.q = penalty; + if let Some(penalty_long) = penalty_long { + self.mapopt.q2 = penalty_long; + } else { + self.mapopt.q2 = penalty; + } + self + } + /// Sets the number of threads minimap2 will use for building the index /// ``` /// # use minimap2::*;