From 6835aecf51a1ecd29b631219dd0f3380213b5921 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 11 Dec 2024 12:08:55 -0700 Subject: [PATCH 1/2] feat: support protein sequences --- src/lib/matcher.rs | 39 ++++++++++++++------ src/lib/mod.rs | 1 + src/main.rs | 92 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 116 insertions(+), 16 deletions(-) diff --git a/src/lib/matcher.rs b/src/lib/matcher.rs index d27dbd0..6bcabee 100644 --- a/src/lib/matcher.rs +++ b/src/lib/matcher.rs @@ -5,6 +5,7 @@ use crate::color::{color_background, color_head}; use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS}; use crate::reverse_complement; use crate::DNA_BASES; +use crate::AMINO_ACIDS; use anyhow::{bail, Context, Result}; use bstr::ByteSlice; use regex::bytes::{Regex, RegexBuilder, RegexSet, RegexSetBuilder}; @@ -120,9 +121,18 @@ fn bases_colored( } /// Validates that a given FIXED pattern contains only valid DNA bases (ACGTN) -pub fn validate_fixed_pattern(pattern: &str) -> Result<()> { +pub fn validate_fixed_pattern(pattern: &str, protein: bool) -> Result<()> { for (index, base) in pattern.chars().enumerate() { - if !DNA_BASES.contains(&(base as u8)) { + if protein { + if !AMINO_ACIDS.contains(&(base as u8)) { + bail!( + "Fixed pattern must contain only amino acids: {} .. [{}] .. {}", + &pattern[0..index], + &pattern[index..=index], + &pattern[index + 1..], + ) + } + } else if !DNA_BASES.contains(&(base as u8)) { bail!( "Fixed pattern must contain only DNA bases: {} .. [{}] .. {}", &pattern[0..index], @@ -609,17 +619,24 @@ pub mod tests { // Tests validate_fixed_pattern() // ############################################################################################ - #[test] - fn test_validate_fixed_pattern_is_ok() { - let pattern = "AGTGTGATG"; - let result = validate_fixed_pattern(&pattern); + #[rstest] + #[case("AGTGTGATG", false)] + #[case("QRNQRNQRN", true)] + fn test_validate_fixed_pattern_is_ok( + #[case] pattern: &str, + #[case] protein: bool) { + let result = validate_fixed_pattern(&pattern, protein); assert!(result.is_ok()) } - #[test] - fn test_validate_fixed_pattern_error() { - let pattern = "AXGTGTGATG"; - let msg = String::from("Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG"); - let result = validate_fixed_pattern(&pattern); + + #[rstest] + #[case("AXGTGTGATG", false, "Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG")] + #[case("QRNQRNZQRN", true, "Fixed pattern must contain only amino acids: QRNQRN .. [Z] .. QRN")] + fn test_validate_fixed_pattern_error( + #[case] pattern: &str, + #[case] protein: bool, + #[case] msg: &str) { + let result = validate_fixed_pattern(&pattern, protein); let inner = result.unwrap_err().to_string(); assert_eq!(inner, msg); } diff --git a/src/lib/mod.rs b/src/lib/mod.rs index aaaccfd..e86609d 100644 --- a/src/lib/mod.rs +++ b/src/lib/mod.rs @@ -11,6 +11,7 @@ use lazy_static::lazy_static; use std::{borrow::Borrow, path::Path}; pub const DNA_BASES: [u8; 5] = *b"ACGTN"; +pub const AMINO_ACIDS: [u8; 20] = *b"ARNDCEQGHILKMFPSTWYV"; pub const IUPAC_BASES: [u8; 15] = *b"AGCTYRWSKMDVHBN"; pub const IUPAC_BASES_COMPLEMENT: [u8; 15] = *b"TCGARYWSMKHBDVN"; diff --git a/src/main.rs b/src/main.rs index 2e9f833..f662cea 100644 --- a/src/main.rs +++ b/src/main.rs @@ -258,6 +258,10 @@ struct Opts { #[structopt(long)] progress: bool, + /// The input files contain protein sequence (amino acids), not DNA sequence. + #[structopt(long)] + protein: bool, + /// The first argument is the pattern to match, with the remaining arguments containing the /// files to match. If `-e` is given, then all the arguments are files to match. /// Use standard input if either no files are given or `-` is given. @@ -366,13 +370,17 @@ fn fqgrep_from_opts(opts: &Opts) -> Result { ); } + if opts.protein && opts.reverse_complement { + bail!("--reverse-complement many not be used with --protein") + } + // Validate the fixed string pattern, if fixed-strings are specified if opts.fixed_strings { if let Some(pattern) = &pattern { - validate_fixed_pattern(pattern)?; + validate_fixed_pattern(pattern, opts.protein)?; } else if !opts.regexp.is_empty() { for pattern in &opts.regexp { - validate_fixed_pattern(pattern)?; + validate_fixed_pattern(pattern, opts.protein)?; } } else { bail!("A pattern must be given as a positional argument or with -e/--regexp") @@ -672,6 +680,7 @@ pub mod tests { paired: false, reverse_complement: false, progress: true, + protein: false, args: fq_path.to_vec(), output: output, }; @@ -796,6 +805,46 @@ pub mod tests { let sequence_match = expected_seq == return_sequences; assert_eq!(sequence_match, expected_bool); } + + // ############################################################################################ + //Tests match with protein (not DNA!) + // ############################################################################################ + #[rstest] + // fixed strings + #[case(vec!["A"], vec!["AAAA", "ATAT", "TATA", "AAAT", "TTTA"])] // unpaired: fixed string with multiple matches + #[case(vec!["Q"], vec!["QFPQFP"])] // unpaired: fixed string with one match + #[case(vec!["M"], vec![])] // unpaired: fixed string with no matches + // regex + #[case(vec!["^Q"], vec!["QFPQFP"])] // unpaired: regex with one match + #[case(vec!["^Q", "^F"], vec!["QFPQFP"])] // unpaired: regex set with two matches + #[case(vec!["^Q", "^A"], vec!["AAAA", "ATAT", "AAAT", "QFPQFP"])] // unpaired: regex set with two matches + #[case(vec!["^M", "^K"], vec![])] // unpaired: regex set with no matches + fn test_protein_ok( + #[case] pattern: Vec<&str>, + #[case] expected_seq: Vec<&str>, + ) { + let dir = TempDir::new().unwrap(); + let seqs = vec![ + vec!["AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA", "QFPQFP"], + ]; + let out_path = dir.path().join(String::from("output.fq")); + let result_path = &out_path.clone(); + let pattern = pattern.iter().map(|&s| s.to_owned()).collect::>(); + let mut opts = build_opts( + &dir, + &seqs, + &pattern, + true, + Some(out_path), + String::from(".fq"), + ); + + opts.protein = true; + let _result = fqgrep_from_opts(&opts); + let return_sequences = slurp_output(result_path.to_path_buf()); + + assert_eq!(return_sequences, expected_seq); + } // ############################################################################################ // Tests two fastqs for 'TGGATTCAGACTT' which is only found once in the reverse complement @@ -831,21 +880,52 @@ pub mod tests { let result = fqgrep_from_opts(&opts); assert_eq!(result.unwrap(), expected); } + + // + // ############################################################################################ + // Tests that an error is returned when protein and reverse_complement are both present + // ############################################################################################ + #[rstest] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: --reverse-complement many not be used with --protein" + )] + #[case()] + fn text_fails_with_protein_and_reverse_complement() { + let dir = TempDir::new().unwrap(); + let seqs = vec![vec!["GGGG", "TTTT"], vec!["AAAA", "CCCC"]]; + let pattern = vec![String::from("^G")]; + + let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq")); + + opts.protein = true; + opts.reverse_complement = true; + let _result = fqgrep_from_opts(&opts); + _result.unwrap(); + } // ############################################################################################ - // Tests that an error is returned when fixed_strings is true and regex is present + // Tests that an error is returned when fixed_strings is true for DNA and regex is present // ############################################################################################ #[rstest] #[should_panic( expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. G" )] - #[case(true, vec![String::from("^G")])] // panic with single regex + #[case(true, false, vec![String::from("^G")])] // panic with single regex #[should_panic( expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. A" )] - #[case(true, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string + #[case(true, false, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q" + )] + #[case(true, true, vec![String::from("^Q")])] // panic with single regex + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q" + )] + #[case(true, true, vec![String::from("^Q"),String::from("QQ")])] // panic with combination of regex and fixed string fn test_regexp_from_fixed_string_fails_with_regex( #[case] fixed_strings: bool, + #[case] protein: bool, #[case] pattern: Vec, ) { let dir = TempDir::new().unwrap(); @@ -854,9 +934,11 @@ pub mod tests { let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq")); opts.fixed_strings = fixed_strings; + opts.protein = protein; let _result = fqgrep_from_opts(&opts); _result.unwrap(); } + // ############################################################################################ // Tests error is returned from main when three records are defined as paired // ############################################################################################ From 9612b6414f8d27528c3a6d4cd8d99262f160d6fd Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Wed, 11 Dec 2024 14:17:37 -0700 Subject: [PATCH 2/2] lint --- src/lib/matcher.rs | 21 ++++++++++++++------- src/main.rs | 15 ++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/lib/matcher.rs b/src/lib/matcher.rs index 6bcabee..23ef5ae 100644 --- a/src/lib/matcher.rs +++ b/src/lib/matcher.rs @@ -4,8 +4,8 @@ use std::ops::Range; use crate::color::{color_background, color_head}; use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS}; use crate::reverse_complement; -use crate::DNA_BASES; use crate::AMINO_ACIDS; +use crate::DNA_BASES; use anyhow::{bail, Context, Result}; use bstr::ByteSlice; use regex::bytes::{Regex, RegexBuilder, RegexSet, RegexSetBuilder}; @@ -622,20 +622,27 @@ pub mod tests { #[rstest] #[case("AGTGTGATG", false)] #[case("QRNQRNQRN", true)] - fn test_validate_fixed_pattern_is_ok( - #[case] pattern: &str, - #[case] protein: bool) { + fn test_validate_fixed_pattern_is_ok(#[case] pattern: &str, #[case] protein: bool) { let result = validate_fixed_pattern(&pattern, protein); assert!(result.is_ok()) } #[rstest] - #[case("AXGTGTGATG", false, "Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG")] - #[case("QRNQRNZQRN", true, "Fixed pattern must contain only amino acids: QRNQRN .. [Z] .. QRN")] + #[case( + "AXGTGTGATG", + false, + "Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG" + )] + #[case( + "QRNQRNZQRN", + true, + "Fixed pattern must contain only amino acids: QRNQRN .. [Z] .. QRN" + )] fn test_validate_fixed_pattern_error( #[case] pattern: &str, #[case] protein: bool, - #[case] msg: &str) { + #[case] msg: &str, + ) { let result = validate_fixed_pattern(&pattern, protein); let inner = result.unwrap_err().to_string(); assert_eq!(inner, msg); diff --git a/src/main.rs b/src/main.rs index f662cea..81be97c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -805,7 +805,7 @@ pub mod tests { let sequence_match = expected_seq == return_sequences; assert_eq!(sequence_match, expected_bool); } - + // ############################################################################################ //Tests match with protein (not DNA!) // ############################################################################################ @@ -819,14 +819,11 @@ pub mod tests { #[case(vec!["^Q", "^F"], vec!["QFPQFP"])] // unpaired: regex set with two matches #[case(vec!["^Q", "^A"], vec!["AAAA", "ATAT", "AAAT", "QFPQFP"])] // unpaired: regex set with two matches #[case(vec!["^M", "^K"], vec![])] // unpaired: regex set with no matches - fn test_protein_ok( - #[case] pattern: Vec<&str>, - #[case] expected_seq: Vec<&str>, - ) { + fn test_protein_ok(#[case] pattern: Vec<&str>, #[case] expected_seq: Vec<&str>) { let dir = TempDir::new().unwrap(); - let seqs = vec![ - vec!["AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA", "QFPQFP"], - ]; + let seqs = vec![vec![ + "AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA", "QFPQFP", + ]]; let out_path = dir.path().join(String::from("output.fq")); let result_path = &out_path.clone(); let pattern = pattern.iter().map(|&s| s.to_owned()).collect::>(); @@ -880,7 +877,7 @@ pub mod tests { let result = fqgrep_from_opts(&opts); assert_eq!(result.unwrap(), expected); } - + // // ############################################################################################ // Tests that an error is returned when protein and reverse_complement are both present