From 24c9eab2aaa8339e6dfa26d33bf065aa6da79c86 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 24 Oct 2023 16:01:42 +1100 Subject: [PATCH 1/7] misc --- flexiplex.c++ | 51 ++++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index d0e8d62..74ad4ca 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -145,8 +145,8 @@ unsigned int edit_distance(const std::string& s1, const std::string& s2, unsigne //a targeted search in the region for barcode //Seaquence seearch is performed using edlib -Barcode get_barcode(string & seq, - unordered_set *known_barcodes, +Barcode get_barcode(const string & seq, + const unordered_set &known_barcodes, int flank_max_editd, int barcode_max_editd, const std::vector> &search_pattern) { @@ -182,7 +182,7 @@ Barcode get_barcode(string & seq, //search for the concatenated pattern EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf); - if(result.status != EDLIB_STATUS_OK | result.numLocations==0 ){ + if(result.status != EDLIB_STATUS_OK || result.numLocations==0 ){ edlibFreeAlignResult(result); return(barcode); // no match found - return } //fill in info about found primer and polyT location @@ -263,7 +263,7 @@ std::vector subpattern_ends; std::string exact_bc = seq.substr( read_to_subpatterns[bc_index], search_pattern[bc_index].second.length()); -if(known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){ +if(known_barcodes.size()==0 || (known_barcodes.find(exact_bc) != known_barcodes.end())){ barcode.barcode=exact_bc; barcode.editd=0; barcode.unambiguous=true; @@ -283,17 +283,15 @@ return(barcode); search_pattern[bc_index].second.length() + 2 * OFFSET); //iterate over all the known barcodes, checking each sequentially - unordered_set::iterator known_barcodes_itr=known_barcodes->begin(); unsigned int editDistance; unsigned int endDistance; - for(; known_barcodes_itr!=known_barcodes->end(); known_barcodes_itr++){ - search_string=(*known_barcodes_itr); //known barcode to check again - editDistance = edit_distance(barcode_seq,search_string,endDistance,barcode_max_editd); + for (const auto &known_bc : known_barcodes) { + editDistance = edit_distance(barcode_seq,known_bc,endDistance,barcode_max_editd); if (editDistance == barcode.editd) { barcode.unambiguous=false; } else if (editDistance < barcode.editd && editDistance <= barcode_max_editd) { //if best so far, update barcode.unambiguous=true; barcode.editd=editDistance; - barcode.barcode=*known_barcodes_itr; + barcode.barcode=known_bc; if (umi_index == -1) { barcode.umi = ""; } else if (umi_index == bc_index + 1) { @@ -331,16 +329,16 @@ vector big_barcode_search(string & sequence, unordered_set & kn vector return_vec; //vector of all the barcodes found //search for barcode - Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); + Barcode result=get_barcode(sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); if(result.editd<=max_editd && result.unambiguous) //add to return vector if edit distance small enough - return_vec.push_back(result); + return_vec.emplace_back(result); //if a result was found, mask out the flanking sequence and search again in case there are more. - if(return_vec.size()>0){ + if (!return_vec.empty()) { string masked_sequence = sequence; - for(int i=0; i masked_res; masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); @@ -375,21 +373,16 @@ void print_stats(string read_id, vector & vec_bc, ostream & out_stream) } } -void print_line(string id, string read, string quals, ostream & out_stream){ - - //flag for read format - bool is_fastq=!(quals==""); //no quality scores passed = fasta - +void print_line(const string& id, const string& read, const string& quals, ostream& out_stream) { + const char delimiter = quals.empty() ? '>' : '@'; + //output to the new read file - if(is_fastq) - out_stream << "@" << id << "\n"; - else - out_stream << ">" << id << "\n"; - out_stream << read << "\n"; - if(is_fastq){ - out_stream << "+" << id << "\n"; - out_stream << quals << "\n"; - } + out_stream << delimiter << id << '\n' + << read << '\n'; + + if (!quals.empty()) + out_stream << '+' << id << '\n' + << quals << '\n'; } //print fastq or fasta lines.. From d9a1897e188007d12e7f7102743798006b2f32a2 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 24 Oct 2023 16:04:56 +1100 Subject: [PATCH 2/7] wip: trim polyT --- flexiplex.c++ | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 74ad4ca..4f76d7c 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -188,7 +188,7 @@ Barcode get_barcode(const string & seq, } //fill in info about found primer and polyT location barcode.flank_editd=result.editDistance; barcode.flank_start=result.startLocations[0]; - barcode.flank_end=result.endLocations[0]; + barcode.flank_end=seq.find_first_not_of('T', result.endLocations[0]); // Extract sub-patterns from aligment directly std::vector subpattern_lengths; @@ -337,7 +337,12 @@ vector big_barcode_search(string & sequence, unordered_set & kn if (!return_vec.empty()) { string masked_sequence = sequence; for(const auto &barcode : return_vec){ - int flank_length = barcode.flank_end - barcode.flank_start; + int flank_length; + if (barcode.flank_end == std::string::npos) { + flank_length = masked_sequence.length() - 1 - barcode.flank_start; + } else { + flank_length = barcode.flank_end - barcode.flank_start; + } masked_sequence.replace(barcode.flank_start, flank_length,string(flank_length,'X')); } //recursively call this function until no more barcodes are found vector masked_res; @@ -400,6 +405,9 @@ void print_read(string read_id,string read, string qual, string new_read_id=barcode+"_"+vec_bc.at(b).umi+"#"+read_id+ss.str(); // work out the start and end base in case multiple barcodes + if (vec_bc.at(b).flank_end == std::string::npos) { + continue; + } int read_start=vec_bc.at(b).flank_end; int read_length=read.length()-read_start; for(int f=0; f Date: Tue, 24 Oct 2023 16:18:46 +1100 Subject: [PATCH 3/7] add conda test for flexiplex output --- conda-recipe/run_test.sh | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 conda-recipe/run_test.sh diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh new file mode 100644 index 0000000..054c892 --- /dev/null +++ b/conda-recipe/run_test.sh @@ -0,0 +1,36 @@ +set -eu +msg() { + echo "=> $@" +} + +shell() { + msg Testing bash curl etc + ( + diff <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) \ + <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) > /dev/null +) +} + +testflexiplex() { + msg Testing flexiplex demultiplex output + diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 8 -k <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat)) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) > /dev/null + msg done +} + +testflexiplexfail() { + msg Testing fail + diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 1 -k <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat) 2>/dev/null) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) > /dev/null + msg unexpected line reached +} + + +msg Running in $PWD +if [[ -n "${1:-}" ]]; then + # Run single specified code-quality tool. + $1 +else + # Run all code-quality tools. + shell + testflexiplex + testflexiplexfail +fi From 3691dc1f0f7de20938056905cb65b51e14ed9a19 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 24 Oct 2023 16:32:17 +1100 Subject: [PATCH 4/7] update conda test --- conda-recipe/run_test.sh | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 054c892..5adec65 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -9,21 +9,15 @@ shell() { diff <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) \ <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) > /dev/null ) + msg shell env is fine } testflexiplex() { msg Testing flexiplex demultiplex output diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 8 -k <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat)) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) > /dev/null - msg done + msg flexiplex demultiplex output passed } -testflexiplexfail() { - msg Testing fail - diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 1 -k <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat) 2>/dev/null) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) > /dev/null - msg unexpected line reached -} - - msg Running in $PWD if [[ -n "${1:-}" ]]; then # Run single specified code-quality tool. @@ -32,5 +26,6 @@ else # Run all code-quality tools. shell testflexiplex - testflexiplexfail fi +echo "---" +echo All tests passed! From 703050c1ab32b7ef544d7eefa2a770bfb0dbd648 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 24 Oct 2023 17:11:06 +1100 Subject: [PATCH 5/7] fix conda test for mac --- conda-recipe/run_test.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 5adec65..2eba2c2 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -14,7 +14,10 @@ shell() { testflexiplex() { msg Testing flexiplex demultiplex output - diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 8 -k <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat)) <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) > /dev/null + curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/bc_allow.tsv.gz | zcat > bc_allow.tsv + curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/inst/extdata/fastq/musc_rps24.fastq.gz | zcat > musc_rps24.fastq + diff <(flexiplex -x CTACACGACGCTCTTCCGATCT -b '????????????????' -u '????????????' -x TTTTTTTTT -e 2 -f 8 -k bc_allow.tsv musc_rps24.fastq) \ + <(curl -sL https://raw.githubusercontent.com/mritchielab/FLAMES/devel/tests/testthat/demultiplexed.fq) msg flexiplex demultiplex output passed } From 5fc4d2c68322f46b8c555f459bdfc3f35e0a1d39 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 19 Mar 2024 10:41:09 +1100 Subject: [PATCH 6/7] check if last pattern is polyT/A --- flexiplex.c++ | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 8c35118..1b267f8 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -239,7 +239,15 @@ Barcode get_barcode(const string & seq, } //fill in info about found primer and polyT location barcode.flank_editd=result.editDistance; barcode.flank_start=result.startLocations[0]; - barcode.flank_end=seq.find_first_not_of('T', result.endLocations[0]); + + // check if last element of saerch_pattern is polyT + if (std::all_of(search_pattern.back().second.begin(), search_pattern.back().second.end(), [](char c) { return c == 'T'; })) { + barcode.flank_end = seq.find_first_not_of('T', result.endLocations[0]); + } else if (std::all_of(search_pattern.back().second.begin(), search_pattern.back().second.end(), [](char c) { return c == 'A'; })) { + barcode.flank_end = seq.find_first_not_of('A', result.endLocations[0]); + } else { + barcode.flank_end=result.endLocations[0]; + } // Extract sub-patterns from aligment directly std::vector subpattern_lengths; From 31ba380a128f350afac535d9db9318f7138b8b33 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Tue, 19 Mar 2024 10:48:00 +1100 Subject: [PATCH 7/7] update help docs --- flexiplex.c++ | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 1b267f8..00b8c7b 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -72,8 +72,10 @@ void print_usage(){ cerr << " -b sequence Append the barcode pattern to search for\n"; cerr << " -u sequence Append the UMI pattern to search for\n"; cerr << " Notes:\n"; - cerr << " The order of these options matters\n"; - cerr << " ? - can be used as a wildcard\n"; + cerr << " The order of these options matters.\n"; + cerr << " ? - can be used as a wildcard.\n"; + cerr << " If the last pattern consists of only T or A, the polyT/A will be trimmed\n"; + cerr << " until the first non-T(or A for polyA tails) character.\n"; cerr << " When no search pattern x,b,u option is provided, the following default pattern is used: \n"; cerr << " primer: CTACACGACGCTCTTCCGATCT\n"; cerr << " barcode: ????????????????\n";