From a19202b53c4364ef4d8d901aa24541a47becea2b Mon Sep 17 00:00:00 2001 From: rzlim08 <37033997+rzlim08@users.noreply.github.com> Date: Fri, 24 May 2024 09:50:38 -0700 Subject: [PATCH] Sort bowtie2 outputs for reproducibility (#365) * sort bowtie2 outputs for reproducibility * linting * add multithreading to sort --- lib/idseq-dag/idseq_dag/steps/run_assembly.py | 17 +++++++++++++++++ lib/idseq-dag/idseq_dag/util/m8.py | 2 +- workflows/short-read-mngs/host_filter.wdl | 3 +++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/idseq-dag/idseq_dag/steps/run_assembly.py b/lib/idseq-dag/idseq_dag/steps/run_assembly.py index 13b908836..4135ca110 100644 --- a/lib/idseq-dag/idseq_dag/steps/run_assembly.py +++ b/lib/idseq-dag/idseq_dag/steps/run_assembly.py @@ -221,6 +221,23 @@ def generate_read_to_contig_mapping(assembled_contig, } ) ) + + # sort bowtie2 output file + # samtools sort -n -O sam -o bowtie2.sam bowtie2.sam + samtools_sort_params = [ + "sort", + "-n", + "-O", "sam", + "-o", output_bowtie_sam, + output_bowtie_sam + ] + command.execute( + command_patterns.SingleCommand( + cmd="samtools", + args=samtools_sort_params + ) + ) + contig_stats, _ = generate_info_from_sam(output_bowtie_sam, read2contig, duplicate_cluster_sizes_path=duplicate_cluster_sizes_path) with open(output_contig_stats, 'w') as ocf: json.dump(contig_stats, ocf) diff --git a/lib/idseq-dag/idseq_dag/util/m8.py b/lib/idseq-dag/idseq_dag/util/m8.py index 7b9ac04fd..02fc14444 100644 --- a/lib/idseq-dag/idseq_dag/util/m8.py +++ b/lib/idseq-dag/idseq_dag/util/m8.py @@ -431,7 +431,7 @@ def generate_taxon_count_json_from_m8( agg_bucket["base_count"], } if agg_bucket.get('source_count_type'): - taxon_counts_row['source_count_type'] = list(agg_bucket['source_count_type']) + taxon_counts_row['source_count_type'] = sorted(list(agg_bucket['source_count_type'])) taxon_counts_attributes.append(taxon_counts_row) output_dict = { diff --git a/workflows/short-read-mngs/host_filter.wdl b/workflows/short-read-mngs/host_filter.wdl index 9aefcebb9..38c30dada 100644 --- a/workflows/short-read-mngs/host_filter.wdl +++ b/workflows/short-read-mngs/host_filter.wdl @@ -276,6 +276,9 @@ task ercc_bowtie2_filter { ~{bowtie2_invocation} + # sort bowtie2 outputs for reproducibility + samtools sort -n -O sam -@ 8 -o /tmp/bowtie2_ercc.sam /tmp/bowtie2_ercc.sam + # Extract reads [pairs] that did NOT map to the index if [[ '~{paired}' == 'true' ]]; then # 1 (read paired)