From 8b7f5543c11a95cdf8fc5a753b5db6ea9c494f93 Mon Sep 17 00:00:00 2001 From: Miguel Machado Date: Wed, 16 Jan 2019 16:05:44 +0000 Subject: [PATCH] v4.0.1 Allow to change Bowtie2 alignment mode --- README.md | 21 +++++++++++-- ReMatCh/__init__.py | 2 +- ReMatCh/modules/rematch_module.py | 51 ++++++++++++++++++------------- ReMatCh/rematch.py | 37 ++++++++++++++-------- 4 files changed, 73 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 9cc92ae..28de628 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,9 @@ The sample files are required to be in "fq.gz" (or "fastq.gz") format. [--minFrequencyDominantAllele 0.6] [--minGeneCoverage N] [--minGeneIdentity N] [--doubleRun] [--reportSequenceCoverage] [--notWriteConsensus] - [--bowtieOPT] [--debug] + [--bowtieAlgo="--very-sensitive-local"] + [--bowtieOPT="--no-mixed"] + [--debug] [--mlstSchemaNumber N] [--mlstConsensus noMatter] [--mlstRun first] [-a /path/to/asperaweb_id_dsa.openssh] [-k] @@ -212,8 +214,21 @@ The sample files are required to be in "fq.gz" (or "fastq.gz") format. present in at least one sample (usefull when using a large number of reference sequences, and only for first run) (default: False) - --bowtieOPT "--no-mixed" - Extra Bowtie2 options (default: None) + --bowtieAlgo="--very-sensitive-local" + Bowtie2 alignment mode. It can be an end-to-end alignment + (unclipped alignment) or local alignment (soft clipped + alignment). Also, can choose between fast or sensitive + alignments. Please check Bowtie2 manual for extra information: + http://bowtie-bio.sourceforge.net/bowtie2/index.shtml . + This option should be provided between quotes and starting + with an empty space (like --bowtieAlgo " --very-fast") or + using equal sign (like --bowtieAlgo="--very-fast") + (default: "--very-sensitive-local") + --bowtieOPT="--no-mixed" + Extra Bowtie2 options. This option should be provided between + quotes and starting with an empty space + (like --bowtieOPT " --no-mixed") or using equal sign + (like --bowtieOPT="--no-mixed") (default: None) --debug DeBug Mode: do not remove temporary files (default: False) --mlstReference If the curated scheme for MLST alleles is available, tells ReMatCh to use these as reference (force Bowtie2 to run diff --git a/ReMatCh/__init__.py b/ReMatCh/__init__.py index a93f051..1a3bef5 100644 --- a/ReMatCh/__init__.py +++ b/ReMatCh/__init__.py @@ -1 +1 @@ -__version__ = '4.0' +__version__ = '4.0.1' diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index 4586439..c2d3f40 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -33,15 +33,16 @@ def index_sequence_bowtie2(reference_file, threads): # Mapping with Bowtie2 -def mapping_bowtie2(fastq_files, reference_file, threads, outdir, conserved_true, num_map_loc, bowtie_opt): +def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc, + bowtie_algorithm='--very-sensitive-local', bowtie_opt=None): sam_file = os.path.join(outdir, str('alignment.sam')) # Index reference file run_successfully = index_sequence_bowtie2(reference_file, threads) if run_successfully: - command = ['bowtie2', '-k', str(num_map_loc), '-q', '', '--threads', str(threads), '-x', reference_file, '', - '--no-unal', '', '-S', sam_file] + command = ['bowtie2', '-k', str(num_map_loc), '-q', bowtie_algorithm, '--threads', str(threads), '-x', + reference_file, '', '--no-unal', '', '-S', sam_file] if len(fastq_files) == 1: command[9] = '-U ' + fastq_files[0] @@ -50,11 +51,6 @@ def mapping_bowtie2(fastq_files, reference_file, threads, outdir, conserved_true else: return False, None - if conserved_true: - command[4] = '--sensitive' - else: - command[4] = '--very-sensitive-local' - if bowtie_opt is not None: command[11] = bowtie_opt @@ -266,17 +262,22 @@ def index_alignment(alignment_file): return run_successfully -def mapping_reads(fastq_files, reference_file, threads, outdir, conserved_true, num_map_loc, rematch_run, +def mapping_reads(fastq_files, reference_file, threads, outdir, num_map_loc, rematch_run, soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode, - bowtie_opt): + bowtie_algorithm, bowtie_opt, clean_run=True): # Create a symbolic link to the reference_file - reference_link = os.path.join(outdir, os.path.basename(reference_file)) - os.symlink(reference_file, reference_link) + if clean_run: + reference_link = os.path.join(outdir, os.path.basename(reference_file)) + if os.path.islink(reference_link): + os.unlink(reference_link) + os.symlink(reference_file, reference_link) + reference_file = reference_link bam_file = None # Mapping reads using Bowtie2 - run_successfully, sam_file = mapping_bowtie2(fastq_files, reference_link, threads, outdir, conserved_true, - num_map_loc, bowtie_opt) + run_successfully, sam_file = mapping_bowtie2(fastq_files=fastq_files, reference_file=reference_file, + threads=threads, outdir=outdir, num_map_loc=num_map_loc, + bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt) if run_successfully: # Remove soft clipping @@ -294,7 +295,7 @@ def mapping_reads(fastq_files, reference_file, threads, outdir, conserved_true, # Index bam run_successfully = index_alignment(bam_file) - return run_successfully, bam_file, reference_link + return run_successfully, bam_file, reference_file def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): @@ -913,6 +914,7 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer mean_coverage = None number_diferences = 0 number_multi_alleles = 0 + consensus_sequence = {'correct': {}, 'noMatter': {}, 'alignment': {}} # Create vcf file (for multiple alleles check) run_successfully, gene_vcf = create_vcf(bam_file, sequence_information['header'], outdir, counter, reference_file) @@ -1121,19 +1123,24 @@ def gather_data_together(sample, data_directory, sequences_information, outdir, @rematch_timer def run_rematch_module(sample, fastq_files, reference_file, threads, outdir, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, - conserved_true, debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run, + debug_mode_true, num_map_loc, minimum_gene_identity, rematch_run, soft_clip_base_quality, soft_clip_recode_run, reference_dict, soft_clip_cigar_flag_recode, - bowtie_opt, gene_list_reference, not_write_consensus): + bowtie_algorithm, bowtie_opt, gene_list_reference, not_write_consensus, clean_run=True): rematch_folder = os.path.join(outdir, 'rematch_module', '') + utils.remove_directory(rematch_folder) os.mkdir(rematch_folder) # Map reads - run_successfully, bam_file, reference_file = mapping_reads(fastq_files, reference_file, threads, rematch_folder, - conserved_true, num_map_loc, rematch_run, - soft_clip_base_quality, soft_clip_recode_run, - reference_dict, soft_clip_cigar_flag_recode, bowtie_opt) - + run_successfully, bam_file, reference_file = mapping_reads(fastq_files=fastq_files, reference_file=reference_file, + threads=threads, outdir=rematch_folder, + num_map_loc=num_map_loc, rematch_run=rematch_run, + soft_clip_base_quality=soft_clip_base_quality, + soft_clip_recode_run=soft_clip_recode_run, + reference_dict=reference_dict, + soft_clip_cigar_flag_recode=soft_clip_cigar_flag_recode, + bowtie_algorithm=bowtie_algorithm, bowtie_opt=bowtie_opt, + clean_run=clean_run) if run_successfully: # Index reference file run_successfully, stdout = index_fasta_samtools(reference_file, None, None, True) diff --git a/ReMatCh/rematch.py b/ReMatCh/rematch.py index 3a29a40..e15d23a 100755 --- a/ReMatCh/rematch.py +++ b/ReMatCh/rematch.py @@ -7,9 +7,9 @@ and consensus sequences production -Copyright (C) 2018 Miguel Machado +Copyright (C) 2019 Miguel Machado -Last modified: October 15, 2018 +Last modified: January 02, 2019 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -356,7 +356,6 @@ def run_rematch(args): args.mlstSchemaNumber, workdir) args.softClip_recodeRun = 'first' - args.conservedSeq = False if args.reference is None: reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) @@ -437,10 +436,11 @@ def run_rematch(args): rematch_module.run_rematch_module(sample, fastq_files, reference_file, args.threads, sample_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, - args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, + args.debug, args.numMapLoc, args.minGeneIdentity, 'first', args.softClip_baseQuality, args.softClip_recodeRun, - reference_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, - gene_list_reference, args.notWriteConsensus) + reference_dict, args.softClip_cigarFlagRecode, + args.bowtieAlgo, args.bowtieOPT, + gene_list_reference, args.notWriteConsensus, clean_run=True) if run_successfully_rematch_first: if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'): run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str) @@ -471,12 +471,14 @@ def run_rematch(args): args.threads, rematch_second_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, - args.conservedSeq, args.debug, args.numMapLoc, + args.debug, args.numMapLoc, args.minGeneIdentity, 'second', args.softClip_baseQuality, args.softClip_recodeRun, consensus_concatenated_dict, - args.softClip_cigarFlagRecode, args.bowtieOPT, - gene_list_reference, args.notWriteConsensus) + args.softClip_cigarFlagRecode, + args.bowtieAlgo, args.bowtieOPT, + gene_list_reference, args.notWriteConsensus, + clean_run=True) if not args.debug: os.remove(consensus_concatenated_fasta) if run_successfully_rematch_second: @@ -578,8 +580,6 @@ def main(): required=False) parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options') - parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help=argparse.SUPPRESS) - # parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help='This option can be used with conserved sequences like MLST genes to speedup the analysis by alignning reads using Bowtie2 sensitive algorithm') parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N', help='Sequence length added to both ends of target sequences (usefull to' ' improve reads mapping to the target one) that will be trimmed in' @@ -621,7 +621,20 @@ def main(): ' sequences, and only for first run)') parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true', help='Do not write consensus sequences') - parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"', help='Extra Bowtie2 options', + parser_optional_rematch.add_argument('--bowtieAlgo', type=str, metavar='"--very-sensitive-local"', + help='Bowtie2 alignment mode. It can be an end-to-end alignment (unclipped' + ' alignment) or local alignment (soft clipped alignment). Also, can' + ' choose between fast or sensitive alignments. Please check Bowtie2' + ' manual for extra' + ' information: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml .' + ' This option should be provided between quotes and starting with' + ' an empty space (like --bowtieAlgo " --very-fast") or using equal' + ' sign (like --bowtieAlgo="--very-fast")', + required=False, default='--very-sensitive-local') + parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"', + help='Extra Bowtie2 options. This option should be provided between quotes and' + ' starting with an empty space (like --bowtieOPT " --no-mixed") or using' + ' equal sign (like --bowtieOPT="--no-mixed")', required=False) parser_optional_rematch.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files')