From e12bab469fd85fade700b3d7b701cefd8ce2ccb9 Mon Sep 17 00:00:00 2001 From: miguelpmachado Date: Tue, 12 Feb 2019 11:56:59 +0000 Subject: [PATCH 1/6] =?UTF-8?q?Change=20Bowtie=20-k=20option=20behaviour?= =?UTF-8?q?=20Only=20specify=20when=20num=5Fmap=5Floc=20>=201=20To=20ensur?= =?UTF-8?q?e=20>=20The=20best=20alignment=20found=20is=20reported=20(rando?= =?UTF-8?q?mly=20selected=20from=20among=20best=20if=20tied).=20And=20avoi?= =?UTF-8?q?d=20>=20The=20search=20terminates=20when=20it=20can=E2=80=99t?= =?UTF-8?q?=20find=20more=20distinct=20valid=20alignments,=20or=20when=20i?= =?UTF-8?q?t=20finds=20,=20whichever=20happens=20first.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ReMatCh/modules/rematch_module.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index c2d3f40..d86fe63 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -41,9 +41,13 @@ def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc, run_successfully = index_sequence_bowtie2(reference_file, threads) if run_successfully: - command = ['bowtie2', '-k', str(num_map_loc), '-q', bowtie_algorithm, '--threads', str(threads), '-x', + command = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x', reference_file, '', '--no-unal', '', '-S', sam_file] + if num_map_loc is not None and num_map_loc > 1: + command[1] = '-k' + command[2] = str(num_map_loc) + if len(fastq_files) == 1: command[9] = '-U ' + fastq_files[0] elif len(fastq_files) == 2: From d715d180b3dbcdd151899e0a15d6da560fd4379a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?In=C3=AAs=20Mendes?= Date: Fri, 12 Jul 2019 17:41:26 +0100 Subject: [PATCH 2/6] add saureus mlst schema (extendend) --- .../mlst_schemas/staphylococcus_aureus.fasta | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 
ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta diff --git a/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta b/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta new file mode 100644 index 0000000..c4f5e15 --- /dev/null +++ b/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta @@ -0,0 +1,85 @@ +>yqiL +TAATATGATTTGTTAAATGCATAACAAGAATGAAAATGTAACATACGTAGCAATTGGTTTCATAAATTGGATGTTAGTGG +CGTATTGGTTCATTAGACGTATTAGTAATAAAATTGTATATATCATAAGGAGATGAATATGACATGACGAGAGTCGTATT +AGCAGCAGCATACAGGACACCTATTGGCGTTTTTGGAGGTGCGTTTAAAGACGTGCCAGCCTATGATTTAGGTGCGACTT +TAATAGAACATATTATTAAAGAGACGGGTTTGAATCCAAGTGAGATTGATGAAGTTATCATCGGTAACGTACTACAAGCA +GGACAAGGACAAAATCCAGCACGAATTGCTGCTATGAAAGGTGGCTTGCCAGAAACAGTACCTGCATTTACGGTGAATAA +AGTATGTGGTTCTGGGTTAAAGTCGATTCAATTAGCATATCAATCTATTGTGACTGGTGAAAATGACATCGTGCTAGCTG +GCGGTATGGAGAATATGTCTCAATCACCAATGCTTGTCAACAACAGTCGCTTTGGTTTTAAAATGGGACATCAATCAATG +GTTGATAGCATGGTATATGATGGTTTAACAGATGTATTTAATCAATATCATATGGGTATTACTGCTGAAAATTTAGCAGA +GCAATATGGTATTTCAAGAGAAGAACAAGATACATTTGCTGTAAACTCACAACAAAAAGCAGTACGTGCACAGCAAAATG +GTGAATTTGATAGTGAAATAGTTCCAGTATCGATTCCTCAACGTAAAGGTGAACCAATCGTAGTCACTAAGGATGAAGGT +GTACGTGAAAATGTATCAGTCGAAAAATTAAGTCGATTAAGACCAGCTTTCAAAAAAGACGGTACAGTTACAGCAGGTAA +TGCATCAGGAATCAATGATGGTGCTGCGATGATGTT +>tpi +TAATTTGTGCACCAGCAATTCAATTAGATGCATTAACTACTGCAGTTAAAGAAGGAAAAGCACAAGGTTTAGAAATCGGT +GCTCAAAATACGTATTTCGAAGATAATGGTGCGTTCACAGGTGAAACGTCTCCAGTTGCATTAGCAGATTTAGGCGTTAA +ATACGTTGTTATCGGTCATTCTGAACGTCGTGAATTATTCCACGAAACAGATGAAGAAATTAACAAAAAAGCGCACGCTA +TTTTCAAACATGGAATGACTCCAATTATATGTGTTGGTGAAACAGACGAAGAGCGTGAAAGTGGTAAAGCTAACGATGTT +GTAGGTGAGCAAGTTAAGAAAGCTGTTGCAGGTTTATCTGAAGATCAACTTAAATCAGTTGTAATTGCTTATGAACCAAT +CTGGGCAATCGGAACTGGTAAATCATCAACATCTGAAGATGCAAATGAAATGTGTGCATTTGTACGTCAAACTATTGCTG +ACTTATCAAGCAAAGAAGTATCAGAAGCAACTCGTATTCAATATGGTGGTAGTGTTAAACCTAACAACATTAAAGAATAC +ATGGCACAAACTGATATTGATGGGGCATTAGTAGGTGGCGCATCACTTAAAGTTGAAGATTTCGTACAATTGTTAGAAGG 
+TGCAAAATAATCATGGCTAAGAAACCAACTGCGTTAATTATTTTAGATGGTTTTGCGAACCGCGAAAGCGAACATGGTAA +TGCGGTAAAATTAGCAAACAAGCCTAATTTTGATCGTTATTACAACAAATATCCAACGACTCAAATCGAAGCGAGTGGCT +TA +>pta +ACGATAAAATTATGATTACAAATTGGTGACGTGGCATTATGAAATAAAATGGCGTATAATTATACCGTGAATGATTAATA +AGATTTATATTACAGGAGGACATTATGGCTGATTTATTAAATGTATTAAAAGACAAACTTTCTGGTAAAAACGTTAAAAT +CGTATTACCTGAAGGAGAGGACGAACGTGTTCTAACAGCTGCAACACAATTACAAGCAACAGATTATGTTACACCAATCG +TGTTAGGTGATGAGACTAAGGTTCAATCTTTAGCGCAAAAACTTGATCTTGATATTTCTAATATTGAATTAATTAATCCT +GCGACAAGTGAATTGAAAGCTGAATTAGTTCAATCATTTGTTGAACGACGTAAAGGTAAAGCGACTGAAGAACAAGCACA +AGAATTATTAAACAATGTGAACTACTTCGGTACAATGCTTGTTTATGCTGGTAAAGCAGATGGTTTAGTTAGTGGTGCAG +CACATTCAACAGGCGACACTGTGCGTCCAGCTTTACAAATCATCAAAACGAAACCAGGTGTATCAAGAACATCAGGTATC +TTCTTTATGATTAAAGGTGATGAACAATACATCTTTGGTGATTGTGCAATCAATCCAGAACTTGATTCACAAGGACTTGC +AGAAATTGCAGTAGAAAGTGCAAAATCAGCATTAAGCTTTGGCATGGATCCAAAAGTTGCAATGTTAAGCTTTTCAACAA +AAGGGTCTGCTAAATCAGACGACGTGACAAAAGTTCAAGAAGCTGTCAAATTAGCACAACAAAAAGCTGAAGAAGAAAAA +TTAGAAGCAATCATTGATGGCGAATTCCAATTTGATGCTGCGATTGTACCAGGTGTTGCTGAGAAAAAAGCGCC +>gmk +GCAATATAACGATATTGTTAGACTTAATAGAAATTATGGCATGCAATTTCAAATATGCTATACAATATAAAGAACAATGT +GATATCATATTTAAATAATAGAAGATTAGCTTAGAGAGGTCGTAAGGCATGGATAATGAAAAAGGATTGTTAATCGTTTT +ATCAGGACCATCTGGAGTAGGTAAAGGTACTGTTAGAAAACGAATATTTGAAGATCCAAGTACATCATATAAGTATTCTA +TTTCAATGACAACACGTCAAATGCGTGAAGGTGAAGTTGATGGCGTAGATTACTTTTTTAAAACTAGGGATGCGTTTGAA +GCTTTAATCAAAGATGACCAATTTATAGAATATGCTGAATATGTAGGCAACTATTATGGTACACCAGTTCAATATGTTAA +AGATACAATGGACGAAGGTCATGATGTATTTTTAGAAATTGAAGTAGAAGGTGCAAAGCAAGTTAGAAAGAAATTTCCAG +ATGCGCTATTTATTTTCTTAGCACCTCCAAGTTTAGAACACTTGAGAGAGCGATTAGTAGGTAGAGGAACAGAATCTGAT +GAGAAAATACAAAGTCGTATTAACGAAGCGCGTAAAGAAGTTGAAATGATGAATTTATACGATTACGTTGTAGTTAATGA +TGAAGTAGAACTTGCGAAGAATAGAATTCAATGTATTGTAGAAGCTGAGCACTTAAAAAGAGAGCGCGTAGAAGCTAAGT +ATAGAAAAATGATTTTGGAGGCTAAAAAATAATGTTAAATCCACCTTTAAACCAATTAACGTCACAAATTAAATCAAAGT +ATTTAATTGCAACAACT +>glpF 
+CAAATATAATAAAAGTTAATACATAGAATAGAGACGGGAGATTTCTACGAGCCAAACTGCTAGTGTAGGAATCTCTTTGT +CTTTTTGGGAGGACATTTAATATGAATGTATATTTAGCAGAATTCCTAGGAACTGCAATCTTAATCCTTTTTGGTGGTGG +CGTTTGTGCCAATGTCAATTTAAAGAGAAGTGCTGCGAATGGTGCTGATTGGATTGTCATCACAGCTGGATGGGGATTAG +CGGTTACAATGGGTGTGTTTGCTGTCGGTCAATTCTCAGGTGCACATTTAAACCCAGCGGTGTCTTTAGCTCTTGCATTA +GACGGAAGTTTTGATTGGTCATTAGTTCCTGGTTATATTGTTGCTCAAATGTTAGGTGCAATTGTCGGAGCAACAATTGT +ATGGTTAATGTACTTGCCACATTGGAAAGCGACAGAAGAAGCTGGCGCGAAATTAGGTGTTTTCTCTACAGCACCGGCTA +TTAAGAATTACTTTGCCAACTTTTTAAGTGAGATTATCGGAACAATGGCATTAACTTTAGGTATTTTATTTATCGGTGTA +AACAAAATTGCCGATGGTTTAAATCCTTTAATTGTCGGAGCATTAATTGTTGCAATCGGATTAAGTTTAGGCGGTGCTAC +TGGTTATGCAATCAACCCAGCACGTGATTTAGGTCCGAGAATTGCACATGCGATTTTACCAATAGCTGGTAAAGGTGGTT +CAAATTGGTCATATGCAATCGTTCCTATCTTAGGACCAATTGCCGGTGGTTTATTAGGTGCAGTGGTATACGCTGTATTT +TATAAACATACATTTAATATTGGTTGTGCAATTGCAATTGTTGTAGTTATTATTACTTTGATTTT +>aroE +CCCTGGTAAACAAACATATCTAAGCCATTATAAATATGGTTTCCCTTGCGCTCTGCTTCCTCTAAAATAGGTGTTTTATA +CGGTATATAAACAATATCACTCATTAAAGTATTGGGAGAAAGATGCTTTAAATTAATAATACTTTCGTTATTTCCAGCCA +TACCCGCTGGTGTTGTATTAATAACGATATCGAATTCAGCTAAATACTTTTCAGCATCTGCTAATGAAATTTGGTTTATA +TTTAAATTCCAAGATTCAAAACGAGCCATCGTTCTATTCGCAACAGTTAATTTGGGCTTTACAAATTTTGCTAATTCATA +AGCAATACCTTTACTTGCACCACCTGCGCCCAAAATTAAAATGTATGCATTTTCTAAATCTGGATAAACGCTGTGCAATC +CTTTAACATAACCAATACCATCTGTATTATACCCTATCCACTTGCCATCTTTTATCAAAACAGTGTTAACTGCACCTGCA +TTAATCGCTTGTTCATCAACATAATCTAAATACGGTATGATACGTTCTTTATGAGGAATTGTGATATTAAAGCCTTCTAA +TTCTTTTTTCGAAATAATTTCTTTAATTAAATGAAAATCTTCAATTGGAATATTTAAAGCTTCATAAGTATCATCTAATC +CTAAAGAATTAAAATTTGCTCTATGCATAACGGGCGACAAGGAATGTGAAATAGGATTTCCTATAACTGCAAATTTCATT +TTTTTAATCACCTTATAAAATAGAATTTCTTAATACAACATCAACATTTTTAGGAACACGAACGATTACTTTAGCCCCTG +GTCCTATAGTTATAAAGCCTAGACCAGAGATCATAACATCGCGTTTCTCTTTGCCT +>arcC +ATTTTTGGCAACATCGATCCTTCCACAAACTTACCTTGTGCCGCGTATTTTTTCAGTGTTGCTACATCAATATCATCGAT +TTGTTGTTGATTAGGTTCATTAAAGTTAATAAATACATTTTCTACATTCGTAAGAATCATTAAGGTATCTGCTTCAATCA 
+GCGTTGCTAATTTCTCACTAGCAAAATCTTTATCTATAACCGCTTCAACACCTTCATAGGTATTTTCTTTTTTTATAACT +GGAATACCGCCACCACCGCATGCAATGACAATATTTTTACCGTCTGCTAAAGTTCGAATTAACTGGTGTTCTAGTATAGA +TTGAGGTAGTGGTGACGCAACTACTTTTCTATAACCACGTCCTGCATCTTCTTTAAAGACTGAGTCTGGCTGTTCTTTTT +GTAATTCTTCAACTTCTTCTTTCGTATAAAAAGGACCAATTGGTTTAGTTGGGTTATCAAATCGTGGATCATCTTTATCT +ACTTCCACACGTGTAACGATTGTGCCTACAGTTCTATCACTATTCATTTCAGTTAAAATGCGATTGATTTCAGTTTCCAA +CCAATAGCCTATCATACCCTGTGACATTGCACCACAAGTATCCAATGGCATTGCCGGCGTTGTGTCACTGTTCGATTTAG +CTTGTTGGATTAATAAACTTCCAATTTGTGGACCATTACCATGTGAAATCACAATACGCGCTGGTGAATCAAATAAAGGT +TTAAGGTTTTGCATCGCACATCTAATAGCTGTTTGTTGTGCTTCAGCTGTTGCTTCTGTTGTCTGTATCGCATTACCGCC +TAATGCAATGACAATTTTCTCTTTCATATTTTTGTCGCTCCTTTTAAAAAACATTT From 25e04096067739b2c14f39ed7997acc7a3e48a25 Mon Sep 17 00:00:00 2001 From: miguelpmachado Date: Tue, 23 Jul 2019 14:46:56 +0100 Subject: [PATCH 3/6] Add alternative VCF reader enconding --- ReMatCh/modules/rematch_module.py | 17 ++++++++++------- ReMatCh/modules/utils.py | 14 ++++++++------ ReMatCh/rematch.py | 23 +++++++++++++---------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index d86fe63..fc44338 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -317,26 +317,26 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): # Read vcf file class Vcf: - def __init__(self, vcf_file): - self.vcf = open(vcf_file, 'rtU') + def __init__(self, vcf_file, encoding=None): + self.vcf = open(vcf_file, 'rtU', encoding=encoding) self.line_read = self.vcf.readline() while self.line_read.startswith('#'): self.line_read = self.vcf.readline() self.line = self.line_read def readline(self): - self.line_stored = self.line + line_stored = self.line self.line = self.vcf.readline() - return self.line_stored + return line_stored def close(self): self.vcf.close() -def get_variants(gene_vcf): +def get_variants(gene_vcf, encoding=None): 
variants = {} - vfc_file = Vcf(gene_vcf) + vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding) line = vfc_file.readline() while len(line) > 0: fields = line.splitlines()[0].split('\t') @@ -928,7 +928,10 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter) if run_successfully: - variants = get_variants(gene_vcf) + try: + variants = get_variants(gene_vcf=gene_vcf, encoding=None) + except UnicodeDecodeError: + variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1') coverage = get_coverage(gene_coverage) diff --git a/ReMatCh/modules/utils.py b/ReMatCh/modules/utils.py index f21fa1a..6055351 100644 --- a/ReMatCh/modules/utils.py +++ b/ReMatCh/modules/utils.py @@ -1,11 +1,11 @@ import pickle -import traceback +from traceback import format_exception as traceback_format_exception import shlex import subprocess from threading import Timer import shutil import time -import functools +from functools import wraps as functools_wraps import os.path import sys @@ -221,7 +221,7 @@ def run_time(start_time): def timer(function, name): - @functools.wraps(function) + @functools_wraps(function) def wrapper(*args, **kwargs): print('\n' + 'RUNNING {0}\n'.format(name)) start_time = time.time() @@ -254,7 +254,7 @@ def extract_variable_from_pickle(pickleFile): def trace_unhandled_exceptions(func): - @functools.wraps(func) + @functools_wraps(func) def wrapped_func(*args, **kwargs): try: func(*args, **kwargs) @@ -263,8 +263,10 @@ def wrapped_func(*args, **kwargs): print(e) exc_type, exc_value, exc_tb = sys.exc_info() - # print(exc_value) - print(''.join(traceback.format_exception(exc_type, exc_value, exc_tb))) + print(''.join(traceback_format_exception(exc_type, exc_value, exc_tb))) + + raise exc_type(exc_value) + return wrapped_func diff --git a/ReMatCh/rematch.py b/ReMatCh/rematch.py index e15d23a..893c345 100755 --- a/ReMatCh/rematch.py +++ b/ReMatCh/rematch.py @@ 
-358,18 +358,21 @@ def run_rematch(args): args.softClip_recodeRun = 'first' if args.reference is None: - reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) - args.extraSeq = 200 - if reference_file is None: - print('It was not found provided MLST scheme sequences for ' + args.mlst) - print('Trying to obtain reference MLST sequences from PubMLST') - if len(mlst_sequences) > 0: - reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str) - args.extraSeq = 0 + if args.mlst is not None: + reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) + args.extraSeq = 200 + if reference_file is None: + print('It was not found provided MLST scheme sequences for ' + args.mlst) + print('Trying to obtain reference MLST sequences from PubMLST') + if len(mlst_sequences) > 0: + reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str) + args.extraSeq = 0 + else: + sys.exit('It was not possible to download MLST sequences from PubMLST!') else: - sys.exit('It was not possible to download MLST sequences from PubMLST!') + print('Using provided scheme as referece: ' + reference_file) else: - print('Using provided scheme as referece: ' + reference_file) + sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"') else: reference_file = os.path.abspath(args.reference.name) From fb0906b34b2d9ab07039652d9c3c946d79401a62 Mon Sep 17 00:00:00 2001 From: miguelpmachado Date: Tue, 23 Jul 2019 15:28:26 +0100 Subject: [PATCH 4/6] Print informative action --- ReMatCh/modules/rematch_module.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index fc44338..6545ebc 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -931,6 +931,8 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, 
counter, refer try: variants = get_variants(gene_vcf=gene_vcf, encoding=None) except UnicodeDecodeError: + print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' + ' "latin_1" encoding: {}'.format(gene_vcf)) variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1') coverage = get_coverage(gene_coverage) From a2c0d15f3a90d31dfd123c55b57a9f93ccc2d5cc Mon Sep 17 00:00:00 2001 From: miguelpmachado Date: Wed, 24 Jul 2019 09:27:08 +0100 Subject: [PATCH 5/6] Avoid encoding problems that arise beyond the reference sequence --- ReMatCh/modules/checkMLST.py | 2 +- ReMatCh/modules/rematch_module.py | 67 ++++++++++++++++++++++--------- ReMatCh/rematch.py | 4 +- 3 files changed, 52 insertions(+), 21 deletions(-) diff --git a/ReMatCh/modules/checkMLST.py b/ReMatCh/modules/checkMLST.py index f7aba6c..1abf304 100644 --- a/ReMatCh/modules/checkMLST.py +++ b/ReMatCh/modules/checkMLST.py @@ -116,7 +116,7 @@ def download_pub_mlst_xml(originalSpecies, schema_number, outdir): success = 0 for scheme in tree.findall('species'): - species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) + species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1) number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 species_scheme = species_scheme[0] if determine_species(species_scheme) == determine_species(originalSpecies): diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index 6545ebc..6e4a5f8 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -3,10 +3,10 @@ import functools import sys import pickle +from . 
import utils # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))) -import utils def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true): @@ -185,7 +185,7 @@ def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_ba soft_clip_cigar_flag_recode): lines_sam = [] for line in line_collection: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: if line.startswith('@'): lines_sam.append(line) @@ -317,10 +317,15 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): # Read vcf file class Vcf: - def __init__(self, vcf_file, encoding=None): - self.vcf = open(vcf_file, 'rtU', encoding=encoding) + def __init__(self, vcf_file, encoding=None, newline=None): + self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline) self.line_read = self.vcf.readline() + self.contigs_info_dict = {} while self.line_read.startswith('#'): + if self.line_read.startswith('##contig=<ID='): + seq = self.line_read.split('=')[2].split(',')[0] + seq_len = self.line_read.split('=')[3].split('>')[0] + self.contigs_info_dict[seq] = int(seq_len) self.line_read = self.vcf.readline() self.line = self.line_read @@ -332,24 +337,34 @@ def readline(self): def close(self): self.vcf.close() + def get_contig_legth(self, contig): + return self.contigs_info_dict[contig] -def get_variants(gene_vcf, encoding=None): + +def get_variants(gene_vcf, seq_name, encoding=None, newline=None): variants = {} - vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding) + vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline) line = vfc_file.readline() + counter = 1 while len(line) > 0: - fields = line.splitlines()[0].split('\t') + fields = line.rstrip('\r\n').split('\t') if len(fields) > 0: fields[1] = int(fields[1]) info_field = {} - for i in fields[7].split(';'): - i = i.split('=') - if len(i) > 1: - info_field[i[0]] = i[1] + try: + for i in fields[7].split(';'): + i = i.split('=') + if len(i) > 1: + 
info_field[i[0]] = i[1] + else: + info_field[i[0]] = None + except IndexError: + if counter > vfc_file.get_contig_legth(contig=seq_name): + break else: - info_field[i[0]] = None + raise IndexError format_field = {} format_field_name = fields[8].split(':') @@ -365,7 +380,15 @@ def get_variants(gene_vcf, encoding=None): else: variants[fields[1]] = {0: fields_to_store} - line = vfc_file.readline() + try: + line = vfc_file.readline() + except UnicodeDecodeError: + if counter + 1 > vfc_file.get_contig_legth(contig=seq_name): + break + else: + raise UnicodeDecodeError + + counter += 1 vfc_file.close() return variants @@ -785,7 +808,7 @@ def get_coverage(gene_coverage): with open(gene_coverage, 'rtU') as reader: for line in reader: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: line = line.split('\t') coverage[int(line[1])] = int(line[2]) @@ -929,11 +952,19 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer if run_successfully: try: - variants = get_variants(gene_vcf=gene_vcf, encoding=None) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding=sys.getdefaultencoding()) except UnicodeDecodeError: - print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' - ' "latin_1" encoding: {}'.format(gene_vcf)) - variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1') + try: + print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' + ' "utf_8" encoding: {}'.format(gene_vcf)) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding='utf_8') + except UnicodeDecodeError: + print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' + ' "latin_1" encoding: {}'.format(gene_vcf)) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding='latin_1') coverage = 
get_coverage(gene_coverage) diff --git a/ReMatCh/rematch.py b/ReMatCh/rematch.py index 893c345..783c28e 100755 --- a/ReMatCh/rematch.py +++ b/ReMatCh/rematch.py @@ -103,7 +103,7 @@ def get_list_ids_from_file(file_list_ids): with open(file_list_ids, 'rtU') as lines: for line in lines: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: list_ids.append(line) @@ -119,7 +119,7 @@ def get_taxon_run_ids(taxon_name, outputfile): run_ids = [] with open(outputfile, 'rtU') as reader: for line in reader: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: if not line.startswith('#'): line = line.split('\t') From 2d101702e72505b40259b1b9a9e69f3b839d013e Mon Sep 17 00:00:00 2001 From: cimendes Date: Wed, 24 Jul 2019 14:17:57 +0100 Subject: [PATCH 6/6] update aroE and arcC alleles --- .../mlst_schemas/staphylococcus_aureus.fasta | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta b/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta index c4f5e15..08084fd 100644 --- a/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta +++ b/ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta @@ -62,24 +62,24 @@ TATAAACATACATTTAATATTGGTTGTGCAATTGCAATTGTTGTAGTTATTATTACTTTGATTTT >aroE CCCTGGTAAACAAACATATCTAAGCCATTATAAATATGGTTTCCCTTGCGCTCTGCTTCCTCTAAAATAGGTGTTTTATA CGGTATATAAACAATATCACTCATTAAAGTATTGGGAGAAAGATGCTTTAAATTAATAATACTTTCGTTATTTCCAGCCA -TACCCGCTGGTGTTGTATTAATAACGATATCGAATTCAGCTAAATACTTTTCAGCATCTGCTAATGAAATTTGGTTTATA -TTTAAATTCCAAGATTCAAAACGAGCCATCGTTCTATTCGCAACAGTTAATTTGGGCTTTACAAATTTTGCTAATTCATA -AGCAATACCTTTACTTGCACCACCTGCGCCCAAAATTAAAATGTATGCATTTTCTAAATCTGGATAAACGCTGTGCAATC -CTTTAACATAACCAATACCATCTGTATTATACCCTATCCACTTGCCATCTTTTATCAAAACAGTGTTAACTGCACCTGCA -TTAATCGCTTGTTCATCAACATAATCTAAATACGGTATGATACGTTCTTTATGAGGAATTGTGATATTAAAGCCTTCTAA -TTCTTTTTTCGAAATAATTTCTTTAATTAAATGAAAATCTTCAATTGGAATATTTAAAGCTTCATAAGTATCATCTAATC 
-CTAAAGAATTAAAATTTGCTCTATGCATAACGGGCGACAAGGAATGTGAAATAGGATTTCCTATAACTGCAAATTTCATT +TACCCGCTGGTGTTGTATTAATAACGATATCGAATTCAGCAATTTTAATTCTTTAGGATTAGATGATACTTATGAAGCTT +TAAATATTCCAATTGAAGATTTTCATTTAATTAAAGAAATTATTTCGAAAAAAGAATTAGATGGCTTTAATATCACAATT +CCTCATAAAGAACGTATCATACCGTATTTAGATCATGTTGATGAACAAGCGATTAATGCAGGTGCAGTTAACACTGTTTT +GATAAAAGATGACAAGTGGATAGGGTATAATACAGATGGTATTGGTTATGTTAAAGGATTGCACAGCGTTTATCCAGATT +TAGAAAATGCATACATTTTAATTTTGGGCGCAGGTGGTGCAAGTAAAGGTATTGCTTATGAATTAGCAAAATTTGTAAAG +CCCAAATTAACTGTTGCGAATAGAACGATGGCTCGTTTTGAATCTTGGAATTTAAATATAAACCAAATTTCATTAGCAGA +TGCTGAAAAGTATTTATGCTCTATGCATAACGGGCGACAAGGAATGTGAAATAGGATTTCCTATAACTGCAAATTTCATT TTTTTAATCACCTTATAAAATAGAATTTCTTAATACAACATCAACATTTTTAGGAACACGAACGATTACTTTAGCCCCTG GTCCTATAGTTATAAAGCCTAGACCAGAGATCATAACATCGCGTTTCTCTTTGCCT >arcC -ATTTTTGGCAACATCGATCCTTCCACAAACTTACCTTGTGCCGCGTATTTTTTCAGTGTTGCTACATCAATATCATCGAT -TTGTTGTTGATTAGGTTCATTAAAGTTAATAAATACATTTTCTACATTCGTAAGAATCATTAAGGTATCTGCTTCAATCA -GCGTTGCTAATTTCTCACTAGCAAAATCTTTATCTATAACCGCTTCAACACCTTCATAGGTATTTTCTTTTTTTATAACT -GGAATACCGCCACCACCGCATGCAATGACAATATTTTTACCGTCTGCTAAAGTTCGAATTAACTGGTGTTCTAGTATAGA -TTGAGGTAGTGGTGACGCAACTACTTTTCTATAACCACGTCCTGCATCTTCTTTAAAGACTGAGTCTGGCTGTTCTTTTT -GTAATTCTTCAACTTCTTCTTTCGTATAAAAAGGACCAATTGGTTTAGTTGGGTTATCAAATCGTGGATCATCTTTATCT -ACTTCCACACGTGTAACGATTGTGCCTACAGTTCTATCACTATTCATTTCAGTTAAAATGCGATTGATTTCAGTTTCCAA -CCAATAGCCTATCATACCCTGTGACATTGCACCACAAGTATCCAATGGCATTGCCGGCGTTGTGTCACTGTTCGATTTAG -CTTGTTGGATTAATAAACTTCCAATTTGTGGACCATTACCATGTGAAATCACAATACGCGCTGGTGAATCAAATAAAGGT -TTAAGGTTTTGCATCGCACATCTAATAGCTGTTTGTTGTGCTTCAGCTGTTGCTTCTGTTGTCTGTATCGCATTACCGCC -TAATGCAATGACAATTTTCTCTTTCATATTTTTGTCGCTCCTTTTAAAAAACATTT +ATTTTTGGCAACATCGATCCTTCCACAAACTTACCTTGTGCCGCGTATTTTTTCAGTGTTGCTACATCAATATCATCGATT +TGTTGTTGATTAGGTTCATTAAAGTTAATAAATACATTTTCTACATTCGTAAGAATCATTAAGGTATCTGCTTCAATCAGC +GTTGCTAATTTCTCACTAGCAAAATCTTTATCTATAACTTATTAATCCAACAAGCTAAATCGAACAGTGACACAACGCCGG 
+CAATGCCATTGGATACTTGTGGTGCAATGTCACAGGGTATGATAGGCTATTGGTTGGAAACTGAAATCAATCGCATTTTAA +CTGAAATGAATAGTGATAGAACTGTAGGCACAATCGTTACACGTGTGGAAGTAGATAAAGATGATCCACGATTCAATAACC +CAACCAAACCAATTGGTCCTTTTTATACGAAAGAAGAAGTTGAAGAATTACAAAAAGAACAGCCAGACTCAGTCTTTAAAG +AAGATGCAGGACGTGGTTATAGAAAAGTAGTTGCGTCACCACTACCTCAATCTATACTAGAACACCAGTTAATTCGAACTT +TAGCAGACGGTAAAAATATTGTCATTGCATGCGGTGGTGGCGGTATTCCAGTTATAAAAAAAGAAAATACCTATGAAGGTG +TTGAAGCGACTTCCAATTTGTGGACCATTACCATGTGAAATCACAATACGCGCTGGTGAATCAAATAAAGGTTTAAGGTTT +TGCATCGCACATCTAATAGCTGTTTGTTGTGCTTCAGCTGTTGCTTCTGTTGTCTGTATCGCATTACCGCCTAATGCAATG +ACAATTTTCTCTTTCATATTTTTGTCGCTCCTTTTAAAAAACATTT \ No newline at end of file