From a2c0d15f3a90d31dfd123c55b57a9f93ccc2d5cc Mon Sep 17 00:00:00 2001 From: miguelpmachado Date: Wed, 24 Jul 2019 09:27:08 +0100 Subject: [PATCH] Avoid encoding problems that arise beyond the reference sequence --- ReMatCh/modules/checkMLST.py | 2 +- ReMatCh/modules/rematch_module.py | 67 ++++++++++++++++++++++--------- ReMatCh/rematch.py | 4 +- 3 files changed, 52 insertions(+), 21 deletions(-) diff --git a/ReMatCh/modules/checkMLST.py b/ReMatCh/modules/checkMLST.py index f7aba6c..1abf304 100644 --- a/ReMatCh/modules/checkMLST.py +++ b/ReMatCh/modules/checkMLST.py @@ -116,7 +116,7 @@ def download_pub_mlst_xml(originalSpecies, schema_number, outdir): success = 0 for scheme in tree.findall('species'): - species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) + species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1) number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 species_scheme = species_scheme[0] if determine_species(species_scheme) == determine_species(originalSpecies): diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py index 6545ebc..6e4a5f8 100644 --- a/ReMatCh/modules/rematch_module.py +++ b/ReMatCh/modules/rematch_module.py @@ -3,10 +3,10 @@ import functools import sys import pickle +from . import utils # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change sys.path.insert(0, os.path.dirname(os.path.realpath(__file__))) -import utils def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true): @@ -185,7 +185,7 @@ def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_ba soft_clip_cigar_flag_recode): lines_sam = [] for line in line_collection: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: if line.startswith('@'): lines_sam.append(line) @@ -317,10 +317,15 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file): # Read vcf file class Vcf: - def __init__(self, vcf_file, encoding=None): - self.vcf = open(vcf_file, 'rtU', encoding=encoding) + def __init__(self, vcf_file, encoding=None, newline=None): + self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline) self.line_read = self.vcf.readline() + self.contigs_info_dict = {} while self.line_read.startswith('#'): + if self.line_read.startswith('##contig=')[0] + self.contigs_info_dict[seq] = int(seq_len) self.line_read = self.vcf.readline() self.line = self.line_read @@ -332,24 +337,34 @@ def readline(self): def close(self): self.vcf.close() + def get_contig_legth(self, contig): + return self.contigs_info_dict[contig] -def get_variants(gene_vcf, encoding=None): + +def get_variants(gene_vcf, seq_name, encoding=None, newline=None): variants = {} - vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding) + vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline) line = vfc_file.readline() + counter = 1 while len(line) > 0: - fields = line.splitlines()[0].split('\t') + fields = line.rstrip('\r\n').split('\t') if len(fields) > 0: fields[1] = int(fields[1]) info_field = {} - for i in fields[7].split(';'): - i = i.split('=') - if len(i) > 1: - info_field[i[0]] = i[1] + try: + for i in fields[7].split(';'): + i = i.split('=') + if len(i) > 1: + info_field[i[0]] = i[1] + else: + info_field[i[0]] = None + except IndexError: + if counter > vfc_file.get_contig_legth(contig=seq_name): + break else: - info_field[i[0]] = None + raise IndexError format_field = {} format_field_name = fields[8].split(':') @@ -365,7 +380,15 @@ def get_variants(gene_vcf, encoding=None): else: variants[fields[1]] = {0: fields_to_store} - line = vfc_file.readline() + try: + line = vfc_file.readline() + except UnicodeDecodeError: + if counter + 1 > vfc_file.get_contig_legth(contig=seq_name): + break + else: + raise UnicodeDecodeError + + counter += 1 vfc_file.close() return variants @@ -785,7 +808,7 @@ def get_coverage(gene_coverage): with open(gene_coverage, 'rtU') as reader: for line in reader: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: line = line.split('\t') coverage[int(line[1])] = int(line[2]) @@ -929,11 +952,19 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer if run_successfully: try: - variants = get_variants(gene_vcf=gene_vcf, encoding=None) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding=sys.getdefaultencoding()) except UnicodeDecodeError: - print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' - ' "latin_1" encoding: {}'.format(gene_vcf)) - variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1') + try: + print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' + ' "utf_8" encoding: {}'.format(gene_vcf)) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding='utf_8') + except UnicodeDecodeError: + print('It was found an enconding error while parsing the following VCF, but lets try forcing it to' + ' "latin_1" encoding: {}'.format(gene_vcf)) + variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'], + encoding='latin_1') coverage = get_coverage(gene_coverage) diff --git a/ReMatCh/rematch.py b/ReMatCh/rematch.py index 893c345..783c28e 100755 --- a/ReMatCh/rematch.py +++ b/ReMatCh/rematch.py @@ -103,7 +103,7 @@ def get_list_ids_from_file(file_list_ids): with open(file_list_ids, 'rtU') as lines: for line in lines: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: list_ids.append(line) @@ -119,7 +119,7 @@ def get_taxon_run_ids(taxon_name, outputfile): run_ids = [] with open(outputfile, 'rtU') as reader: for line in reader: - line = line.splitlines()[0] + line = line.rstrip('\r\n') if len(line) > 0: if not line.startswith('#'): line = line.split('\t')