Avoid encoding problems that arise beyond the reference sequence
miguelpmachado committed Jul 24, 2019
1 parent 4d7a059 commit a2c0d15
Showing 3 changed files with 52 additions and 21 deletions.
ReMatCh/modules/checkMLST.py: 2 changes (1 addition & 1 deletion)
@@ -116,7 +116,7 @@ def download_pub_mlst_xml(originalSpecies, schema_number, outdir):

     success = 0
     for scheme in tree.findall('species'):
-        species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
+        species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1)
         number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1
         species_scheme = species_scheme[0]
         if determine_species(species_scheme) == determine_species(originalSpecies):
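The splitlines()[0] to rstrip('\r\n') swap that repeats throughout this commit is the core of the fix. A minimal sketch of the difference (the sample strings below are illustrative, not taken from the repository): str.splitlines() breaks on every Unicode line boundary, including characters such as U+0085 that can appear when raw bytes are decoded with the wrong codec, and it raises IndexError on an empty string, whereas rstrip('\r\n') only trims a trailing CR/LF.

# Illustrative comparison of the two line-trimming approaches used before and after this commit.
samples = [
    'Streptococcus agalactiae#1\r\n',   # plain case: both approaches agree
    'Streptococcus\x85agalactiae#1\n',  # mis-decoded byte appears as U+0085, a line boundary for splitlines()
    '',                                 # empty string: splitlines()[0] raises IndexError
]

for text in samples:
    trimmed = text.rstrip('\r\n')          # only removes trailing CR/LF characters
    try:
        first_line = text.splitlines()[0]  # splits on every Unicode line boundary
    except IndexError:
        first_line = '<IndexError>'
    print(repr(trimmed), repr(first_line))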
ReMatCh/modules/rematch_module.py: 67 changes (49 additions & 18 deletions)
@@ -3,10 +3,10 @@
import functools
import sys
import pickle
from . import utils

# https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
import utils


def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
@@ -185,7 +185,7 @@ def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_ba
                                       soft_clip_cigar_flag_recode):
     lines_sam = []
     for line in line_collection:
-        line = line.splitlines()[0]
+        line = line.rstrip('\r\n')
         if len(line) > 0:
             if line.startswith('@'):
                 lines_sam.append(line)
@@ -317,10 +317,15 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):

 # Read vcf file
 class Vcf:
-    def __init__(self, vcf_file, encoding=None):
-        self.vcf = open(vcf_file, 'rtU', encoding=encoding)
+    def __init__(self, vcf_file, encoding=None, newline=None):
+        self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
         self.line_read = self.vcf.readline()
+        self.contigs_info_dict = {}
         while self.line_read.startswith('#'):
+            if self.line_read.startswith('##contig=<ID='):
+                seq = self.line_read.split('=')[2].split(',')[0]
+                seq_len = self.line_read.split('=')[3].split('>')[0]
+                self.contigs_info_dict[seq] = int(seq_len)
             self.line_read = self.vcf.readline()
         self.line = self.line_read
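For context, the ##contig header lines that the new Vcf.__init__ inspects have the form ##contig=<ID=name,length=N>, and the split-based parsing above pulls out the ID and the length. A small standalone sketch (the header line is made up for illustration):

# Illustration of the ##contig parsing added above; the sample header line is invented.
line_read = '##contig=<ID=gene_1,length=1542>\n'

seq = line_read.split('=')[2].split(',')[0]        # 'gene_1'
seq_len = line_read.split('=')[3].split('>')[0]    # '1542'

contigs_info_dict = {seq: int(seq_len)}
assert contigs_info_dict == {'gene_1': 1542}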

@@ -332,24 +337,34 @@ def readline(self):
     def close(self):
         self.vcf.close()
 
+    def get_contig_legth(self, contig):
+        return self.contigs_info_dict[contig]
 
-def get_variants(gene_vcf, encoding=None):
+
+def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
     variants = {}
 
-    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding)
+    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
     line = vfc_file.readline()
+    counter = 1
     while len(line) > 0:
-        fields = line.splitlines()[0].split('\t')
+        fields = line.rstrip('\r\n').split('\t')
         if len(fields) > 0:
             fields[1] = int(fields[1])
 
             info_field = {}
-            for i in fields[7].split(';'):
-                i = i.split('=')
-                if len(i) > 1:
-                    info_field[i[0]] = i[1]
+            try:
+                for i in fields[7].split(';'):
+                    i = i.split('=')
+                    if len(i) > 1:
+                        info_field[i[0]] = i[1]
+                    else:
+                        info_field[i[0]] = None
+            except IndexError:
+                if counter > vfc_file.get_contig_legth(contig=seq_name):
+                    break
                 else:
-                    info_field[i[0]] = None
+                    raise IndexError
 
             format_field = {}
             format_field_name = fields[8].split(':')
@@ -365,7 +380,15 @@ def get_variants(gene_vcf, encoding=None):
             else:
                 variants[fields[1]] = {0: fields_to_store}
 
-        line = vfc_file.readline()
+        try:
+            line = vfc_file.readline()
+        except UnicodeDecodeError:
+            if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
+                break
+            else:
+                raise
+
+        counter += 1
     vfc_file.close()
 
     return variants
@@ -785,7 +808,7 @@ def get_coverage(gene_coverage):

     with open(gene_coverage, 'rtU') as reader:
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 line = line.split('\t')
                 coverage[int(line[1])] = int(line[2])
@@ -929,11 +952,19 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer

     if run_successfully:
         try:
-            variants = get_variants(gene_vcf=gene_vcf, encoding=None)
+            variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                    encoding=sys.getdefaultencoding())
         except UnicodeDecodeError:
-            print('An encoding error was found while parsing the following VCF, trying to force it to'
-                  ' "latin_1" encoding: {}'.format(gene_vcf))
-            variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1')
+            try:
+                print('An encoding error was found while parsing the following VCF, trying to force it to'
+                      ' "utf_8" encoding: {}'.format(gene_vcf))
+                variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                        encoding='utf_8')
+            except UnicodeDecodeError:
+                print('An encoding error was found while parsing the following VCF, trying to force it to'
+                      ' "latin_1" encoding: {}'.format(gene_vcf))
+                variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                        encoding='latin_1')
 
         coverage = get_coverage(gene_coverage)

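Taken together, the changes to this module build a retry ladder around VCF parsing: try the interpreter's default encoding first, then 'utf_8', then 'latin_1', which can decode any byte sequence (on most systems the default already is UTF-8, so the middle tier mainly matters for unusual interpreter configurations). A simplified standalone sketch of that pattern, with illustrative names (parse_vcf_lines and parse_with_fallback are not part of the module's API):

import sys


def parse_vcf_lines(vcf_path, encoding):
    # Toy stand-in for get_variants(): just counts the non-header records.
    records = 0
    with open(vcf_path, 'rt', encoding=encoding, newline=None) as handle:
        for line in handle:
            if not line.startswith('#'):
                records += 1
    return records


def parse_with_fallback(vcf_path, encodings=(sys.getdefaultencoding(), 'utf_8', 'latin_1')):
    # Try each encoding in turn; latin_1 maps every possible byte, so it is the last resort.
    last_error = None
    for encoding in encodings:
        try:
            return parse_vcf_lines(vcf_path, encoding)
        except UnicodeDecodeError as error:
            last_error = error
    raise last_error


# Usage with a hypothetical file:
# record_count = parse_with_fallback('sample.vcf')

In the committed code the retry is additionally bounded by the contig length taken from the ##contig headers: decode or index errors at positions beyond the reference sequence simply stop the parse instead of aborting the run, which is what the commit title refers to.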
ReMatCh/rematch.py: 4 changes (2 additions & 2 deletions)
@@ -103,7 +103,7 @@ def get_list_ids_from_file(file_list_ids):

     with open(file_list_ids, 'rtU') as lines:
         for line in lines:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 list_ids.append(line)

Expand All @@ -119,7 +119,7 @@ def get_taxon_run_ids(taxon_name, outputfile):
     run_ids = []
     with open(outputfile, 'rtU') as reader:
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 if not line.startswith('#'):
                     line = line.split('\t')
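One smaller detail: the Vcf reader now opens files with mode 'rt' and an explicit newline argument instead of the old 'rtU'. In Python 3 the 'U' flag is a deprecated no-op (newer interpreters reject it outright), so the two spellings behave the same; the context lines that still use 'rtU' elsewhere are untouched by this commit. A quick sketch with a hypothetical file name:

# Universal-newline reading in Python 3; newline=None is also the default for text mode.
with open('example.txt', 'rt', newline=None) as handle:   # '\r', '\r\n' and '\n' all become '\n'
    content = handle.read()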
