From a2c0d15f3a90d31dfd123c55b57a9f93ccc2d5cc Mon Sep 17 00:00:00 2001
From: miguelpmachado <mpmachado@fc.ul.pt>
Date: Wed, 24 Jul 2019 09:27:08 +0100
Subject: [PATCH] Avoid encoding problems that arise beyond the reference
 sequence

---
 ReMatCh/modules/checkMLST.py      |  2 +-
 ReMatCh/modules/rematch_module.py | 67 ++++++++++++++++++++++---------
 ReMatCh/rematch.py                |  4 +-
 3 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/ReMatCh/modules/checkMLST.py b/ReMatCh/modules/checkMLST.py
index f7aba6c..1abf304 100644
--- a/ReMatCh/modules/checkMLST.py
+++ b/ReMatCh/modules/checkMLST.py
@@ -116,7 +116,7 @@ def download_pub_mlst_xml(originalSpecies, schema_number, outdir):
 
     success = 0
     for scheme in tree.findall('species'):
-        species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
+        species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1)
         number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1
         species_scheme = species_scheme[0]
         if determine_species(species_scheme) == determine_species(originalSpecies):
diff --git a/ReMatCh/modules/rematch_module.py b/ReMatCh/modules/rematch_module.py
index 6545ebc..6e4a5f8 100644
--- a/ReMatCh/modules/rematch_module.py
+++ b/ReMatCh/modules/rematch_module.py
@@ -3,10 +3,10 @@
 import functools
 import sys
 import pickle
+from . import utils
 
 # https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
-import utils
 
 
 def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
@@ -185,7 +185,7 @@ def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_ba
                                       soft_clip_cigar_flag_recode):
     lines_sam = []
     for line in line_collection:
-        line = line.splitlines()[0]
+        line = line.rstrip('\r\n')
         if len(line) > 0:
             if line.startswith('@'):
                 lines_sam.append(line)
@@ -317,10 +317,15 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):
 
 # Read vcf file
 class Vcf:
-    def __init__(self, vcf_file, encoding=None):
-        self.vcf = open(vcf_file, 'rtU', encoding=encoding)
+    def __init__(self, vcf_file, encoding=None, newline=None):
+        self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
         self.line_read = self.vcf.readline()
+        self.contigs_info_dict = {}
         while self.line_read.startswith('#'):
+            if self.line_read.startswith('##contig=<ID='):
+                seq = self.line_read.split('=')[2].split(',')[0]
+                seq_len = self.line_read.split('=')[3].split('>')[0]
+                self.contigs_info_dict[seq] = int(seq_len)
             self.line_read = self.vcf.readline()
         self.line = self.line_read
 
@@ -332,24 +337,34 @@ def readline(self):
     def close(self):
         self.vcf.close()
 
+    def get_contig_legth(self, contig):
+        return self.contigs_info_dict[contig]
 
-def get_variants(gene_vcf, encoding=None):
+
+def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
     variants = {}
 
-    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding)
+    vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
     line = vfc_file.readline()
+    counter = 1
     while len(line) > 0:
-        fields = line.splitlines()[0].split('\t')
+        fields = line.rstrip('\r\n').split('\t')
         if len(fields) > 0:
             fields[1] = int(fields[1])
 
             info_field = {}
-            for i in fields[7].split(';'):
-                i = i.split('=')
-                if len(i) > 1:
-                    info_field[i[0]] = i[1]
+            try:
+                for i in fields[7].split(';'):
+                    i = i.split('=')
+                    if len(i) > 1:
+                        info_field[i[0]] = i[1]
+                    else:
+                        info_field[i[0]] = None
+            except IndexError:
+                if counter > vfc_file.get_contig_legth(contig=seq_name):
+                    break
                 else:
-                    info_field[i[0]] = None
+                    raise IndexError
 
             format_field = {}
             format_field_name = fields[8].split(':')
@@ -365,7 +380,15 @@ def get_variants(gene_vcf, encoding=None):
             else:
                 variants[fields[1]] = {0: fields_to_store}
 
-        line = vfc_file.readline()
+        try:
+            line = vfc_file.readline()
+        except UnicodeDecodeError:
+            if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
+                break
+            else:
+                raise UnicodeDecodeError
+
+        counter += 1
     vfc_file.close()
 
     return variants
@@ -785,7 +808,7 @@ def get_coverage(gene_coverage):
 
     with open(gene_coverage, 'rtU') as reader:
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 line = line.split('\t')
                 coverage[int(line[1])] = int(line[2])
@@ -929,11 +952,19 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer
 
         if run_successfully:
             try:
-                variants = get_variants(gene_vcf=gene_vcf, encoding=None)
+                variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                        encoding=sys.getdefaultencoding())
             except UnicodeDecodeError:
-                print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
-                      ' "latin_1" encoding: {}'.format(gene_vcf))
-                variants = get_variants(gene_vcf=gene_vcf, encoding='latin_1')
+                try:
+                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
+                          ' "utf_8" encoding: {}'.format(gene_vcf))
+                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                            encoding='utf_8')
+                except UnicodeDecodeError:
+                    print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
+                          ' "latin_1" encoding: {}'.format(gene_vcf))
+                    variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
+                                            encoding='latin_1')
 
             coverage = get_coverage(gene_coverage)
 
diff --git a/ReMatCh/rematch.py b/ReMatCh/rematch.py
index 893c345..783c28e 100755
--- a/ReMatCh/rematch.py
+++ b/ReMatCh/rematch.py
@@ -103,7 +103,7 @@ def get_list_ids_from_file(file_list_ids):
 
     with open(file_list_ids, 'rtU') as lines:
         for line in lines:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 list_ids.append(line)
 
@@ -119,7 +119,7 @@ def get_taxon_run_ids(taxon_name, outputfile):
     run_ids = []
     with open(outputfile, 'rtU') as reader:
         for line in reader:
-            line = line.splitlines()[0]
+            line = line.rstrip('\r\n')
             if len(line) > 0:
                 if not line.startswith('#'):
                     line = line.split('\t')