Skip to content

Commit

Permalink
Merge pull request #24 from B-UMMI/dev
Browse files Browse the repository at this point in the history
Add Saureus mlst and avoid VCF enconding problems
  • Loading branch information
cimendes authored Aug 5, 2019
2 parents 8b7f554 + 2d10170 commit 4ff4508
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 37 deletions.
2 changes: 1 addition & 1 deletion ReMatCh/modules/checkMLST.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def download_pub_mlst_xml(originalSpecies, schema_number, outdir):

success = 0
for scheme in tree.findall('species'):
species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1)
number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1
species_scheme = species_scheme[0]
if determine_species(species_scheme) == determine_species(originalSpecies):
Expand Down
85 changes: 85 additions & 0 deletions ReMatCh/modules/mlst_schemas/staphylococcus_aureus.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
>yqiL
TAATATGATTTGTTAAATGCATAACAAGAATGAAAATGTAACATACGTAGCAATTGGTTTCATAAATTGGATGTTAGTGG
CGTATTGGTTCATTAGACGTATTAGTAATAAAATTGTATATATCATAAGGAGATGAATATGACATGACGAGAGTCGTATT
AGCAGCAGCATACAGGACACCTATTGGCGTTTTTGGAGGTGCGTTTAAAGACGTGCCAGCCTATGATTTAGGTGCGACTT
TAATAGAACATATTATTAAAGAGACGGGTTTGAATCCAAGTGAGATTGATGAAGTTATCATCGGTAACGTACTACAAGCA
GGACAAGGACAAAATCCAGCACGAATTGCTGCTATGAAAGGTGGCTTGCCAGAAACAGTACCTGCATTTACGGTGAATAA
AGTATGTGGTTCTGGGTTAAAGTCGATTCAATTAGCATATCAATCTATTGTGACTGGTGAAAATGACATCGTGCTAGCTG
GCGGTATGGAGAATATGTCTCAATCACCAATGCTTGTCAACAACAGTCGCTTTGGTTTTAAAATGGGACATCAATCAATG
GTTGATAGCATGGTATATGATGGTTTAACAGATGTATTTAATCAATATCATATGGGTATTACTGCTGAAAATTTAGCAGA
GCAATATGGTATTTCAAGAGAAGAACAAGATACATTTGCTGTAAACTCACAACAAAAAGCAGTACGTGCACAGCAAAATG
GTGAATTTGATAGTGAAATAGTTCCAGTATCGATTCCTCAACGTAAAGGTGAACCAATCGTAGTCACTAAGGATGAAGGT
GTACGTGAAAATGTATCAGTCGAAAAATTAAGTCGATTAAGACCAGCTTTCAAAAAAGACGGTACAGTTACAGCAGGTAA
TGCATCAGGAATCAATGATGGTGCTGCGATGATGTT
>tpi
TAATTTGTGCACCAGCAATTCAATTAGATGCATTAACTACTGCAGTTAAAGAAGGAAAAGCACAAGGTTTAGAAATCGGT
GCTCAAAATACGTATTTCGAAGATAATGGTGCGTTCACAGGTGAAACGTCTCCAGTTGCATTAGCAGATTTAGGCGTTAA
ATACGTTGTTATCGGTCATTCTGAACGTCGTGAATTATTCCACGAAACAGATGAAGAAATTAACAAAAAAGCGCACGCTA
TTTTCAAACATGGAATGACTCCAATTATATGTGTTGGTGAAACAGACGAAGAGCGTGAAAGTGGTAAAGCTAACGATGTT
GTAGGTGAGCAAGTTAAGAAAGCTGTTGCAGGTTTATCTGAAGATCAACTTAAATCAGTTGTAATTGCTTATGAACCAAT
CTGGGCAATCGGAACTGGTAAATCATCAACATCTGAAGATGCAAATGAAATGTGTGCATTTGTACGTCAAACTATTGCTG
ACTTATCAAGCAAAGAAGTATCAGAAGCAACTCGTATTCAATATGGTGGTAGTGTTAAACCTAACAACATTAAAGAATAC
ATGGCACAAACTGATATTGATGGGGCATTAGTAGGTGGCGCATCACTTAAAGTTGAAGATTTCGTACAATTGTTAGAAGG
TGCAAAATAATCATGGCTAAGAAACCAACTGCGTTAATTATTTTAGATGGTTTTGCGAACCGCGAAAGCGAACATGGTAA
TGCGGTAAAATTAGCAAACAAGCCTAATTTTGATCGTTATTACAACAAATATCCAACGACTCAAATCGAAGCGAGTGGCT
TA
>pta
ACGATAAAATTATGATTACAAATTGGTGACGTGGCATTATGAAATAAAATGGCGTATAATTATACCGTGAATGATTAATA
AGATTTATATTACAGGAGGACATTATGGCTGATTTATTAAATGTATTAAAAGACAAACTTTCTGGTAAAAACGTTAAAAT
CGTATTACCTGAAGGAGAGGACGAACGTGTTCTAACAGCTGCAACACAATTACAAGCAACAGATTATGTTACACCAATCG
TGTTAGGTGATGAGACTAAGGTTCAATCTTTAGCGCAAAAACTTGATCTTGATATTTCTAATATTGAATTAATTAATCCT
GCGACAAGTGAATTGAAAGCTGAATTAGTTCAATCATTTGTTGAACGACGTAAAGGTAAAGCGACTGAAGAACAAGCACA
AGAATTATTAAACAATGTGAACTACTTCGGTACAATGCTTGTTTATGCTGGTAAAGCAGATGGTTTAGTTAGTGGTGCAG
CACATTCAACAGGCGACACTGTGCGTCCAGCTTTACAAATCATCAAAACGAAACCAGGTGTATCAAGAACATCAGGTATC
TTCTTTATGATTAAAGGTGATGAACAATACATCTTTGGTGATTGTGCAATCAATCCAGAACTTGATTCACAAGGACTTGC
AGAAATTGCAGTAGAAAGTGCAAAATCAGCATTAAGCTTTGGCATGGATCCAAAAGTTGCAATGTTAAGCTTTTCAACAA
AAGGGTCTGCTAAATCAGACGACGTGACAAAAGTTCAAGAAGCTGTCAAATTAGCACAACAAAAAGCTGAAGAAGAAAAA
TTAGAAGCAATCATTGATGGCGAATTCCAATTTGATGCTGCGATTGTACCAGGTGTTGCTGAGAAAAAAGCGCC
>gmk
GCAATATAACGATATTGTTAGACTTAATAGAAATTATGGCATGCAATTTCAAATATGCTATACAATATAAAGAACAATGT
GATATCATATTTAAATAATAGAAGATTAGCTTAGAGAGGTCGTAAGGCATGGATAATGAAAAAGGATTGTTAATCGTTTT
ATCAGGACCATCTGGAGTAGGTAAAGGTACTGTTAGAAAACGAATATTTGAAGATCCAAGTACATCATATAAGTATTCTA
TTTCAATGACAACACGTCAAATGCGTGAAGGTGAAGTTGATGGCGTAGATTACTTTTTTAAAACTAGGGATGCGTTTGAA
GCTTTAATCAAAGATGACCAATTTATAGAATATGCTGAATATGTAGGCAACTATTATGGTACACCAGTTCAATATGTTAA
AGATACAATGGACGAAGGTCATGATGTATTTTTAGAAATTGAAGTAGAAGGTGCAAAGCAAGTTAGAAAGAAATTTCCAG
ATGCGCTATTTATTTTCTTAGCACCTCCAAGTTTAGAACACTTGAGAGAGCGATTAGTAGGTAGAGGAACAGAATCTGAT
GAGAAAATACAAAGTCGTATTAACGAAGCGCGTAAAGAAGTTGAAATGATGAATTTATACGATTACGTTGTAGTTAATGA
TGAAGTAGAACTTGCGAAGAATAGAATTCAATGTATTGTAGAAGCTGAGCACTTAAAAAGAGAGCGCGTAGAAGCTAAGT
ATAGAAAAATGATTTTGGAGGCTAAAAAATAATGTTAAATCCACCTTTAAACCAATTAACGTCACAAATTAAATCAAAGT
ATTTAATTGCAACAACT
>glpF
CAAATATAATAAAAGTTAATACATAGAATAGAGACGGGAGATTTCTACGAGCCAAACTGCTAGTGTAGGAATCTCTTTGT
CTTTTTGGGAGGACATTTAATATGAATGTATATTTAGCAGAATTCCTAGGAACTGCAATCTTAATCCTTTTTGGTGGTGG
CGTTTGTGCCAATGTCAATTTAAAGAGAAGTGCTGCGAATGGTGCTGATTGGATTGTCATCACAGCTGGATGGGGATTAG
CGGTTACAATGGGTGTGTTTGCTGTCGGTCAATTCTCAGGTGCACATTTAAACCCAGCGGTGTCTTTAGCTCTTGCATTA
GACGGAAGTTTTGATTGGTCATTAGTTCCTGGTTATATTGTTGCTCAAATGTTAGGTGCAATTGTCGGAGCAACAATTGT
ATGGTTAATGTACTTGCCACATTGGAAAGCGACAGAAGAAGCTGGCGCGAAATTAGGTGTTTTCTCTACAGCACCGGCTA
TTAAGAATTACTTTGCCAACTTTTTAAGTGAGATTATCGGAACAATGGCATTAACTTTAGGTATTTTATTTATCGGTGTA
AACAAAATTGCCGATGGTTTAAATCCTTTAATTGTCGGAGCATTAATTGTTGCAATCGGATTAAGTTTAGGCGGTGCTAC
TGGTTATGCAATCAACCCAGCACGTGATTTAGGTCCGAGAATTGCACATGCGATTTTACCAATAGCTGGTAAAGGTGGTT
CAAATTGGTCATATGCAATCGTTCCTATCTTAGGACCAATTGCCGGTGGTTTATTAGGTGCAGTGGTATACGCTGTATTT
TATAAACATACATTTAATATTGGTTGTGCAATTGCAATTGTTGTAGTTATTATTACTTTGATTTT
>aroE
CCCTGGTAAACAAACATATCTAAGCCATTATAAATATGGTTTCCCTTGCGCTCTGCTTCCTCTAAAATAGGTGTTTTATA
CGGTATATAAACAATATCACTCATTAAAGTATTGGGAGAAAGATGCTTTAAATTAATAATACTTTCGTTATTTCCAGCCA
TACCCGCTGGTGTTGTATTAATAACGATATCGAATTCAGCAATTTTAATTCTTTAGGATTAGATGATACTTATGAAGCTT
TAAATATTCCAATTGAAGATTTTCATTTAATTAAAGAAATTATTTCGAAAAAAGAATTAGATGGCTTTAATATCACAATT
CCTCATAAAGAACGTATCATACCGTATTTAGATCATGTTGATGAACAAGCGATTAATGCAGGTGCAGTTAACACTGTTTT
GATAAAAGATGACAAGTGGATAGGGTATAATACAGATGGTATTGGTTATGTTAAAGGATTGCACAGCGTTTATCCAGATT
TAGAAAATGCATACATTTTAATTTTGGGCGCAGGTGGTGCAAGTAAAGGTATTGCTTATGAATTAGCAAAATTTGTAAAG
CCCAAATTAACTGTTGCGAATAGAACGATGGCTCGTTTTGAATCTTGGAATTTAAATATAAACCAAATTTCATTAGCAGA
TGCTGAAAAGTATTTATGCTCTATGCATAACGGGCGACAAGGAATGTGAAATAGGATTTCCTATAACTGCAAATTTCATT
TTTTTAATCACCTTATAAAATAGAATTTCTTAATACAACATCAACATTTTTAGGAACACGAACGATTACTTTAGCCCCTG
GTCCTATAGTTATAAAGCCTAGACCAGAGATCATAACATCGCGTTTCTCTTTGCCT
>arcC
ATTTTTGGCAACATCGATCCTTCCACAAACTTACCTTGTGCCGCGTATTTTTTCAGTGTTGCTACATCAATATCATCGATT
TGTTGTTGATTAGGTTCATTAAAGTTAATAAATACATTTTCTACATTCGTAAGAATCATTAAGGTATCTGCTTCAATCAGC
GTTGCTAATTTCTCACTAGCAAAATCTTTATCTATAACTTATTAATCCAACAAGCTAAATCGAACAGTGACACAACGCCGG
CAATGCCATTGGATACTTGTGGTGCAATGTCACAGGGTATGATAGGCTATTGGTTGGAAACTGAAATCAATCGCATTTTAA
CTGAAATGAATAGTGATAGAACTGTAGGCACAATCGTTACACGTGTGGAAGTAGATAAAGATGATCCACGATTCAATAACC
CAACCAAACCAATTGGTCCTTTTTATACGAAAGAAGAAGTTGAAGAATTACAAAAAGAACAGCCAGACTCAGTCTTTAAAG
AAGATGCAGGACGTGGTTATAGAAAAGTAGTTGCGTCACCACTACCTCAATCTATACTAGAACACCAGTTAATTCGAACTT
TAGCAGACGGTAAAAATATTGTCATTGCATGCGGTGGTGGCGGTATTCCAGTTATAAAAAAAGAAAATACCTATGAAGGTG
TTGAAGCGACTTCCAATTTGTGGACCATTACCATGTGAAATCACAATACGCGCTGGTGAATCAAATAAAGGTTTAAGGTTT
TGCATCGCACATCTAATAGCTGTTTGTTGTGCTTCAGCTGTTGCTTCTGTTGTCTGTATCGCATTACCGCCTAATGCAATG
ACAATTTTCTCTTTCATATTTTTGTCGCTCCTTTTAAAAAACATTT
76 changes: 58 additions & 18 deletions ReMatCh/modules/rematch_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
import functools
import sys
import pickle
from . import utils

# https://chrisyeh96.github.io/2017/08/08/definitive-guide-python-imports.html#case-2-syspath-could-change
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
import utils


def index_fasta_samtools(fasta, region_none, region_outfile_none, print_comand_true):
Expand Down Expand Up @@ -41,9 +41,13 @@ def mapping_bowtie2(fastq_files, reference_file, threads, outdir, num_map_loc,
run_successfully = index_sequence_bowtie2(reference_file, threads)

if run_successfully:
command = ['bowtie2', '-k', str(num_map_loc), '-q', bowtie_algorithm, '--threads', str(threads), '-x',
command = ['bowtie2', '', '', '-q', bowtie_algorithm, '--threads', str(threads), '-x',
reference_file, '', '--no-unal', '', '-S', sam_file]

if num_map_loc is not None and num_map_loc > 1:
command[1] = '-k'
command[2] = str(num_map_loc)

if len(fastq_files) == 1:
command[9] = '-U ' + fastq_files[0]
elif len(fastq_files) == 2:
Expand Down Expand Up @@ -181,7 +185,7 @@ def parallelized_recode_soft_clipping(line_collection, pickle_file, soft_clip_ba
soft_clip_cigar_flag_recode):
lines_sam = []
for line in line_collection:
line = line.splitlines()[0]
line = line.rstrip('\r\n')
if len(line) > 0:
if line.startswith('@'):
lines_sam.append(line)
Expand Down Expand Up @@ -313,39 +317,54 @@ def create_vcf(bam_file, sequence_to_analyse, outdir, counter, reference_file):

# Read vcf file
class Vcf:
def __init__(self, vcf_file):
self.vcf = open(vcf_file, 'rtU')
def __init__(self, vcf_file, encoding=None, newline=None):
self.vcf = open(vcf_file, 'rt', encoding=encoding, newline=newline)
self.line_read = self.vcf.readline()
self.contigs_info_dict = {}
while self.line_read.startswith('#'):
if self.line_read.startswith('##contig=<ID='):
seq = self.line_read.split('=')[2].split(',')[0]
seq_len = self.line_read.split('=')[3].split('>')[0]
self.contigs_info_dict[seq] = int(seq_len)
self.line_read = self.vcf.readline()
self.line = self.line_read

def readline(self):
self.line_stored = self.line
line_stored = self.line
self.line = self.vcf.readline()
return self.line_stored
return line_stored

def close(self):
self.vcf.close()

def get_contig_legth(self, contig):
return self.contigs_info_dict[contig]


def get_variants(gene_vcf):
def get_variants(gene_vcf, seq_name, encoding=None, newline=None):
variants = {}

vfc_file = Vcf(gene_vcf)
vfc_file = Vcf(vcf_file=gene_vcf, encoding=encoding, newline=newline)
line = vfc_file.readline()
counter = 1
while len(line) > 0:
fields = line.splitlines()[0].split('\t')
fields = line.rstrip('\r\n').split('\t')
if len(fields) > 0:
fields[1] = int(fields[1])

info_field = {}
for i in fields[7].split(';'):
i = i.split('=')
if len(i) > 1:
info_field[i[0]] = i[1]
try:
for i in fields[7].split(';'):
i = i.split('=')
if len(i) > 1:
info_field[i[0]] = i[1]
else:
info_field[i[0]] = None
except IndexError:
if counter > vfc_file.get_contig_legth(contig=seq_name):
break
else:
info_field[i[0]] = None
raise IndexError

format_field = {}
format_field_name = fields[8].split(':')
Expand All @@ -361,7 +380,15 @@ def get_variants(gene_vcf):
else:
variants[fields[1]] = {0: fields_to_store}

line = vfc_file.readline()
try:
line = vfc_file.readline()
except UnicodeDecodeError:
if counter + 1 > vfc_file.get_contig_legth(contig=seq_name):
break
else:
raise UnicodeDecodeError

counter += 1
vfc_file.close()

return variants
Expand Down Expand Up @@ -781,7 +808,7 @@ def get_coverage(gene_coverage):

with open(gene_coverage, 'rtU') as reader:
for line in reader:
line = line.splitlines()[0]
line = line.rstrip('\r\n')
if len(line) > 0:
line = line.split('\t')
coverage[int(line[1])] = int(line[2])
Expand Down Expand Up @@ -924,7 +951,20 @@ def analyse_sequence_data(bam_file, sequence_information, outdir, counter, refer
compute_genome_coverage_data(bam_file, sequence_information['header'], outdir, counter)

if run_successfully:
variants = get_variants(gene_vcf)
try:
variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
encoding=sys.getdefaultencoding())
except UnicodeDecodeError:
try:
print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
' "utf_8" encoding: {}'.format(gene_vcf))
variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
encoding='utf_8')
except UnicodeDecodeError:
print('It was found an enconding error while parsing the following VCF, but lets try forcing it to'
' "latin_1" encoding: {}'.format(gene_vcf))
variants = get_variants(gene_vcf=gene_vcf, seq_name=sequence_information['header'],
encoding='latin_1')

coverage = get_coverage(gene_coverage)

Expand Down
14 changes: 8 additions & 6 deletions ReMatCh/modules/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import pickle
import traceback
from traceback import format_exception as traceback_format_exception
import shlex
import subprocess
from threading import Timer
import shutil
import time
import functools
from functools import wraps as functools_wraps
import os.path
import sys

Expand Down Expand Up @@ -221,7 +221,7 @@ def run_time(start_time):


def timer(function, name):
@functools.wraps(function)
@functools_wraps(function)
def wrapper(*args, **kwargs):
print('\n' + 'RUNNING {0}\n'.format(name))
start_time = time.time()
Expand Down Expand Up @@ -254,7 +254,7 @@ def extract_variable_from_pickle(pickleFile):


def trace_unhandled_exceptions(func):
@functools.wraps(func)
@functools_wraps(func)
def wrapped_func(*args, **kwargs):
try:
func(*args, **kwargs)
Expand All @@ -263,8 +263,10 @@ def wrapped_func(*args, **kwargs):
print(e)

exc_type, exc_value, exc_tb = sys.exc_info()
# print(exc_value)
print(''.join(traceback.format_exception(exc_type, exc_value, exc_tb)))
print(''.join(traceback_format_exception(exc_type, exc_value, exc_tb)))

raise exc_type(exc_value)

return wrapped_func


Expand Down
27 changes: 15 additions & 12 deletions ReMatCh/rematch.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def get_list_ids_from_file(file_list_ids):

with open(file_list_ids, 'rtU') as lines:
for line in lines:
line = line.splitlines()[0]
line = line.rstrip('\r\n')
if len(line) > 0:
list_ids.append(line)

Expand All @@ -119,7 +119,7 @@ def get_taxon_run_ids(taxon_name, outputfile):
run_ids = []
with open(outputfile, 'rtU') as reader:
for line in reader:
line = line.splitlines()[0]
line = line.rstrip('\r\n')
if len(line) > 0:
if not line.startswith('#'):
line = line.split('\t')
Expand Down Expand Up @@ -358,18 +358,21 @@ def run_rematch(args):
args.softClip_recodeRun = 'first'

if args.reference is None:
reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
args.extraSeq = 200
if reference_file is None:
print('It was not found provided MLST scheme sequences for ' + args.mlst)
print('Trying to obtain reference MLST sequences from PubMLST')
if len(mlst_sequences) > 0:
reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
args.extraSeq = 0
if args.mlst is not None:
reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
args.extraSeq = 200
if reference_file is None:
print('It was not found provided MLST scheme sequences for ' + args.mlst)
print('Trying to obtain reference MLST sequences from PubMLST')
if len(mlst_sequences) > 0:
reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
args.extraSeq = 0
else:
sys.exit('It was not possible to download MLST sequences from PubMLST!')
else:
sys.exit('It was not possible to download MLST sequences from PubMLST!')
print('Using provided scheme as referece: ' + reference_file)
else:
print('Using provided scheme as referece: ' + reference_file)
sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"')
else:
reference_file = os.path.abspath(args.reference.name)

Expand Down

0 comments on commit 4ff4508

Please sign in to comment.