From 7b0f7f190fe3d1602aad779d117d6c317739d0bb Mon Sep 17 00:00:00 2001 From: luis Date: Sat, 20 Oct 2018 11:45:04 +0200 Subject: [PATCH] Including the SNP implementation and the alignment for Indels sequences --- taranis.py | 180 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 159 insertions(+), 21 deletions(-) diff --git a/taranis.py b/taranis.py index 5408574..5c42bd0 100644 --- a/taranis.py +++ b/taranis.py @@ -17,7 +17,8 @@ from Bio.SeqRecord import SeqRecord from Bio.Alphabet import generic_dna from Bio import Seq - +from Bio import pairwise2 +from Bio.pairwise2 import format_alignment from Bio.Blast.Applications import NcbiblastnCommandline from io import StringIO from Bio.Blast import NCBIXML @@ -319,7 +320,7 @@ def check_sequence_order(allele_sequence, logger) : return 'reverse' return False -def get_snp(sample, query) : +def get_snp_2(sample, query) : snp_list = [] for i in range(len(sample)): try: @@ -329,6 +330,46 @@ def get_snp(sample, query) : snp_list.append([str(i+1), '-', sample[i]]) return snp_list +def get_snp (sample, query) : + prot_annotation = {'S': 'polar' ,'T': 'polar' ,'Y': 'polar' ,'Q': 'polar' ,'N': 'polar' ,'C': 'polar' ,'S': 'polar' , + 'F': 'nonpolar' ,'L': 'nonpolar','I': 'nonpolar','M': 'nonpolar','P': 'nonpolar','V': 'nonpolar','A': 'nonpolar','W': 'nonpolar','G': 'nonpolar', + 'D' : 'acidic', 'E' :'acidic', + 'H': 'basic' , 'K': 'basic' , 'R' : 'basic', + '-': '-----', '*' : 'Stop codon'} + + snp_list = [] + length = max(len(sample), len(query)) + # normalize the lenght of the sample for the iteration + if len(sample) < length : + need_to_add = length - len(sample) + sample = sample + need_to_add * '-' + if len(query) < length : + need_to_add = length - len(query) + query = query + need_to_add * '-' + # convert to Seq class to translate to protein + seq_sample = Seq.Seq(sample) + seq_query = Seq.Seq(query) + + + for index in range (0, length -2, 3) : + codon_seq = seq_sample[index : index + 3] + codon_que = seq_query[index : index + 3] + if codon_seq != codon_que : + if str(codon_seq) != '---' : + prot_seq = str(codon_seq.translate()) + else: + prot_seq = '-' + if str(codon_que) != '---' : + prot_que = str(codon_que.translate()) + else: + prot_que = '-' + snp_list.append([str(index+1),str(codon_seq) + '/'+ str(codon_que), prot_seq + '/' + prot_que, prot_annotation[prot_seq] + ' / ' + prot_annotation[prot_que]]) + + + + + return snp_list + def convert_to_protein (sequence) : seq = Seq.Seq(sequence) @@ -336,6 +377,46 @@ def convert_to_protein (sequence) : return protein +def nucleotide_to_protein_aligment (sample_seq, query_seq ) : + aligment = [] + sample_prot = convert_to_protein(sample_seq) + query_prot = convert_to_protein(query_seq) + minimun_length = min(len(sample_prot), len(query_prot)) + for i in range(minimun_length): + if sample_prot[i] == query_prot[i] : + aligment.append('|') + else: + aligment.append(' ') + protein_alignment = [['sample', sample_prot],['match', ''.join(aligment)], ['schema', query_prot]] + return protein_alignment + +def get_alignment_for_indels (blast_db_name, qseq) : + #match_alignment =[] + cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=3) + out, err = cline(stdin = qseq) + psiblast_xml = StringIO(out) + blast_records = NCBIXML.parse(psiblast_xml) + for blast_record in blast_records: + for alignment in blast_record.alignments: + for match in alignment.hsps: + match_alignment = [['sample', match.sbjct],['match', match.match], ['schema',match.query]] + return match_alignment + + +def get_aligments_for_deletions (sample_seq, query_seq): + index_found = False + alignments = pairwise2.align.globalxx(sample_seq, query_seq) + for index in range(len(alignments)) : + if alignments[index][4] == len(query_seq) : + index_found = True + break + if not index_found : + index = 0 + values = format_alignment(*alignments[index]).split('\n') + + match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]] + + return match_alignment def create_summary (samples_matrix_dict, logger) : summary_dict = {} @@ -401,6 +482,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, plot_dict = {} snp_dict = {} protein_dict = {} + match_alignment_dict = {} blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' header_macthing_alleles_conting = ['Sample Name', 'Contig', 'Core Gene','start', 'stop', 'direction', 'codification'] header_paralogs = ['Sample Name','Core Gene', 'Allele','Contig','Bit Score', 'Start Seq', 'End Seq','Sequence'] @@ -408,8 +490,9 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, header_insertions = [ 'Core Gene', 'Sample Name' , 'Insertion item' ,'Allele', 'Contig', 'Bitscore', 'Query length' , 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence' ] header_deletions = [ 'Core Gene', 'Sample Name' , 'Deletion item' ,'Allele', 'Contig', 'Bitscore', 'Query length' , 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence' ] header_plot = ['Core Gene', 'Sample Name' , 'Allele','Contig','Bit Score', 'Start Seq', 'End Seq','Sequence'] - header_snp = ['Sample Name','Core Gene', 'Position','Value in Reference','Value in Sample'] + header_snp = ['Sample Name','Core Gene', 'Position','Sequence Sample/Schema','Protein in Sample/Schema', 'Annotation Sample / Schema'] header_protein = ['Sample Name','Core Gene', 'Protein in ' , 'Protein sequence'] + header_match_alignment = ['Sample Name','Core Gene','Alignment', 'Sequence'] for core_file in core_gene_dict_files: print ( 'Analyzing core file : ', core_file) @@ -422,11 +505,13 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.debug('load in memory the core file %s ', core_file) ref_query_parse = list (SeqIO.parse(reference_query, "fasta")) query_length = len(ref_query_parse[0].seq) - query_length_list =[] + #query_length_list =[] + ''' for allele in ref_query_parse : allele_length = len(allele.seq) if not allele_length in query_length_list : query_length_list.append(allele_length) + ''' #create new_allele_dict to infer new_allele_dict ={} @@ -463,7 +548,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, s_length = values[4] - if int(s_length) in query_length_list : + #if int(s_length) in query_length_list : + if int(s_length) in schema_variability[core_name] : #if int(s_length) == int(query_length) : contig_id = values[1] gene_start = values[9] @@ -520,7 +606,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, values = line.split('\t') s_length = values[4] - if int(s_length) == int(query_length) : + #if int(s_length) == int(query_length) : + if int(s_length) in schema_variability[core_name] : contig_id = values[1] gene_start = values[9] gene_end = values[10] @@ -637,8 +724,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, - - if int(s_length) < query_length : + #if int(s_length) < min(schema_variability[core_name]) : + if int(s_length) < int(query_length) : ## check if the blast alignment could be clasified as PLOT seq_id_split = sseqid.split('_') length_sseqid = seq_id_split[3] @@ -742,17 +829,30 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, snp_dict[core_name][sample_value] = [] snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) + + # execute again blast with the reference query the previous query found to get the aligment format to get the SNPs + if not core_name in match_alignment_dict : + match_alignment_dict[core_name] = {} + if not sample_value in match_alignment_dict[core_name] : + match_alignment_dict[core_name][sample_value] = get_aligments_for_deletions (new_sseq, str(qseq)) + + + + + + # convert the sequence to protein if not core_name in protein_dict : protein_dict[core_name] = {} if not sample_value in protein_dict[core_name] : protein_dict[core_name][sample_value] = [] - protein_dict[core_name][sample_value] = [['Sample',convert_to_protein(new_sseq)],['Schema', convert_to_protein(qseq)]] + protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) else: logger.error('ERROR : Stop codon was not found for the core %s and the sample %s', core_name, sample_value) #if int(s_length) > int(query_length) : - elif int(s_length) > int(query_length) : + elif int(s_length) > max(schema_variability[core_name]) : + #elif int(s_length) > int(query_length) : #print ('there is a insertion of ', gapopen ,' bases in the sequence') #print ('qlen is: ',qlen, ' seq_len is : ', length, 'query_reference_length is : ', query_length) #query_seq = Seq.Seq(qseq) @@ -771,7 +871,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, insertions_dict[core_name].append(new_sseq) ### find the index of ASM to include it in the sample matrix dict index_insert = insertions_dict[core_name].index(new_sseq) - if new_sequence_lenght < query_length : + #if new_sequence_lenght < query_length : + if new_sequence_lenght < min(schema_variability[core_name]) : insert_allele = 'ASM_INSERT_' + core_name + '_' + str(index_insert) else: insert_allele = 'AEM_INSERT_' + core_name + '_' + str(index_insert) @@ -795,18 +896,38 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, qseq = str(allele_sequence.reverse_complement()) else: qseq = str(allele_sequence) - # get the SNP for the delection - if not core_name in snp_dict : - snp_dict[core_name] = {} - if not sample_value in snp_dict[core_name] : - snp_dict[core_name][sample_value] = [] - snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) + # get the SNP for the delection + if not core_name in snp_dict : + snp_dict[core_name] = {} + if not sample_value in snp_dict[core_name] : + snp_dict[core_name][sample_value] = [] + snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) + + ''' + cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=3) + out, err = cline(stdin = qseq) + psiblast_xml = StringIO(out) + blast_records = NCBIXML.parse(psiblast_xml) + for blast_record in blast_records: + for alignment in blast_record.alignments: + for match in alignment.hsps: + match_alignment = [['sample', match.sbjct],['match', match.match], ['schema',match.query]] + ''' + if not core_name in match_alignment_dict : + match_alignment_dict[core_name] = {} + if not sample_value in match_alignment_dict[core_name] : + match_alignment_dict[core_name][sample_value] = get_alignment_for_indels (blast_db_name, qseq) + # index_not_match = [m.start() for m in re.finditer(' ', match.match)] + # convert the sequence to protein if not core_name in protein_dict : protein_dict[core_name] = {} if not sample_value in protein_dict[core_name] : - protein_dict[core_name][sample_value] = [] - protein_dict[core_name][sample_value] = [['Sample',convert_to_protein(new_sseq)],['Schema', convert_to_protein(qseq)]] + #protein_dict[core_name][sample_value] = [] + protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) + + # get the SNP from the alignment + else: samples_matrix_dict[sample_value].append('ERROR ') @@ -924,6 +1045,23 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, snp_fh.write(core + '\t' + sample + '\t' + '\t'.join (snp) + '\n') + match_alignment_dict + + logger.info('Saving matching alignment information to files..') + alignment_dir = os.path.join(outputdir,'alignments') + if os.path.exists(alignment_dir) : + shutil.rmtree(alignment_dir) + logger.info('deleting the alignment files from previous execution') + os.makedirs(alignment_dir) + for core in sorted(match_alignment_dict) : + for sample in sorted (match_alignment_dict[core]) : + match_alignment_file = os.path.join(alignment_dir, str('match_alignment_' + core + '_' + sample + '.txt')) + with open(match_alignment_file, 'w') as match_alignment_fh : + match_alignment_fh.write( '\t'.join(header_match_alignment) + '\n') + for match_align in match_alignment_dict[core][sample] : + match_alignment_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(match_align) + '\n') + + # saving protein in a separated file logger.info('Saving protein information to files..') protein_dir = os.path.join(outputdir,'proteins') @@ -933,7 +1071,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, os.makedirs(protein_dir) for core in sorted(protein_dict) : for sample in sorted (protein_dict[core]) : - protein_file = os.path.join(protein_dir, str('protein_' + core + '_' + sample + '.tsv')) + protein_file = os.path.join(protein_dir, str('protein_' + core + '_' + sample + '.txt')) with open(protein_file, 'w') as protein_fh : protein_fh.write( '\t'.join(header_protein) + '\n') for protein in protein_dict[core][sample] : @@ -954,7 +1092,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, return True if __name__ == '__main__' : - version = ' Taranis 0.0.1' + version = ' Taranis 0.0.3' if sys.argv[1] == '-v' or sys.argv[1] == '--version': print( version, '\n') exit (0)