From ee04462713e6953fa196767f28018cb417b6360e Mon Sep 17 00:00:00 2001 From: saramonzon Date: Sat, 27 Oct 2018 12:44:27 +0200 Subject: [PATCH 1/4] Change version in script. Fix name of program in help --- taranis.py | 266 ++++++++++++++++++++++++++--------------------------- 1 file changed, 133 insertions(+), 133 deletions(-) diff --git a/taranis.py b/taranis.py index d67bfe8..0d3fd2e 100644 --- a/taranis.py +++ b/taranis.py @@ -28,7 +28,7 @@ import subprocess #from subprocess import check_output -import shutil +import shutil def open_log(log_name): working_dir = os.getcwd() @@ -56,7 +56,7 @@ def check_program_is_exec_version (program, version, logger): version_str= str(subprocess.check_output([program , '-version'])) if version_str == "b''" : version_str = subprocess.getoutput( str (program + ' -version')) - + tmp_re = re.search(r'.*: (\d.+)\.\d\+',version_str) present_version = float(tmp_re.group(1)) #if not re.search(version, version_str): @@ -68,24 +68,24 @@ def check_program_is_exec_version (program, version, logger): else: logger.info('Cannot find %s installed on your system', program) return False - + def check_prerequisites (logger): pre_requisite_list = [['blastp', '2.6'], ['makeblastdb' , '2.6']] - # check if blast is installed and has the minimum version + # check if blast is installed and has the minimum version for program, version in pre_requisite_list : if not check_program_is_exec_version (program , version, logger): return False return True def check_arg(args=None): - - parser = argparse.ArgumentParser(prog = 'get_subset', description="This program will make the Allele Calling using a predefined core Schema.") - + + parser = argparse.ArgumentParser(prog = 'taranis.py', description="This program will make the Allele Calling using a predefined core Schema.") + parser.add_argument('-coregenedir', help = 'Directory where the core gene files are located ') parser.add_argument('-inputdir', help ='Directory where are located the sample fasta files') parser.add_argument('-outputdir', help = 'Directory where the result files will be stored') - parser.add_argument('-cpus', required= False, help = 'Number of CPUS to be used in the program. Default is 3.', default = 3) + #parser.add_argument('-cpus', required= False, help = 'Number of CPUS to be used in the program. Default is 3.', default = 3) parser.add_argument('-updateschema' , required=False, help = 'Create a new schema with the new locus found. Default is True.', default = True) parser.add_argument('-percentlength', required=False, help = 'Allowed length percentage to be considered as ASM or ALM. Outside of this limit it is considered as LNF Default is 20.', default = 20) return parser.parse_args() @@ -112,7 +112,7 @@ def write_first_allele_seq(file_sequence, store_dir, logger): exit (0) first_record = SeqIO.parse(file_sequence, "fasta").__next__() # build the fasta file name to store under first_allele_firectory - fasta_file = os.path.join(full_path_first_allele, f_name) + fasta_file = os.path.join(full_path_first_allele, f_name) SeqIO.write(first_record, fasta_file, "fasta") return fasta_file @@ -129,14 +129,14 @@ def create_blastdb (file_name, db_name,db_type, logger ): logger.info('Cannot create directory for local blast database on Core Gene file %s' , f_name[0]) print ('Error when creating the directory %s for blastdb. ', db_dir) exit(0) - + blast_command = ['makeblastdb' , '-in' , file_name , '-parse_seqids', '-dbtype', db_type, '-out' , output_blast_dir] blast_result = subprocess.run(blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if blast_result.stderr: logger.error('cannot create blast db for %s ', f_name[0]) logger.error('makeblastdb returning error code %s', blast_result.stderr) return False - + else: logger.info('Skeeping the blastdb creation for %s, as it is already exists', f_name[0]) return True @@ -152,10 +152,10 @@ def check_blast (reference_allele, sample_files, db_name, logger) : return False cline = NcbiblastnCommandline(db=blast_db, evalue=0.001, outfmt=5, max_target_seqs=10, max_hsps=10,num_threads=1, query=reference_allele) out, err = cline() - + psiblast_xml = StringIO(out) blast_records = NCBIXML.parse(psiblast_xml) - + for blast_record in blast_records: locationcontigs = [] for alignment in blast_record.alignments: @@ -208,7 +208,7 @@ def prepare_core_gene(core_gene_file_list, store_dir, logger): blast_dir = os.path.join(store_dir,'blastdb') logger.info('start preparation of core genes files') for fasta_file in core_gene_file_list: - + # parsing fasta file and get in the dictionary the id and the sequence fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger) f_name = os.path.basename(fasta_file).split('.') @@ -216,22 +216,22 @@ def prepare_core_gene(core_gene_file_list, store_dir, logger): # dump fasta file into pickle file with open (file_list[-1],'wb') as f: pickle.dump(fasta_file_parsed_dict, f) - # create the first allele for each core gene file + # create the first allele for each core gene file #### used only for gene annotation first_alleles_list.append(write_first_allele_seq(fasta_file, store_dir, logger)) alleles_len = [] for allele in fasta_file_parsed_dict : alleles_len.append(len(fasta_file_parsed_dict[allele])) - + schema_variability[f_name[0]]=list(set(alleles_len)) schema_statistics[f_name[0]]=[statistics.mode(alleles_len), min(alleles_len), max(alleles_len)] return file_list , first_alleles_list , schema_variability, schema_statistics - + def prepare_samples( sample_file_list, store_dir, logger): file_list = [] blast_dir = os.path.join(store_dir,'blastdb') - + for fasta_file in sample_file_list: # parsing fasta file and get in the dictionary the id and the sequence fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger) @@ -240,7 +240,7 @@ def prepare_samples( sample_file_list, store_dir, logger): # dump fasta file into pickle file with open (file_list[-1],'wb') as f: pickle.dump(fasta_file_parsed_dict, f) - + # create local blast db for each core gene fasta file if not create_blastdb(fasta_file, blast_dir, 'nucl' ,logger): print('Error when creating the blastdb for core gene files. Check log file for more information. \n ') @@ -265,21 +265,21 @@ def get_gene_annotation (annotation_file, annotation_dir, logger) : #annotation_result = subprocess.run (['prokka', annotation_file , '--outdir' , str(annotation_dir + 'prokka_anotation' + name_file[0]), annotation_result = subprocess.run (['prokka', annotation_file , '--outdir' , annotation_dir , '--prefix', name_file[0]]) - + return str(annotation_dir + 'prokka_anotation' + name_file[0] + name_file[0] + '.gff') def analize_annotation_files (in_file, logger) : examiner = GFF.GFFExaminer() - file_fh = open(in_file) + file_fh = open(in_file) datos = examiner.available_limits(in_file) file_fh.close() return True - + def get_inferred_allele_number(core_dict, logger): #This function will look for the highest locus number and it will return a safe high value - # that will be added to the schema database + # that will be added to the schema database logger.debug('running get_inferred_allele_number function') int_keys = [] for key in core_dict.keys(): @@ -288,8 +288,8 @@ def get_inferred_allele_number(core_dict, logger): digit_length = len(str(max_value)) # return any of the values ( 10000, 100000, 1000000 and so on ) according to the bigest allele number used return True #str 1 ( #'1'+ '0'*digit_length + 2) - - + + def get_stop_codon_index(seq, tga_stop_codon, indel_position) : stop_codons = ['TAA', 'TAG','TGA'] @@ -307,7 +307,7 @@ def get_stop_codon_index(seq, tga_stop_codon, indel_position) : return index else : continue - + return index #index +=3 # Stop condon not foun tn the sequence @@ -340,7 +340,7 @@ def get_snp (sample, query) : 'D' : 'acidic', 'E' :'acidic', 'H': 'basic' , 'K': 'basic' , 'R' : 'basic', '-': '-----', '*' : 'Stop codon'} - + snp_list = [] length = max(len(sample), len(query)) # normalize the lenght of the sample for the iteration @@ -353,8 +353,8 @@ def get_snp (sample, query) : # convert to Seq class to translate to protein seq_sample = Seq.Seq(sample) seq_query = Seq.Seq(query) - - + + for index in range (0, length -2, 3) : codon_seq = seq_sample[index : index + 3] codon_que = seq_query[index : index + 3] @@ -368,17 +368,17 @@ def get_snp (sample, query) : else: prot_que = '-' snp_list.append([str(index+1),str(codon_seq) + '/'+ str(codon_que), prot_seq + '/' + prot_que, prot_annotation[prot_seq] + ' / ' + prot_annotation[prot_que]]) - - - - + + + + return snp_list def convert_to_protein (sequence) : seq = Seq.Seq(sequence) protein = str(seq.translate()) - + return protein def nucleotide_to_protein_aligment (sample_seq, query_seq ) : @@ -399,7 +399,7 @@ def get_alignment_for_indels (blast_db_name, qseq) : cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=3) out, err = cline(stdin = qseq) psiblast_xml = StringIO(out) - blast_records = NCBIXML.parse(psiblast_xml) + blast_records = NCBIXML.parse(psiblast_xml) for blast_record in blast_records: for alignment in blast_record.alignments: for match in alignment.hsps: @@ -417,9 +417,9 @@ def get_aligments_for_deletions (sample_seq, query_seq): if not index_found : index = 0 values = format_alignment(*alignments[index]).split('\n') - + match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]] - + return match_alignment def create_summary (samples_matrix_dict, logger) : @@ -445,7 +445,7 @@ def create_summary (samples_matrix_dict, logger) : elif 'NIPH' in values : summary_dict[key]['NIPH'] += 1 elif 'NIPHEM' in values : - summary_dict[key]['NIPHEM'] += 1 + summary_dict[key]['NIPHEM'] += 1 elif 'PLOT' in values : summary_dict[key]['PLOT'] += 1 else: @@ -460,14 +460,14 @@ def create_summary (samples_matrix_dict, logger) : summary_dict[key]['Exact match'] +=1 except: logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] ) - else: + else: logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] ) summary_sample_list = [] for item in summary_heading_list : summary_sample_list.append(str(summary_dict[key][item])) summary_result_list.append(key + '\t' +'\t'.join(summary_sample_list)) - return summary_result_list - + return summary_result_list + @@ -497,7 +497,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, header_snp = ['Sample Name','Core Gene', 'Position','Sequence Sample/Schema','Protein in Sample/Schema', 'Annotation Sample / Schema'] header_protein = ['Sample Name','Core Gene', 'Protein in ' , 'Protein sequence'] header_match_alignment = ['Sample Name','Core Gene','Alignment', 'Sequence'] - + for core_file in core_gene_dict_files: print ( 'Analyzing core file : ', core_file) full_gene_list.append(os.path.basename(core_file)) @@ -506,7 +506,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, reference_query = os.path.join(reference_query_directory, str( core_name + '.fasta')) with open (core_file, 'rb') as core_f: core_dict = pickle.load(core_f) - logger.debug('load in memory the core file %s ', core_file) + logger.debug('load in memory the core file %s ', core_file) ref_query_parse = list (SeqIO.parse(reference_query, "fasta")) query_length = len(ref_query_parse[0].seq) #query_length_list =[] @@ -516,7 +516,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not allele_length in query_length_list : query_length_list.append(allele_length) ''' - + #create new_allele_dict to infer new_allele_dict ={} samples_inferred = [] @@ -526,39 +526,39 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, #with open (sample_file,'rb') as sample_f : # sample_dict = pickle.load(sample_f) #logger.debug('loaded in memory the sample file %s' , sample_file) - + sample_value = os.path.basename(sample_file) if not sample_value in samples_matrix_dict: - # initialize the sample list to add the number of alleles and the start, stop positions + # initialize the sample list to add the number of alleles and the start, stop positions samples_matrix_dict[sample_value] = [] matching_genes_dict[sample_value] = {} #intersection = set(core_dict.values()).intersection(gene_dict.values()) blast_db_name = os.path.join(blast_db_directory, os.path.basename(sample_file),os.path.basename(sample_file)) #blast_db_name = '/srv/project_wgmlst/samples_listeria/RA-L2073/blastdb' #reference_query = '/srv/project_wgmlst/lmo_test.fasta' - + cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 100, outfmt = blast_parameters , max_target_seqs=2, max_hsps=1,num_threads=1, query=reference_query) #cline = NcbiblastnCommandline(db=Gene_Blast_DB_name, evalue=0.001, outfmt=5, max_target_seqs=10, max_hsps=10,num_threads=1, query='/srv/project_wgmlst/seqSphere_listeria_cgMLST_test/targets/lmo0001.fasta') out, err = cline() out_lines = out.splitlines( ) - + if len (out_lines) > 0 : - + bigger_bitscore = 0 allele_found = {} - + for line in out_lines : values = line.split('\t') - + s_length = values[4] - + #if int(s_length) in query_length_list : if int(s_length) in schema_variability[core_name] : #if int(s_length) == int(query_length) : contig_id = values[1] gene_start = values[9] gene_end = values[10] - + allele_is_subset = False if len(allele_found) > 0 : # check if the new match is a subset of the previous allele found in blast @@ -567,7 +567,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.info('Found allele %s that starts or ends as the same position as %s ' , values[0], allele_found[allele][0]) allele_is_subset = True break - + if len(allele_found) == 0 or not allele_is_subset : contig_id_start = str(contig_id + '_'+ gene_start) allele_found[contig_id_start] = values @@ -575,7 +575,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, #qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq= values #bigger_bitscore = int(bitscore) bigger_bitscore = int(values[8]) - + if len(allele_found) > 1: # found paralogs in the sample for the core gene samples_matrix_dict[sample_value].append('NIPHEM') @@ -598,7 +598,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, else: matching_genes_dict[sample_value][sseqid].append([core_name, sstart,send,'+', 'NIPHEM']) continue - + elif len(allele_found) == 1 : ## look for possible paralogos by finding other alleles that identity is > 90% paralog_found ={} @@ -609,7 +609,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for line in out_lines : values = line.split('\t') s_length = values[4] - + #if int(s_length) == int(query_length) : if int(s_length) in schema_variability[core_name] : contig_id = values[1] @@ -633,7 +633,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, matching_genes_dict[sample_value][sseqid].append([core_name, sstart,send,'-','EXACT']) else: matching_genes_dict[sample_value][sseqid].append([core_name, sstart,send,'+','EXACT']) - + continue else: # paralog has been found @@ -645,7 +645,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, paralog_dict[sample_value] [core_name]= [] # merging the 2 dictionary paralog_matrix[sample_value] = {**allele_found, **paralog_found} - + for paralog in paralog_matrix[sample_value] : sstart = paralog_matrix[sample_value][paralog][9] send = paralog_matrix[sample_value][paralog] [10] @@ -685,17 +685,17 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, #print('mistmatch is : ', mismatch, 'gaps is : ', gapopen) #print('q start : ', qstart, ' q end : ', qend ) #print ('s start : ', sstart, ' s end', send) - - + + if int(s_length) in schema_variability[core_name] : - + #if int(s_length) == int(query_length) : # if equal then a new allele has been detected logger.info('Found new allele for core gene %s ', core_name) if not sample_value in inf_dict : inf_dict[sample_value] = {} if not core_name in inf_dict[sample_value] : inf_dict[sample_value] [core_name]= [] - ### adding new allele to the inferred allele list if it is not already included + ### adding new allele to the inferred allele list if it is not already included if not core_name in inferred_alleles_dict : inferred_alleles_dict[core_name] = [] if not sseq in inferred_alleles_dict[core_name] : @@ -705,7 +705,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, inferred_allele = 'INF_' + core_name + '_' + str(index_inferred) samples_matrix_dict[sample_value].append(inferred_allele) inf_dict[sample_value][core_name].append([qseqid,sseqid,bitscore,sstart, send, sseq]) - + if not sseqid in matching_genes_dict[sample_value] : matching_genes_dict[sample_value][sseqid] = [] if sstart > send : @@ -713,22 +713,22 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, else: matching_genes_dict[sample_value][sseqid].append([core_name, sstart,send,'+',inferred_allele]) continue - + #if int(s_length) / int(query_length) < ( 1- percentlength/100) or int(s_length) / int(query_length) > (1 + percentlength/100) : # samples_matrix_dict[sample_value].append('LNF') # logger.info('Locus found at sample %s, for gene %s', sample_value, core_name) - + # continue - + alleles_in_gene = list (SeqIO.parse(reference_query, "fasta")) for allele_item in alleles_in_gene : if allele_item.id == qseqid : break allele_sequence = allele_item.seq - - - - #if int(s_length) < min(schema_variability[core_name]) : + + + + #if int(s_length) < min(schema_variability[core_name]) : if int(s_length) < int(query_length) : ## check if the blast alignment could be clasified as PLOT seq_id_split = sseqid.split('_') @@ -747,9 +747,9 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, # print ('qlen is: ',qlen, ' seq_len is : ', length, 'query_reference_length is : ', query_length) # print('mistmatch is : ', mismatch, 'gaps is : ', gapopen) # print('q start : ', qstart, ' q end : ', qend ) - # print ('s start : ', sstart, ' s end', send) + # print ('s start : ', sstart, ' s end', send) + - query_direction = check_sequence_order(allele_sequence, logger) contig_file = os.path.join(inputdir,str(sample_value + '.fasta')) records = list (SeqIO.parse(contig_file, "fasta")) @@ -758,12 +758,12 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if record.id == sseqid : break accession_sequence = record.seq - + if allele_sequence.endswith ('TGA') or allele_sequence.startswith ('TCA') : tga_stop_codon = True else: tga_stop_codon = False - #it is assume that reference query is in reverse complement. + #it is assume that reference query is in reverse complement. #tga_stop_codon = allele_sequence.startswith('TCA') #bassed_added_start = int(qlen) -len(qseq) - int(qstart) if query_direction == 'reverse' : @@ -780,7 +780,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, sample_gene_sequence = accession_sequence[int(sstart) -1 : int(send) + 51] #else: # tga_stop_codon = allele_sequence.endswith ('TGA') or allele_sequence.startswith ('TCA') - # bassed_added_end = int(qlen) -len(qseq) + # bassed_added_end = int(qlen) -len(qseq) # if query_direction == 'forward' : # increasing the number of nucleotides in the sequence to find the stop codon when becomes longer because of the deletion # sample_gene_sequence = accession_sequence[int(send) - bassed_added_end - 51: int(sstart) ] @@ -790,9 +790,9 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, # sample_gene_sequence = accession_sequence[int(send) -1: int(sstart) + bassed_added_end + 51] stop_index = get_stop_codon_index(sample_gene_sequence, tga_stop_codon, int(qlen)- int(qstart)) if stop_index != False: - new_sequence_lenght = stop_index +3 + new_sequence_lenght = stop_index +3 new_sseq = str(sample_gene_sequence[0:new_sequence_lenght]) - + ### adding ASM allele to the asm_allele_matrix if it is not already include if not core_name in deletions_dict : deletions_dict[core_name] = [] @@ -807,21 +807,21 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, else: delete_allele = 'ALM_DELETE_' + core_name + '_' + str(index_delete) samples_matrix_dict[sample_value].append(delete_allele) - + if not sseqid in matching_genes_dict[sample_value] : matching_genes_dict[sample_value][sseqid] = [] if sstart > send : matching_genes_dict[sample_value][sseqid].append([core_name, str(int(sstart)-new_sequence_lenght -1), sstart,'-', delete_allele]) else: matching_genes_dict[sample_value][sseqid].append([core_name, sstart,str(int(sstart)+ new_sequence_lenght),'+', delete_allele]) - + ### add the deletion into deletion list if not core_name in list_deletions : list_deletions [core_name] = {} if not sample_value in list_deletions[core_name] : list_deletions[core_name][sample_value] = {} list_deletions[core_name][sample_value][delete_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_lenght), mismatch , gapopen, sstart, send, new_sseq ] - + if check_sequence_order(qseq, logger) == 'reverse' : qseq = str(allele_sequence.reverse_complement()) else: @@ -832,43 +832,43 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not sample_value in snp_dict[core_name] : snp_dict[core_name][sample_value] = [] snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) - - + + # execute again blast with the reference query the previous query found to get the aligment format to get the SNPs if not core_name in match_alignment_dict : match_alignment_dict[core_name] = {} if not sample_value in match_alignment_dict[core_name] : match_alignment_dict[core_name][sample_value] = get_aligments_for_deletions (new_sseq, str(qseq)) - - - - - - + + + + + + # convert the sequence to protein if not core_name in protein_dict : protein_dict[core_name] = {} if not sample_value in protein_dict[core_name] : protein_dict[core_name][sample_value] = [] - protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) + protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) else: logger.error('ERROR : Stop codon was not found for the core %s and the sample %s', core_name, sample_value) - + #if int(s_length) > int(query_length) : elif int(s_length) > max(schema_variability[core_name]) : - #elif int(s_length) > int(query_length) : + #elif int(s_length) > int(query_length) : #print ('there is a insertion of ', gapopen ,' bases in the sequence') #print ('qlen is: ',qlen, ' seq_len is : ', length, 'query_reference_length is : ', query_length) #query_seq = Seq.Seq(qseq) tga_stop_codon = qseq.endswith('TGA') sseq = sseq.replace('-','') stop_index = get_stop_codon_index(sseq, tga_stop_codon, qseq.find('-')) - + if stop_index != False: new_sequence_lenght = stop_index +3 ### adding ASM allele to the asm_allele_matrix if it is not already include new_sseq = sseq[0:new_sequence_lenght] - + if not core_name in insertions_dict : insertions_dict[core_name] = [] if not new_sseq in insertions_dict[core_name] : @@ -895,7 +895,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not sample_value in list_insertions[core_name] : list_insertions[core_name][sample_value] = {} list_insertions[core_name][sample_value][insert_allele] = [qseqid, sseqid, bitscore, str(query_length) , s_length, str(new_sequence_lenght), mismatch , gapopen, sstart, send, new_sseq ] - + if check_sequence_order(qseq, logger) == 'reverse' : qseq = str(allele_sequence.reverse_complement()) else: @@ -906,36 +906,36 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not sample_value in snp_dict[core_name] : snp_dict[core_name][sample_value] = [] snp_dict[core_name][sample_value] = get_snp(new_sseq, qseq) - - ''' + + ''' cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=3) out, err = cline(stdin = qseq) psiblast_xml = StringIO(out) - blast_records = NCBIXML.parse(psiblast_xml) + blast_records = NCBIXML.parse(psiblast_xml) for blast_record in blast_records: for alignment in blast_record.alignments: for match in alignment.hsps: match_alignment = [['sample', match.sbjct],['match', match.match], ['schema',match.query]] - ''' + ''' if not core_name in match_alignment_dict : match_alignment_dict[core_name] = {} if not sample_value in match_alignment_dict[core_name] : - match_alignment_dict[core_name][sample_value] = get_alignment_for_indels (blast_db_name, qseq) + match_alignment_dict[core_name][sample_value] = get_alignment_for_indels (blast_db_name, qseq) # index_not_match = [m.start() for m in re.finditer(' ', match.match)] - + # convert the sequence to protein if not core_name in protein_dict : protein_dict[core_name] = {} if not sample_value in protein_dict[core_name] : #protein_dict[core_name][sample_value] = [] protein_dict[core_name][sample_value] = nucleotide_to_protein_aligment(new_sseq, qseq ) - + # get the SNP from the alignment - - + + else: samples_matrix_dict[sample_value].append('ERROR ') - + print ('ERROR when looking the allele match for core gene ', core_name, 'at sample ', sample_value ) logger.debug ('matching genes = %s', matching_genes_dict) @@ -959,8 +959,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.debug ('list of proteins = %s' , protein_dict) logger.debug ('---------------------------------------------------') - - + + # print ( 'valor de retorno ', samples_matrix_dict) result_file = os.path.join ( outputdir, 'result.tsv') @@ -970,7 +970,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, out_fh.write ('Sample Name\t'+'\t'.join( full_gene_list) + '\n') for key in sorted (samples_matrix_dict): out_fh.write (key + '\t' + '\t'.join(samples_matrix_dict[key])+ '\n') - + # saving paralog sequence to file logger.info('Saving paralog information to file..') paralog_file = os.path.join(outputdir, 'paralog.tsv') @@ -980,7 +980,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for core in sorted (paralog_dict[sample]): for paralog in paralog_dict[sample][core] : paralog_fh.write(sample + '\t' + core + '\t' + '\t'.join (paralog) + '\n') - + # saving inferred alleles to file logger.info('Saving inferred alleles information to file..') inferred_file = os.path.join(outputdir, 'inferred_alleles.tsv') @@ -1007,7 +1007,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, #matching_alleles = '\t'.join (matching_genes_dict[samples][contig]) matching_alleles = '\t'.join (contig) matching_fh.write(samples + '\t' + contigs +'\t' + matching_alleles + '\n') - + # saving insertions bases in contigs to file logger.info('Saving insert bases information to file..') insertions_file = os.path.join(outputdir, 'insertions.tsv') @@ -1017,7 +1017,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for sample in list_insertions[c_gene] : for insert in list_insertions[c_gene][sample]: insertions_fh.write(c_gene + '\t' + sample + '\t' + insert + '\t' + '\t'.join(list_insertions[c_gene][sample][insert]) + '\n') - + # saving deletions bases in contigs to file logger.info('Saving deleted bases information to file..') deletions_file = os.path.join(outputdir, 'deletions.tsv') @@ -1027,7 +1027,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for sample in list_deletions[c_gene] : for delete in list_deletions[c_gene][sample]: deletions_fh.write(c_gene + '\t' + sample + '\t' + delete + '\t' + '\t'.join(list_deletions[c_gene][sample][delete]) + '\n') - + # saving PLOT to file logger.info('Saving PLOT information to file..') plot_file = os.path.join(outputdir, 'plot.tsv') @@ -1037,7 +1037,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for core in sorted (plot_dict[sample]): for plot in plot_dict[sample][core] : plot_fh.write(sample + '\t' + core + '\t' + '\t'.join (plot) + '\n') - + # saving SNPs to file logger.info('Saving SNPs information to file..') snp_file = os.path.join(outputdir, 'snp.tsv') @@ -1047,10 +1047,10 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, for sample in sorted (snp_dict[core]): for snp in snp_dict[core][sample] : snp_fh.write(core + '\t' + sample + '\t' + '\t'.join (snp) + '\n') - - + + match_alignment_dict - + logger.info('Saving matching alignment information to files..') alignment_dir = os.path.join(outputdir,'alignments') if os.path.exists(alignment_dir) : @@ -1064,8 +1064,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, match_alignment_fh.write( '\t'.join(header_match_alignment) + '\n') for match_align in match_alignment_dict[core][sample] : match_alignment_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(match_align) + '\n') - - + + # saving protein in a separated file logger.info('Saving protein information to files..') protein_dir = os.path.join(outputdir,'proteins') @@ -1080,8 +1080,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, protein_fh.write( '\t'.join(header_protein) + '\n') for protein in protein_dict[core][sample] : protein_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(protein) + '\n') - - + + ### create summary information logger.info('Saving summary information to file..') summary_result = create_summary (samples_matrix_dict, logger) @@ -1089,14 +1089,14 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, with open (summary_file , 'w') as summ_fh: for line in summary_result : summ_fh.write(line + '\n') - - - - + + + + return True if __name__ == '__main__' : - version = ' Taranis 0.0.3' + version = ' Taranis 0.3.0' if sys.argv[1] == '-v' or sys.argv[1] == '--version': print( version, '\n') exit (0) @@ -1115,7 +1115,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if not valid_core_gene_files : print ('There are not valid fasta files in ', arguments.coregenedir , ' directory. Check log file for more information ') exit(0) - + valid_sample_files = get_fasta_file_list(arguments.inputdir, logger) if not valid_sample_files : print ('There are not valid fasta files in ', arguments.inputdir , ' directory. Check log file for more information ') @@ -1143,7 +1143,7 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.info('Deleting the temporary directory to clean up the temporary files created') shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) exit(0) - + ####################################################### # Prepare the samples files ####################################################### @@ -1168,11 +1168,11 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, logger.info('Deleting the temporary directory to clean up the temporary files created') shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) exit(0) - + ### core schema annotation ######### ''' - - + + annotation_core_list =[] for annotation_core_file in core_first_alleles_files : annotation_core_list.append(get_gene_annotation (annotation_core_file, tmp_core_gene_dir, logger)) @@ -1180,8 +1180,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, annotation_sample_list =[] for annotation_sample_file in valid_sample_files : annotation_sample_list.append(get_gene_annotation (annotation_sample_file, tmp_samples_dir, logger)) - - + + ''' #reference_query_directory = os.path.join(tmp_core_gene_dir,'first_alleles') ########## Modified to get all alleles instead of the first one ############# @@ -1194,4 +1194,4 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, print ('script ends ') - \ No newline at end of file + From fa4422a41e7b2a16130875e9f9dcc3d467bdc7bb Mon Sep 17 00:00:00 2001 From: saramonzon Date: Sat, 27 Oct 2018 22:32:11 +0200 Subject: [PATCH 2/4] Added cpus to help --- taranis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taranis.py b/taranis.py index 0d3fd2e..e004937 100644 --- a/taranis.py +++ b/taranis.py @@ -85,7 +85,7 @@ def check_arg(args=None): parser.add_argument('-coregenedir', help = 'Directory where the core gene files are located ') parser.add_argument('-inputdir', help ='Directory where are located the sample fasta files') parser.add_argument('-outputdir', help = 'Directory where the result files will be stored') - #parser.add_argument('-cpus', required= False, help = 'Number of CPUS to be used in the program. Default is 3.', default = 3) + parser.add_argument('-cpus', required= False, help = 'Number of CPUS to be used in the program. Default is 3.', default = 3) parser.add_argument('-updateschema' , required=False, help = 'Create a new schema with the new locus found. Default is True.', default = True) parser.add_argument('-percentlength', required=False, help = 'Allowed length percentage to be considered as ASM or ALM. Outside of this limit it is considered as LNF Default is 20.', default = 20) return parser.parse_args() From b590bcd912d0f3106eddad61b52b14bebe58fb26 Mon Sep 17 00:00:00 2001 From: saramonzon Date: Mon, 29 Oct 2018 10:57:01 +0100 Subject: [PATCH 3/4] fix version check and lenght contig. --- taranis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) mode change 100644 => 100755 taranis.py diff --git a/taranis.py b/taranis.py old mode 100644 new mode 100755 index e004937..ad505fc --- a/taranis.py +++ b/taranis.py @@ -57,7 +57,7 @@ def check_program_is_exec_version (program, version, logger): if version_str == "b''" : version_str = subprocess.getoutput( str (program + ' -version')) - tmp_re = re.search(r'.*: (\d.+)\.\d\+',version_str) + tmp_re = re.search(r'.*: (\d.+)\.\d*\+',version_str) present_version = float(tmp_re.group(1)) #if not re.search(version, version_str): if present_version < float(version) : @@ -732,7 +732,8 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, if int(s_length) < int(query_length) : ## check if the blast alignment could be clasified as PLOT seq_id_split = sseqid.split('_') - length_sseqid = seq_id_split[3] + #length_sseqid = seq_id_split[3] + length_sseqid = len(sseq) if sstart == length_sseqid or send == length_sseqid: samples_matrix_dict[sample_value].append('PLOT') logger.info('PLOT found at sample %s, for gene %s', sample_value, core_name) From 74c114366de42d1ec5cc64d1be540c5c4d143970 Mon Sep 17 00:00:00 2001 From: saramonzon Date: Mon, 29 Oct 2018 13:32:10 +0100 Subject: [PATCH 4/4] Need to fix contig len --- taranis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taranis.py b/taranis.py index ad505fc..3d0f7fa 100755 --- a/taranis.py +++ b/taranis.py @@ -675,10 +675,10 @@ def allele_call_nucleotides ( core_gene_dict_files, reference_query_directory, continue for line in out_lines : values = line.split('\t') - if int(values[8]) > bigger_bitscore: + if float(values[8]) > bigger_bitscore: qseqid , sseqid , pident , qlen , s_length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq = values #print('q len seq is : ', len(qseq), ' s len seq is : ', len(sseq)) - bigger_bitscore = int(bitscore) + bigger_bitscore = float(bitscore) #cline = NcbiblastnCommandline(db=Gene_Blast_DB_name, evalue=0.001, outfmt=5, max_target_seqs=10, max_hsps=10,num_threads=1, query='/srv/project_wgmlst/seqSphere_listeria_cgMLST_test/targets/lmo0001.fasta')= values #print ( 'number of matches is : ', len(out_lines)) #print ('qlen is: ',qlen, ' seq_len is : ', length , 'query_reference_length is : ', query_length)