diff --git a/.github/workflows/dockerhub_push_release.yml b/.github/workflows/dockerhub_push_release.yml deleted file mode 100644 index e8b6638..0000000 --- a/.github/workflows/dockerhub_push_release.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: deploy release -# This builds the docker image and pushes it to DockerHub -on: - release: - types: [published] -jobs: - push_dockerhub: - name: Push new Docker image to Docker Hub (release) - runs-on: ubuntu-latest - # Only run for the official repo, for releases and merged PRs - if: ${{ github.repository == 'BU-ISCIII/taranis' }} - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Build new docker image - run: docker build --no-cache . -t buisciii/taranis:${{ github.event.release.tag_name }} - - - name: Push Docker image to DockerHub (develop) - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push buisciii/taranis:${{ github.event.release.tag_name }} diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml new file mode 100644 index 0000000..9d043bb --- /dev/null +++ b/.github/workflows/python_lint.yml @@ -0,0 +1,35 @@ +name: python_lint + +on: + push: + paths: + - '**.py' + pull_request: + paths: + - '**.py' + +jobs: + flake8_py3: + runs-on: ubuntu-latest + steps: + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: 3.9.x + architecture: x64 + - name: Checkout repository + uses: actions/checkout@master + - name: Install flake8 + run: pip install flake8 + - name: Run flake8 + run: flake8 --ignore E501,W503,E203,W605 + + black_lint: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - name: Install black with Jupyter support + run: pip install "black[jupyter]" + - name: Check code lints with Black + uses: psf/black@stable diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index ed66541..0000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: tests ci -# This workflow runs the pipeline with the minimal test dataset to check that it completes any errors -on: - push: - branches: [develop] - pull_request_target: - branches: [develop] - release: - types: [published] - -jobs: - push_dockerhub: - name: Push new Docker image to Docker Hub (dev) - runs-on: ubuntu-latest - # Only run for the official repo, for releases and merged PRs - if: ${{ github.repository == 'BU-ISCIII/taranis' }} - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Build new docker image - run: docker build --no-cache .
-t buisciii/taranis:dev - - name: Push Docker image to DockerHub (develop) - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push buisciii/taranis:dev - run-tests: - name: Run tests - needs: push_dockerhub - runs-on: ubuntu-latest - steps: - - name: Run pipeline with test data - run: | - docker run buisciii/taranis:dev bash -c /opt/taranis/test/test.sh diff --git a/setup.py b/setup.py index 4b6fad4..eda9691 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -version = "2.2.0" +version = "3.0.0" with open("README.md") as f: readme = f.read() diff --git a/taranis/__main__.py b/taranis/__main__.py index d08cb47..a281777 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -52,7 +52,7 @@ def run_taranis(): ) # stderr.print("[green] `._,._,'\n", highlight=False) - __version__ = "2.1.0" + __version__ = "3.0.0" stderr.print( "\n" "[grey39] Taranis version {}".format(__version__), highlight=False ) @@ -166,6 +166,12 @@ def taranis_cli(verbose, log_file): default=False, help="Remove no CDS alleles from the schema.", ) +@click.option( + "--output-allele-annot/--no-output-allele-annot", + required=False, + default=True, + help="Get extended annotation for all alleles in locus", +) @click.option( "--genus", required=False, @@ -184,29 +190,41 @@ def taranis_cli(verbose, log_file): default="Genus", help="Use genus-specific BLAST databases for Prokka schema genes annotation (needs --genus). Default is False.", ) +@click.option( + "--cpus", + required=False, + multiple=False, + type=int, + default=1, + help="Number of CPUs used for execution", +) def analyze_schema( inputdir, output, remove_subset, remove_duplicated, remove_no_cds, + output_allele_annot, genus, species, usegenus, + cpus, ): schema_files = taranis.utils.get_files_in_folder(inputdir, "fasta") """ - schema_analyze = {} for schema_file in schema_files: schema_obj = taranis.analyze_schema.AnalyzeSchema(schema_file, output, remove_subset, remove_duplicated, remove_no_cds, genus, species, usegenus) - schema_analyze.update(schema_obj.analyze_allele_in_schema()) - - """ + schema_analyze = [] + schema_analyze.append(schema_obj.analyze_allele_in_schema()) + import pdb; pdb.set_trace() + _ = taranis.analyze_schema.collect_statistics(schema_analyze, output, output_allele_annot) + sys.exit(0) # for schema_file in schema_files: + """ results = [] start = time.perf_counter() - with concurrent.futures.ProcessPoolExecutor() as executor: + with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor: futures = [ executor.submit( taranis.analyze_schema.parallel_execution, @@ -224,10 +242,11 @@ def analyze_schema( # Collect results as they complete for future in concurrent.futures.as_completed(futures): results.append(future.result()) - _ = taranis.analyze_schema.collect_statistics(results, output) + _ = taranis.analyze_schema.collect_statistics(results, output, output_allele_annot) finish = time.perf_counter() print(f"Schema analyze finish in {round((finish-start)/60, 2)} minutes") + # Reference alleles @taranis_cli.command(help_priority=2) @click.option(
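The new `--cpus` option is threaded through to `ProcessPoolExecutor(max_workers=cpus)`, bounding how many worker processes the per-locus schema analysis may use. A minimal, self-contained sketch of the same submit/collect pattern, assuming a placeholder worker (`analyze_locus` and `run_parallel` are illustrative names, not the real `taranis.analyze_schema.parallel_execution` signature):

```python
import concurrent.futures


def analyze_locus(schema_file: str) -> dict:
    # Stand-in for the per-locus work each worker process performs.
    return {"locus": schema_file}


def run_parallel(schema_files: list, cpus: int = 1) -> list:
    results = []
    # max_workers=cpus caps the pool size, mirroring the new --cpus option.
    with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor:
        futures = [executor.submit(analyze_locus, f) for f in schema_files]
        # Collect results as they complete, as the CLI code above does.
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    return results
```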
diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py index 20ef08c..d9a1e99 100644 --- a/taranis/allele_calling.py +++ b/taranis/allele_calling.py @@ -5,12 +5,14 @@ import taranis.utils import taranis.blast + # import numpy import pandas as pd from pathlib import Path import pdb + log = logging.getLogger(__name__) stderr = rich.console.Console( stderr=True, force_terminal=taranis.utils.rich_force_colors(), ) + class AlleleCalling: def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder): self.prediction = prediction @@ -27,9 +30,25 @@ def __init__(self, prediction, sample_file, schema, reference_alleles, out_folde self.ref_alleles = reference_alleles self.out_folder = out_folder self.s_name = Path(sample_file).stem - self.blast_dir = os.path.join(out_folder,"blastdb") + self.blast_dir = os.path.join(out_folder, "blastdb") self.blast_sample = os.path.join(self.blast_dir, self.s_name) - self.blast_heading = ["qseqid", "sseqid", "pident", "qlen", "length", "mismatch", "gapopen", "evalue", "bitscore", "sstart", "send", "qstart", "qend", "sseq", "qseq"] + self.blast_heading = [ + "qseqid", + "sseqid", + "pident", + "qlen", + "length", + "mismatch", + "gapopen", + "evalue", + "bitscore", + "sstart", + "send", + "qstart", + "qend", + "sseq", + "qseq", + ] def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene): """_summary_ Args: query_seq (_type_): _description_ allele_name (_type_): _description_ sample_contig (_type_): _description_ schema_gene (_type_): _description_ - """ + """ s_alleles_blast = taranis.blast.Blast("nucl") ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles") query_path = os.path.join(self.out_folder, "tmp", allele_name) - # Write to file the sequence to find out the loci name that fully match + # Write the sequence to a file to find out the locus name that fully matches f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name) query_file = os.path.join(query_path, f_name) _ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir) - # Blast with sample sequence to find the allele in the schema + # Blast with sample sequence to find the allele in the schema seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100) pdb.set_trace() if len(seq_blast_match) >= 1: - # allele is named as NIPHEM - + # allele is named as NIPHEM + # Run a blast with this query sequence against the allele database # Create blast db with sample file - pass elif len(seq_blast_match) == 1: @@ -63,14 +81,13 @@ def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene) else: pass - - def search_alleles (self, ref_allele): + def search_alleles(self, ref_allele): allele_name = Path(ref_allele).stem - schema_gene = os.path.join(self.schema, allele_name + ".fasta") + schema_gene = os.path.join(self.schema, allele_name + ".fasta") allele_name = Path(ref_allele).stem # run blast with sample as db and reference allele as query sample_blast_match = self.sample_blast.run_blast(ref_allele) - if len(sample_blast_match) > 0 : + if len(sample_blast_match) > 0: pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match]) pd_lines.columns = self.blast_heading pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric) @@ -84,16 +101,17 @@ def search_alleles (self, ref_allele): # sel_row = np_lines[mask, :] = np_lines[mask, :] # query_seq = sel_row[0,14] sample_contig = sel_max["sseqid"] - abbr = self.assign_allele_type(query_seq, allele_name, sample_contig, schema_gene) + abbr = self.assign_allele_type( + query_seq, allele_name, sample_contig, schema_gene + ) else: # Sample does not have a reference allele to be matched # Keep LNF info # ver el codigo de espe - #lnf_tpr_tag() + # lnf_tpr_tag() pass pdb.set_trace() -
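Both arms of the `if len(seq_blast_match) >= 1` block above still `pass`, and the `>= 1` test makes the `== 1` branch unreachable, so the classification is clearly unfinished. Going by the NIPHEM comment here and the tags used in `allele_calling_old.py`, the rule being built appears to be: several 100%-identity hits mean paralogous exact matches, exactly one means a known allele, none means further analysis. A minimal sketch of that rule, assuming this reading (the helper name and return labels are hypothetical):

```python
def classify_exact_hits(num_exact_hits: int) -> str:
    # Assumed mapping, based on the NIPHEM/EXACT/LNF tags in this codebase.
    if num_exact_hits > 1:
        # Multiple perfect-identity BLAST hits: paralogous exact matches.
        return "NIPHEM"
    if num_exact_hits == 1:
        # Exactly one perfect hit: the sequence is a known schema allele.
        return "EXACT"
    # No perfect hit: needs further analysis (new allele, LNF, ...).
    return "UNRESOLVED"
```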
def analyze_sample(self): # Create blast db with sample file self.sample_blast = taranis.blast.Blast("nucl") @@ -107,4 +125,3 @@ def analyze_sample(self): pdb.set_trace() return - diff --git a/taranis/allele_calling_old.py b/taranis/allele_calling_old.py index 72d3294..e8be72f 100644 --- a/taranis/allele_calling_old.py +++ b/taranis/allele_calling_old.py @@ -28,16 +28,24 @@ import plotly.graph_objects as go -def check_blast (reference_allele, sample_files, db_name, logger) : ## N +def check_blast(reference_allele, sample_files, db_name, logger): ## N for s_file in sample_files: - f_name = os.path.basename(s_file).split('.') + f_name = os.path.basename(s_file).split(".") dir_name = os.path.dirname(s_file) - blast_dir = os.path.join(dir_name, db_name,f_name[0]) - blast_db = os.path.join(blast_dir,f_name[0]) - if not os.path.exists(blast_dir) : - logger.error('Blast db folder for sample %s does not exist', f_name) + blast_dir = os.path.join(dir_name, db_name, f_name[0]) + blast_db = os.path.join(blast_dir, f_name[0]) + if not os.path.exists(blast_dir): + logger.error("Blast db folder for sample %s does not exist", f_name) return False - cline = NcbiblastnCommandline(db=blast_db, evalue=0.001, outfmt=5, max_target_seqs=10, max_hsps=10,num_threads=1, query=reference_allele) + cline = NcbiblastnCommandline( + db=blast_db, + evalue=0.001, + outfmt=5, + max_target_seqs=10, + max_hsps=10, + num_threads=1, + query=reference_allele, + ) out, err = cline() psiblast_xml = StringIO(out) @@ -51,16 +59,18 @@ def check_blast (reference_allele, sample_files, db_name, logger) : ## N alleleMatchid = int((blast_record.query_id.split("_"))[-1]) return True + # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # # Parse samples and core genes schema fasta files to dictionary # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # -def parsing_fasta_file_to_dict (fasta_file, logger): + +def parsing_fasta_file_to_dict(fasta_file, logger): fasta_dict = {} fasta_dict_ordered = {} for contig in SeqIO.parse(fasta_file, "fasta"): fasta_dict[str(contig.id)] = str(contig.seq.upper()) - logger.debug('file %s parsed to dictionary', fasta_file) + logger.debug("file %s parsed to dictionary", fasta_file) for key in sorted(list(fasta_dict.keys())): fasta_dict_ordered[key] = fasta_dict[key] @@ -71,9 +81,11 @@ def parsing_fasta_file_to_dict (fasta_file, logger): # Get core genes schema info before allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # -#def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, logger): -def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, species, usegenus, logger): +# def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, logger): +def prepare_core_gene( + core_gene_file_list, store_dir, ref_alleles_dir, genus, species, usegenus, logger +): ## Initialize dict for keeping id-allele, quality, length variability, length statistics and annotation info for each schema core gene alleles_in_locus_dict = {} schema_quality = {} @@ -81,13 +93,11 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s schema_variability = {} schema_statistics = {} - ## Process each schema core gene - blast_dir = os.path.join(store_dir,'blastdb') - logger.info('start preparation of core genes files') + blast_dir = os.path.join(store_dir, "blastdb") + logger.info("start preparation of core genes files") for fasta_file in core_gene_file_list: - - f_name = os.path.basename(fasta_file).split('.') + f_name =
os.path.basename(fasta_file).split(".") # Parse core gene fasta file and keep id-sequence info in dictionary fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger) @@ -96,8 +106,8 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s alleles_in_locus_dict[f_name[0]] = fasta_file_parsed_dict # dump fasta file into pickle file - #with open (file_list[-1],'wb') as f: - # pickle.dump(fasta_file_parsed_dict, f) + # with open (file_list[-1],'wb') as f: + # pickle.dump(fasta_file_parsed_dict, f) # Get core gene alleles quality locus_quality = check_core_gene_quality(fasta_file, logger) @@ -106,63 +116,90 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s schema_quality[f_name[0]] = locus_quality # Get gene and product annotation for core gene using reference allele(s) - ref_allele = os.path.join(ref_alleles_dir, f_name[0] + '.fasta') + ref_allele = os.path.join(ref_alleles_dir, f_name[0] + ".fasta") - gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, genus, species, usegenus, logger) - #gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, logger) + gene_annot, product_annot = get_gene_annotation( + ref_allele, store_dir, genus, species, usegenus, logger + ) + # gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, logger) if f_name[0] not in annotation_core_dict.keys(): annotation_core_dict[f_name[0]] = {} annotation_core_dict[f_name[0]] = [gene_annot, product_annot] # Get core gene alleles length to keep length variability and statistics info alleles_len = [] - for allele in fasta_file_parsed_dict : + for allele in fasta_file_parsed_dict: alleles_len.append(len(fasta_file_parsed_dict[allele])) - #alleles_in_locus = list (SeqIO.parse(fasta_file, "fasta")) ## parse - #for allele in alleles_in_locus : ## parse - #alleles_len.append(len(str(allele.seq))) ## parse + # alleles_in_locus = list (SeqIO.parse(fasta_file, "fasta")) ## parse + # for allele in alleles_in_locus : ## parse + # alleles_len.append(len(str(allele.seq))) ## parse - schema_variability[f_name[0]]=list(set(alleles_len)) + schema_variability[f_name[0]] = list(set(alleles_len)) if len(alleles_len) == 1: stdev = 0 else: stdev = statistics.stdev(alleles_len) - schema_statistics[f_name[0]]=[statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)] - - return alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality + schema_statistics[f_name[0]] = [ + statistics.mean(alleles_len), + stdev, + min(alleles_len), + max(alleles_len), + ] + + return ( + alleles_in_locus_dict, + annotation_core_dict, + schema_variability, + schema_statistics, + schema_quality, + ) # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # # Get Prodigal training file from reference genome for samples gene prediction # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # -def prodigal_training(reference_genome_file, prodigal_dir, logger): - f_name = os.path.basename(reference_genome_file).split('.')[0] - prodigal_train_dir = os.path.join(prodigal_dir, 'training') +def prodigal_training(reference_genome_file, prodigal_dir, logger): + f_name = os.path.basename(reference_genome_file).split(".")[0] + prodigal_train_dir = os.path.join(prodigal_dir, "training") - output_prodigal_train_dir = os.path.join(prodigal_train_dir, f_name + '.trn') + output_prodigal_train_dir = os.path.join(prodigal_train_dir, f_name + ".trn") if not 
os.path.exists(prodigal_train_dir): try: os.makedirs(prodigal_train_dir) - logger.debug('Created prodigal directory for training file %s', f_name) + logger.debug("Created prodigal directory for training file %s", f_name) except: - logger.info('Cannot create prodigal directory for training file %s', f_name) - print ('Error when creating the directory %s for training file', prodigal_train_dir) + logger.info("Cannot create prodigal directory for training file %s", f_name) + print( + "Error when creating the directory %s for training file" + % prodigal_train_dir + ) exit(0) - prodigal_command = ['prodigal' , '-i', reference_genome_file, '-t', output_prodigal_train_dir] - prodigal_result = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - # if prodigal_result.stderr: - # logger.error('cannot create training file for %s', f_name) - # logger.error('prodigal returning error code %s', prodigal_result.stderr) - # return False + prodigal_command = [ + "prodigal", + "-i", + reference_genome_file, + "-t", + output_prodigal_train_dir, + ] + prodigal_result = subprocess.run( + prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + # if prodigal_result.stderr: + # logger.error('cannot create training file for %s', f_name) + # logger.error('prodigal returning error code %s', prodigal_result.stderr) + # return False else: - logger.info('Skeeping prodigal training file creation for %s, as it has already been created', f_name) + logger.info( + "Skipping prodigal training file creation for %s, as it has already been created", + f_name, + ) return output_prodigal_train_dir @@ -171,33 +208,59 @@ def prodigal_training(reference_genome_file, prodigal_dir, logger): # Get Prodigal sample gene prediction # # · * · * · * · * · * · * · * · * · * # -def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger): - f_name = '.'.join(os.path.basename(file_name).split('.')[:-1]) - prodigal_dir_sample = os.path.join(prodigal_dir,f_name) +def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger): + f_name = ".".join(os.path.basename(file_name).split(".")[:-1]) + prodigal_dir_sample = os.path.join(prodigal_dir, f_name) - output_prodigal_coord = os.path.join(prodigal_dir_sample, f_name + '_coord.gff') ## no - output_prodigal_prot = os.path.join(prodigal_dir_sample, f_name + '_prot.faa') ## no - output_prodigal_dna = os.path.join(prodigal_dir_sample, f_name + '_dna.faa') + output_prodigal_coord = os.path.join( + prodigal_dir_sample, f_name + "_coord.gff" + ) ## no + output_prodigal_prot = os.path.join( + prodigal_dir_sample, f_name + "_prot.faa" + ) ## no + output_prodigal_dna = os.path.join(prodigal_dir_sample, f_name + "_dna.faa") if not os.path.exists(prodigal_dir_sample): try: os.makedirs(prodigal_dir_sample) - logger.debug('Created prodigal directory for Core Gene %s', f_name) + logger.debug("Created prodigal directory for Core Gene %s", f_name) except: - logger.info('Cannot create prodigal directory for Core Gene %s' , f_name) - print ('Error when creating the directory %s for prodigal genes prediction', prodigal_dir_sample) + logger.info("Cannot create prodigal directory for Core Gene %s", f_name) + print( + "Error when creating the directory %s for prodigal genes prediction" + % prodigal_dir_sample + ) exit(0) - prodigal_command = ['prodigal' , '-i', file_name , '-t', prodigal_train_dir, '-f', 'gff', '-o', output_prodigal_coord, '-a', output_prodigal_prot, '-d', output_prodigal_dna] - prodigal_result = subprocess.run(prodigal_command,
stdout=subprocess.PIPE, stderr=subprocess.PIPE) + prodigal_command = [ + "prodigal", + "-i", + file_name, + "-t", + prodigal_train_dir, + "-f", + "gff", + "-o", + output_prodigal_coord, + "-a", + output_prodigal_prot, + "-d", + output_prodigal_dna, + ] + prodigal_result = subprocess.run( + prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) # if prodigal_result.stderr: - # logger.error('cannot predict genes for %s ', f_name) - # logger.error('prodigal returning error code %s', prodigal_result.stderr) - #return False + # logger.error('cannot predict genes for %s ', f_name) + # logger.error('prodigal returning error code %s', prodigal_result.stderr) + # return False else: - logger.info('Skeeping prodigal genes prediction for %s, as it has already been made', f_name) + logger.info( + "Skipping prodigal genes prediction for %s, as it has already been made", + f_name, + ) return True @@ -206,64 +269,111 @@ def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger): # Get Prodigal predicted gene sequence equivalent to BLAST result matching bad quality allele or to no Exact Match BLAST result in allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # -def get_prodigal_sequence(blast_sseq, contig_blast_id, prodigal_directory, sample_name, blast_parameters, logger): +def get_prodigal_sequence( + blast_sseq, + contig_blast_id, + prodigal_directory, + sample_name, + blast_parameters, + logger, +): prodigal_directory_sample = os.path.join(prodigal_directory, sample_name) - genes_file = os.path.join(prodigal_directory_sample, sample_name + '_dna.faa') + genes_file = os.path.join(prodigal_directory_sample, sample_name + "_dna.faa") ## Create directory for storing prodigal genes prediction per contig BLAST databases - blastdb_per_contig_directory = 'blastdb_per_contig' - full_path_blastdb_per_contig = os.path.join(prodigal_directory_sample, blastdb_per_contig_directory) + blastdb_per_contig_directory = "blastdb_per_contig" + full_path_blastdb_per_contig = os.path.join( + prodigal_directory_sample, blastdb_per_contig_directory + ) if not os.path.exists(full_path_blastdb_per_contig): try: os.makedirs(full_path_blastdb_per_contig) - logger.info('Directory %s has been created', full_path_blastdb_per_contig) + logger.info("Directory %s has been created", full_path_blastdb_per_contig) except: - print ('Cannot create the directory ', full_path_blastdb_per_contig) - logger.info('Directory %s cannot be created', full_path_blastdb_per_contig) - exit (0) + print("Cannot create the directory ", full_path_blastdb_per_contig) + logger.info("Directory %s cannot be created", full_path_blastdb_per_contig) + exit(0) ## Create directory for storing prodigal genes prediction sequences per contig - prodigal_genes_per_contig_directory = 'prodigal_genes_per_contig' - full_path_prodigal_genes_per_contig = os.path.join(prodigal_directory_sample, prodigal_genes_per_contig_directory) + prodigal_genes_per_contig_directory = "prodigal_genes_per_contig" + full_path_prodigal_genes_per_contig = os.path.join( + prodigal_directory_sample, prodigal_genes_per_contig_directory + ) if not os.path.exists(full_path_prodigal_genes_per_contig): try: os.makedirs(full_path_prodigal_genes_per_contig) - logger.info('Directory %s has been created', full_path_prodigal_genes_per_contig) + logger.info( + "Directory %s has been created", full_path_prodigal_genes_per_contig + ) except: - print
('Cannot create the directory ', full_path_prodigal_genes_per_contig) - logger.info('Directory %s cannot be created', full_path_prodigal_genes_per_contig) - exit (0) + print("Cannot create the directory ", full_path_prodigal_genes_per_contig) + logger.info( + "Directory %s cannot be created", full_path_prodigal_genes_per_contig + ) + exit(0) ## Parse prodigal genes prediction fasta file predicted_genes = SeqIO.parse(genes_file, "fasta") ## Create fasta file containing Prodigal predicted genes sequences for X contig in sample - contig_genes_path = os.path.join(full_path_prodigal_genes_per_contig, contig_blast_id + '.fasta') - with open (contig_genes_path, 'w') as out_fh: + contig_genes_path = os.path.join( + full_path_prodigal_genes_per_contig, contig_blast_id + ".fasta" + ) + with open(contig_genes_path, "w") as out_fh: for rec in predicted_genes: - contig_prodigal_id = '_'.join((rec.id).split("_")[:-1]) + contig_prodigal_id = "_".join((rec.id).split("_")[:-1]) if contig_prodigal_id == contig_blast_id: - out_fh.write ('>' + str(rec.description) + '\n' + str(rec.seq) + '\n') + out_fh.write(">" + str(rec.description) + "\n" + str(rec.seq) + "\n") ## Create local BLAST database for Prodigal predicted genes sequences for X contig in sample - if not create_blastdb(contig_genes_path, full_path_blastdb_per_contig, 'nucl', logger): - print('Error when creating the blastdb for samples files. Check log file for more information. \n ') + if not create_blastdb( + contig_genes_path, full_path_blastdb_per_contig, "nucl", logger + ): + print( + "Error when creating the blastdb for sample files. Check log file for more information. \n " + ) return False ## Local BLAST Prodigal predicted genes sequences database VS BLAST sequence obtained in sample in allele calling analysis - blast_db_name = os.path.join(full_path_blastdb_per_contig, contig_blast_id, contig_blast_id) - - cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 90, outfmt= blast_parameters, max_target_seqs=10, max_hsps=10, num_threads=1) - out, err = cline(stdin = blast_sseq) + blast_db_name = os.path.join( + full_path_blastdb_per_contig, contig_blast_id, contig_blast_id + ) + + cline = NcbiblastnCommandline( + db=blast_db_name, + evalue=0.001, + perc_identity=90, + outfmt=blast_parameters, + max_target_seqs=10, + max_hsps=10, + num_threads=1, + ) + out, err = cline(stdin=blast_sseq) out_lines = out.splitlines() bigger_bitscore = 0 - if len (out_lines) > 0 : - for line in out_lines : - values = line.split('\t') - if float(values[8]) > bigger_bitscore: - qseqid , sseqid , pident , qlen , s_length , mismatch , r_gapopen , r_evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq = values + if len(out_lines) > 0: + for line in out_lines: + values = line.split("\t") + if float(values[8]) > bigger_bitscore: + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values bigger_bitscore = float(bitscore) ## Get complete Prodigal sequence matching allele calling BLAST sequence using ID @@ -272,41 +382,51 @@ def get_prodigal_sequence(blast_sseq, contig_blast_id, prodigal_directory, sampl for rec in predicted_genes_in_contig: if rec.id == sseqid: predicted_gene_sequence = str(rec.seq) - start_prodigal = str(rec.description.split( '#')[1]) - end_prodigal = str(rec.description.split('#')[2]) + start_prodigal = str(rec.description.split("#")[1]) + end_prodigal = str(rec.description.split("#")[2]) break ## Sequence not
found by Prodigal when there are no BLAST results matching allele calling BLAST sequence - if len (out_lines) == 0: - predicted_gene_sequence = 'Sequence not found by Prodigal' - start_prodigal = '-' - end_prodigal = '-' + if len(out_lines) == 0: + predicted_gene_sequence = "Sequence not found by Prodigal" + start_prodigal = "-" + end_prodigal = "-" - return predicted_gene_sequence, start_prodigal, end_prodigal ### start_prodigal y end_prodigal para report prodigal + return ( + predicted_gene_sequence, + start_prodigal, + end_prodigal, + ) ### start_prodigal and end_prodigal for the prodigal report # · * · * · * · * · * · * · * · * · * · * · * · * # # Get samples info before allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * # -def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger): +def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger): ## Initialize dictionary for keeping id-contig contigs_in_sample_dict = {} ## Paths for samples blastdb, Prodigal genes prediction and BLAST results - blast_dir = os.path.join(store_dir,'blastdb') - prodigal_dir = os.path.join(store_dir,'prodigal') - blast_results_seq_dir = os.path.join(store_dir,'blast_results', 'blast_results_seq') + blast_dir = os.path.join(store_dir, "blastdb") + prodigal_dir = os.path.join(store_dir, "prodigal") + blast_results_seq_dir = os.path.join( + store_dir, "blast_results", "blast_results_seq" + ) ## Get training file for Prodigal genes prediction - output_prodigal_train_dir = prodigal_training(reference_genome_file, prodigal_dir, logger) + output_prodigal_train_dir = prodigal_training( + reference_genome_file, prodigal_dir, logger + ) if not output_prodigal_train_dir: - print('Error when creating training file for genes prediction. Check log file for more information. \n ') + print( + "Error when creating training file for genes prediction. Check log file for more information.
\n " + ) return False for fasta_file in sample_file_list: - f_name = '.'.join(os.path.basename(fasta_file).split('.')[:-1]) + f_name = ".".join(os.path.basename(fasta_file).split(".")[:-1]) # Get samples id-contig dictionary fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger) @@ -315,8 +435,8 @@ def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger): contigs_in_sample_dict[f_name] = fasta_file_parsed_dict # dump fasta file into pickle file - #with open (file_list[-1],'wb') as f: # generación de diccionarios de contigs para cada muestra - # pickle.dump(fasta_file_parsed_dict, f) + # with open (file_list[-1],'wb') as f: # generating contig dictionaries for each sample + # pickle.dump(fasta_file_parsed_dict, f) # Create directory for storing BLAST results using reference allele(s) blast_results_seq_per_sample_dir = os.path.join(blast_results_seq_dir, f_name) @@ -324,35 +444,51 @@ if not os.path.exists(blast_results_seq_per_sample_dir): try: os.makedirs(blast_results_seq_per_sample_dir) - logger.debug('Created blast results directory for sample %s', f_name) + logger.debug("Created blast results directory for sample %s", f_name) except: - logger.info('Cannot create blast results directory for sample %s', f_name) - print ('Error when creating the directory for blast results', blast_results_seq_per_sample_dir) + logger.info( + "Cannot create blast results directory for sample %s", f_name + ) + print( + "Error when creating the directory for blast results", + blast_results_seq_per_sample_dir, + ) exit(0) # Prodigal genes prediction for each sample - if not prodigal_prediction(fasta_file, prodigal_dir, output_prodigal_train_dir, logger): - print('Error when predicting genes for samples files. Check log file for more information. \n ') + if not prodigal_prediction( + fasta_file, prodigal_dir, output_prodigal_train_dir, logger + ): + print( + "Error when predicting genes for sample files. Check log file for more information. \n " + ) return False # Create local BLAST db for each sample fasta file - if not create_blastdb(fasta_file, blast_dir, 'nucl', logger): - print('Error when creating the blastdb for samples files. Check log file for more information. \n ') + if not create_blastdb(fasta_file, blast_dir, "nucl", logger): + print( + "Error when creating the blastdb for sample files. Check log file for more information.
\n " + ) return False return contigs_in_sample_dict + # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # # Get established length thresholds for allele tagging in allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # -def length_thresholds(core_name, schema_statistics, percent): ### logger +def length_thresholds(core_name, schema_statistics, percent): ### logger locus_mean = int(schema_statistics[core_name][0]) if percent != "SD": - max_length_threshold = math.ceil(locus_mean + ((locus_mean * float(percent)) / 100)) - min_length_threshold = math.floor(locus_mean - ((locus_mean * float(percent)) / 100)) + max_length_threshold = math.ceil( + locus_mean + ((locus_mean * float(percent)) / 100) + ) + min_length_threshold = math.floor( + locus_mean - ((locus_mean * float(percent)) / 100) + ) else: percent = float(schema_statistics[core_name][1]) @@ -366,103 +502,156 @@ # Convert dna sequence to protein sequence # # · * · * · * · * · * · * · * · * · * · * · # -def convert_to_protein (sequence) : +def convert_to_protein(sequence): seq = Seq.Seq(sequence) protein = str(seq.translate()) return protein + # · * · * · * · * · * · * · * · * · * · * · * · * · * # # Get SNPs between BLAST sequence and matching allele # # · * · * · * · * · * · * · * · * · * · * · * · * · * # -def get_snp (sample, query) : - prot_annotation = {'S': 'polar' ,'T': 'polar' ,'Y': 'polar' ,'Q': 'polar' ,'N': 'polar' ,'C': 'polar' ,'S': 'polar' , - 'F': 'nonpolar' ,'L': 'nonpolar','I': 'nonpolar','M': 'nonpolar','P': 'nonpolar','V': 'nonpolar','A': 'nonpolar','W': 'nonpolar','G': 'nonpolar', - 'D' : 'acidic', 'E' :'acidic', - 'H': 'basic' , 'K': 'basic' , 'R' : 'basic', - '-': '-----', '*' : 'Stop codon'} +def get_snp(sample, query): + prot_annotation = { + "S": "polar", + "T": "polar", + "Y": "polar", + "Q": "polar", + "N": "polar", + "C": "polar", + "F": "nonpolar", + "L": "nonpolar", + "I": "nonpolar", + "M": "nonpolar", + "P": "nonpolar", + "V": "nonpolar", + "A": "nonpolar", + "W": "nonpolar", + "G": "nonpolar", + "D": "acidic", + "E": "acidic", + "H": "basic", + "K": "basic", + "R": "basic", + "-": "-----", + "*": "Stop codon", + } snp_list = [] - sample = sample.replace('-','') - #length = max(len(sample), len(query)) + sample = sample.replace("-", "") + # length = max(len(sample), len(query)) length = len(query) # normalize the length of the sample for the iteration - if len(sample) < length : + if len(sample) < length: need_to_add = length - len(sample) - sample = sample + need_to_add * '-' + sample = sample + need_to_add * "-" # convert to Seq class to translate to protein seq_sample = Seq.Seq(sample) seq_query = Seq.Seq(query) for index in range(length): - if seq_query[index] != seq_sample[index] : + if seq_query[index] != seq_sample[index]: triple_index = index - (index % 3) codon_seq = seq_sample[triple_index : triple_index + 3] codon_que = seq_query[triple_index : triple_index + 3] - if not '-' in str(codon_seq) : + if not "-" in str(codon_seq): prot_seq = str(codon_seq.translate()) prot_que = str(codon_que.translate()) else: - prot_seq = '-' - prot_que = str(seq_query[triple_index: ].translate()) + prot_seq = "-" + prot_que = str(seq_query[triple_index:].translate()) + if
prot_annotation[prot_que[0]] == prot_annotation[prot_seq[0]]: + missense_synonym = "Synonymous" + elif prot_seq == "*": + missense_synonym = "Nonsense" else: - missense_synonym = 'Missense' - #snp_list.append([str(index+1),str(seq_sample[index]) + '/' + str(seq_query[index]), str(codon_seq) + '/'+ str(codon_que), - snp_list.append([str(index+1),str(seq_query[index]) + '/' + str(seq_sample[index]), str(codon_que) + '/'+ str(codon_seq), - # when one of the sequence ends but not the other we will translate the remain sequence to proteins - # in that case we will only annotate the first protein. Using [0] as key of the dictionary annotation - prot_que + '/' + prot_seq, missense_synonym, prot_annotation[prot_que[0]] + ' / ' + prot_annotation[prot_seq[0]]]) - if '-' in str(codon_seq) : + missense_synonym = "Missense" + # snp_list.append([str(index+1),str(seq_sample[index]) + '/' + str(seq_query[index]), str(codon_seq) + '/'+ str(codon_que), + snp_list.append( + [ + str(index + 1), + str(seq_query[index]) + "/" + str(seq_sample[index]), + str(codon_que) + "/" + str(codon_seq), + # when one of the sequences ends but not the other we will translate the remaining sequence to protein + # in that case we will only annotate the first protein, using [0] as the key for the annotation dictionary + prot_que + "/" + prot_seq, + missense_synonym, + prot_annotation[prot_que[0]] + " / " + prot_annotation[prot_seq[0]], + ] + ) + if "-" in str(codon_seq): break return snp_list -def nucleotide_to_protein_alignment (sample_seq, query_seq ) : ### Sustituido por get_alignment +def nucleotide_to_protein_alignment( + sample_seq, query_seq +): ### Replaced by get_alignment aligment = [] sample_prot = convert_to_protein(sample_seq) query_prot = convert_to_protein(query_seq) minimun_length = min(len(sample_prot), len(query_prot)) for i in range(minimun_length): - if sample_prot[i] == query_prot[i] : - aligment.append('|') + if sample_prot[i] == query_prot[i]: + aligment.append("|") else: - aligment.append(' ') - protein_alignment = [['sample', sample_prot],['match', ''.join(aligment)], ['schema', query_prot]] + aligment.append(" ") + protein_alignment = [ + ["sample", sample_prot], + ["match", "".join(aligment)], + ["schema", query_prot], + ] return protein_alignment -def get_alignment_for_indels (blast_db_name, qseq) : ### Sustituido por get_alignment - #match_alignment =[] - cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=1) - out, err = cline(stdin = qseq) +def get_alignment_for_indels(blast_db_name, qseq): ### Replaced by get_alignment + # match_alignment =[] + cline = NcbiblastnCommandline( + db=blast_db_name, + evalue=0.001, + perc_identity=80, + outfmt=5, + max_target_seqs=10, + max_hsps=10, + num_threads=1, + ) + out, err = cline(stdin=qseq) psiblast_xml = StringIO(out) blast_records = NCBIXML.parse(psiblast_xml) for blast_record in blast_records: for alignment in blast_record.alignments: for match in alignment.hsps: - match_alignment = [['sample', match.sbjct],['match', match.match], ['schema',match.query]] + match_alignment = [ + ["sample", match.sbjct], + ["match", match.match], + ["schema", match.query], + ] return match_alignment -def get_alignment_for_deletions (sample_seq, query_seq): ### Sustituido por get_alignment +def get_alignment_for_deletions( + sample_seq, query_seq +): ### Replaced by get_alignment index_found = False alignments = pairwise2.align.globalxx(sample_seq, query_seq) - for index in
range(len(alignments)) : - if alignments[index][4] == len(query_seq) : + for index in range(len(alignments)): + if alignments[index][4] == len(query_seq): index_found = True break - if not index_found : + if not index_found: index = 0 - values = format_alignment(*alignments[index]).split('\n') - match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]] + values = format_alignment(*alignments[index]).split("\n") + match_alignment = [ + ["sample", values[0]], + ["match", values[1]], + ["schema", values[2]], + ] return match_alignment @@ -470,8 +659,10 @@ def get_alignment_for_deletions (sample_seq, query_seq): ### Sustituido por get_ # Get DNA and protein alignment between the final sequence found in the sample and the matching allele # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # -def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, seq_type = "dna"): +def get_alignment( + sample_seq, query_seq, reward, penalty, gapopen, gapextend, seq_type="dna" +): ## If sequences alignment type desired is "protein" convert dna sequences to protein if seq_type == "protein": sample_seq = convert_to_protein(sample_seq) @@ -479,9 +670,15 @@ def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, s ## Get dna/protein alignment between final sequence found and matching allele # arguments pairwise2.align.globalms: match, mismatch, gap opening, gap extending - alignments = pairwise2.align.localms(sample_seq, query_seq, reward, penalty, -gapopen, -gapextend) - values = format_alignment(*alignments[0]).split('\n') - match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]] + alignments = pairwise2.align.localms( + sample_seq, query_seq, reward, penalty, -gapopen, -gapextend + ) + values = format_alignment(*alignments[0]).split("\n") + match_alignment = [ + ["sample", values[0]], + ["match", values[1]], + ["schema", values[2]], + ] return match_alignment @@ -490,102 +687,135 @@ def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, s # Tag LNF cases and keep LNF info # # · * · * · * · * · * · * · * · * # -def lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length, new_sequence_length, perc_identity_ref, coverage, schema_quality, annotation_core_dict, count_dict, logger): +def lnf_tpr_tag( + core_name, + sample_name, + alleles_in_locus_dict, + samples_matrix_dict, + lnf_tpr_dict, + schema_statistics, + locus_alleles_path, + qseqid, + pident, + s_length, + new_sequence_length, + perc_identity_ref, + coverage, + schema_quality, + annotation_core_dict, + count_dict, + logger, +): gene_annot, product_annot = annotation_core_dict[core_name] - if qseqid == '-': - samples_matrix_dict[sample_name].append('LNF') - tag_report = 'LNF' - matching_allele_length = '-' + if qseqid == "-": + samples_matrix_dict[sample_name].append("LNF") + tag_report = "LNF" + matching_allele_length = "-" else: - if new_sequence_length == '-': - samples_matrix_dict[sample_name].append('LNF_' + str(qseqid)) - tag_report = 'LNF' + if new_sequence_length == "-": + samples_matrix_dict[sample_name].append("LNF_" + str(qseqid)) + tag_report = "LNF" else: - samples_matrix_dict[sample_name].append('TPR_' + str(qseqid)) - tag_report = 'TPR' + samples_matrix_dict[sample_name].append("TPR_" + str(qseqid)) + tag_report = "TPR" matching_allele_seq = 
alleles_in_locus_dict[core_name][qseqid] matching_allele_length = len(matching_allele_seq) - #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse - #for allele in alleles_in_locus : ## parse - #if allele.id == qseqid : ## parse - #break ## parse - #matching_allele_seq = str(allele.seq) ## parse - #matching_allele_length = len(matching_allele_seq) ## parse + # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse + # for allele in alleles_in_locus : ## parse + # if allele.id == qseqid : ## parse + # break ## parse + # matching_allele_seq = str(allele.seq) ## parse + # matching_allele_length = len(matching_allele_seq) ## parse - if pident == '-': + if pident == "-": # (los dos BLAST sin resultado) - coverage_blast = '-' - coverage_new_sequence = '-' - add_info = 'Locus not found' - logger.info('Locus not found at sample %s, for gene %s', sample_name, core_name) + coverage_blast = "-" + coverage_new_sequence = "-" + add_info = "Locus not found" + logger.info("Locus not found at sample %s, for gene %s", sample_name, core_name) # Get allele quality - allele_quality = '-' + allele_quality = "-" # (recuento tags para plot) - count_dict[sample_name]['not_found'] += 1 - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["not_found"] += 1 + count_dict[sample_name]["total"] += 1 elif 90 > float(pident): # (BLAST 90 sin resultado y BLAST 70 con resultado) - coverage_blast = '-' - coverage_new_sequence = '-' - add_info = 'BLAST sequence ID under threshold: {}%'.format(perc_identity_ref) - logger.info('BLAST sequence ID %s under threshold at sample %s, for gene %s', pident, sample_name, core_name) + coverage_blast = "-" + coverage_new_sequence = "-" + add_info = "BLAST sequence ID under threshold: {}%".format(perc_identity_ref) + logger.info( + "BLAST sequence ID %s under threshold at sample %s, for gene %s", + pident, + sample_name, + core_name, + ) # Get allele quality - allele_quality = '-' + allele_quality = "-" # (recuento tags para plot) - count_dict[sample_name]['low_id'] += 1 - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["low_id"] += 1 + count_dict[sample_name]["total"] += 1 - elif 90 <= float(pident) and new_sequence_length == '-': + elif 90 <= float(pident) and new_sequence_length == "-": # (BLAST 90 con resultado, bajo coverage BLAST) locus_mean = int(schema_statistics[core_name][0]) coverage_blast = int(s_length) / locus_mean - #coverage_blast = int(s_length) / matching_allele_length - coverage_new_sequence = '-' + # coverage_blast = int(s_length) / matching_allele_length + coverage_new_sequence = "-" if coverage_blast < 1: - add_info = 'BLAST sequence coverage under threshold: {}%'.format(coverage) + add_info = "BLAST sequence coverage under threshold: {}%".format(coverage) else: - add_info = 'BLAST sequence coverage above threshold: {}%'.format(coverage) - logger.info('BLAST sequence coverage %s under threshold at sample %s, for gene %s', coverage_blast, sample_name, core_name) + add_info = "BLAST sequence coverage above threshold: {}%".format(coverage) + logger.info( + "BLAST sequence coverage %s under threshold at sample %s, for gene %s", + coverage_blast, + sample_name, + core_name, + ) # Get allele quality - allele_quality = '-' + allele_quality = "-" # (recuento tags para plot) - count_dict[sample_name]['low_coverage'] += 1 - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["low_coverage"] += 1 + count_dict[sample_name]["total"] += 1 - elif 90 <= float(pident) and new_sequence_length != '-': 
+ elif 90 <= float(pident) and new_sequence_length != "-": # (BLAST 90 con resultado, buen coverage BLAST, bajo coverage new_sseq) locus_mean = int(schema_statistics[core_name][0]) coverage_blast = int(s_length) / locus_mean * 100 - #coverage_blast = int(s_length) / matching_allele_length + # coverage_blast = int(s_length) / matching_allele_length coverage_new_sequence = new_sequence_length / matching_allele_length * 100 if coverage_new_sequence < 1: - add_info = 'New sequence coverage under threshold: {}%'.format(coverage) + add_info = "New sequence coverage under threshold: {}%".format(coverage) else: - add_info = 'New sequence coverage above threshold: {}%'.format(coverage) - logger.info('New sequence coverage %s under threshold at sample %s, for gene %s', coverage_new_sequence, sample_name, core_name) + add_info = "New sequence coverage above threshold: {}%".format(coverage) + logger.info( + "New sequence coverage %s under threshold at sample %s, for gene %s", + coverage_new_sequence, + sample_name, + core_name, + ) # Get allele quality allele_quality = schema_quality[core_name][qseqid] # (recuento tags para plot) - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["total"] += 1 for count_class in count_dict[sample_name]: if count_class in allele_quality: count_dict[sample_name][count_class] += 1 - #if "bad_quality" in allele_quality: - # count_dict[sample_name]['bad_quality'] += 1 + # if "bad_quality" in allele_quality: + # count_dict[sample_name]['bad_quality'] += 1 ## Keeping LNF and TPR report info if not core_name in lnf_tpr_dict: @@ -593,7 +823,22 @@ if not sample_name in lnf_tpr_dict[core_name]: lnf_tpr_dict[core_name][sample_name] = [] - lnf_tpr_dict[core_name][sample_name].append([gene_annot, product_annot, tag_report, qseqid, allele_quality, pident, str(coverage_blast), str(coverage_new_sequence), str(matching_allele_length), str(s_length), str(new_sequence_length), add_info]) ### Meter secuencias alelo, blast y new_sseq (si las hay)? + lnf_tpr_dict[core_name][sample_name].append( + [ + gene_annot, + product_annot, + tag_report, + qseqid, + allele_quality, + pident, + str(coverage_blast), + str(coverage_new_sequence), + str(matching_allele_length), + str(s_length), + str(new_sequence_length), + add_info, + ] + ) ### Include the allele, blast and new_sseq sequences (if any)?
return True @@ -602,20 +847,37 @@ # Tag paralog and exact match cases and keep info # # · * · * · * · * · * · * · * · * · * · * · * · * # -def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, tag_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_dict, logger): - logger.info('Found %s at sample %s for core gene %s ', tag, sample_name, core_name) - - paralog_quality_count = [] # (lista para contabilizar parálogos debido a bad o good quality) +def paralog_exact_tag( + sample_name, + core_name, + tag, + schema_quality, + matching_genes_dict, + samples_matrix_dict, + allele_found, + tag_dict, + prodigal_report, + prodigal_directory, + blast_parameters, + annotation_core_dict, + count_dict, + logger, +): + logger.info("Found %s at sample %s for core gene %s ", tag, sample_name, core_name) + + paralog_quality_count = ( + [] + ) # (list to count paralogs due to bad or good quality) gene_annot, product_annot = annotation_core_dict[core_name] - if not sample_name in tag_dict : + if not sample_name in tag_dict: tag_dict[sample_name] = {} - if not core_name in tag_dict[sample_name] : - tag_dict[sample_name][core_name]= [] + if not core_name in tag_dict[sample_name]: + tag_dict[sample_name][core_name] = [] - if tag == 'EXACT': + if tag == "EXACT": allele = list(allele_found.keys())[0] qseqid = allele_found[allele][0] tag = qseqid @@ -623,8 +885,24 @@ samples_matrix_dict[sample_name].append(tag) for sequence in allele_found: - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = allele_found[sequence] - sseq = sseq.replace('-', '') + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = allele_found[sequence] + sseq = sseq.replace("-", "") # Get allele quality allele_quality = schema_quality[core_name][qseqid] @@ -633,35 +911,87 @@ paralog_quality_count.append(allele_quality) # Get prodigal gene prediction if allele quality is 'bad_quality' - if 'bad_quality' in allele_quality: - complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq, sseqid, prodigal_directory, sample_name, blast_parameters, logger) + if "bad_quality" in allele_quality: + ( + complete_predicted_seq, + start_prodigal, + end_prodigal, + ) = get_prodigal_sequence( + sseq, sseqid, prodigal_directory, sample_name, blast_parameters, logger + ) ##### informe prodigal ##### - prodigal_report.append([core_name, sample_name, qseqid, tag, sstart, send, start_prodigal, end_prodigal, sseq, complete_predicted_seq]) + prodigal_report.append( + [ + core_name, + sample_name, + qseqid, + tag, + sstart, + send, + start_prodigal, + end_prodigal, + sseq, + complete_predicted_seq, + ] + ) else: - complete_predicted_seq = '-' + complete_predicted_seq = "-" - if not sseqid in matching_genes_dict[sample_name] : + if not sseqid in matching_genes_dict[sample_name]: matching_genes_dict[sample_name][sseqid] = [] - if sstart > send : - #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', tag]) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'-', tag])
+ if sstart > send: + # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', tag]) + matching_genes_dict[sample_name][sseqid].append( + [core_name, qseqid, sstart, send, "-", tag] + ) else: - #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'+', tag]) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'+', tag]) + # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'+', tag]) + matching_genes_dict[sample_name][sseqid].append( + [core_name, qseqid, sstart, send, "+", tag] + ) ## Keeping paralog NIPH/NIPHEM report info - if tag == 'NIPH' or tag == 'NIPHEM': - tag_dict[sample_name][core_name].append([gene_annot, product_annot, tag, pident, qseqid, allele_quality, sseqid, bitscore, sstart, send, sseq, complete_predicted_seq]) + if tag == "NIPH" or tag == "NIPHEM": + tag_dict[sample_name][core_name].append( + [ + gene_annot, + product_annot, + tag, + pident, + qseqid, + allele_quality, + sseqid, + bitscore, + sstart, + send, + sseq, + complete_predicted_seq, + ] + ) else: - tag_dict[sample_name][core_name] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, s_length, sstart, send, sseq, complete_predicted_seq] + tag_dict[sample_name][core_name] = [ + gene_annot, + product_annot, + qseqid, + allele_quality, + sseqid, + s_length, + sstart, + send, + sseq, + complete_predicted_seq, + ] # (recuento tags para plot) - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["total"] += 1 for count_class in count_dict[sample_name]: if count_class in allele_quality: - if "no_start_stop" not in count_class and "no_start_stop" in allele_quality: + if ( + "no_start_stop" not in count_class + and "no_start_stop" in allele_quality + ): if count_class == "bad_quality": count_dict[sample_name][count_class] += 1 else: @@ -673,10 +1003,13 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene for paralog_quality in paralog_quality_count: count += 1 if "bad_quality" in paralog_quality: - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["total"] += 1 for count_class in count_dict[sample_name]: if count_class in paralog_quality: - if "no_start_stop" not in count_class and "no_start_stop" in paralog_quality: + if ( + "no_start_stop" not in count_class + and "no_start_stop" in paralog_quality + ): if count_class == "bad_quality": count_dict[sample_name][count_class] += 1 else: @@ -687,8 +1020,8 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene else: if count == len(paralog_quality_count): - count_dict[sample_name]['total'] += 1 - count_dict[sample_name]['good_quality'] += 1 + count_dict[sample_name]["total"] += 1 + count_dict[sample_name]["good_quality"] += 1 return True @@ -697,97 +1030,225 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene # Tag INF/ASM/ALM/PLOT cases and keep info # # · * · * · * · * · * · * · * · * · * · * # -def inf_asm_alm_tag(core_name, sample_name, tag, blast_values, allele_quality, new_sseq, matching_allele_length, tag_dict, list_tag, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_dict, logger): +def inf_asm_alm_tag( + core_name, + sample_name, + tag, + blast_values, + allele_quality, + new_sseq, + matching_allele_length, + tag_dict, + list_tag, + samples_matrix_dict, + matching_genes_dict, + prodigal_report, + start_prodigal, + end_prodigal, + complete_predicted_seq, + 
annotation_core_dict, + count_dict, + logger, +): gene_annot, product_annot = annotation_core_dict[core_name] - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = blast_values - - sseq = sseq.replace('-', '') + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = blast_values + + sseq = sseq.replace("-", "") s_length = len(sseq) new_sequence_length = len(new_sseq) - logger.info('Found %s at sample %s for core gene %s ', tag, sample_name, core_name) + logger.info("Found %s at sample %s for core gene %s ", tag, sample_name, core_name) - if tag == 'PLOT': - tag_allele = tag + '_' + str(qseqid) + if tag == "PLOT": + tag_allele = tag + "_" + str(qseqid) else: # Adding ASM/ALM/INF allele to the allele_matrix if it is not already include if not core_name in tag_dict: tag_dict[core_name] = [] - if not new_sseq in tag_dict[core_name] : + if not new_sseq in tag_dict[core_name]: tag_dict[core_name].append(new_sseq) # Find the index of ASM/ALM/INF to include it in the sample matrix dict index_tag = tag_dict[core_name].index(new_sseq) - tag_allele = tag + '_' + core_name + '_' + str(qseqid) + '_' + str(index_tag) + tag_allele = tag + "_" + core_name + "_" + str(qseqid) + "_" + str(index_tag) samples_matrix_dict[sample_name].append(tag_allele) # Keeping INF/ASM/ALM/PLOT report info - if not core_name in list_tag : + if not core_name in list_tag: list_tag[core_name] = {} - if not sample_name in list_tag[core_name] : + if not sample_name in list_tag[core_name]: list_tag[core_name][sample_name] = {} - if tag == 'INF': - list_tag[core_name][sample_name][tag_allele] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, str(matching_allele_length), str(s_length), str(new_sequence_length), mismatch , r_gapopen, sstart, send, new_sseq, complete_predicted_seq] + if tag == "INF": + list_tag[core_name][sample_name][tag_allele] = [ + gene_annot, + product_annot, + qseqid, + allele_quality, + sseqid, + bitscore, + str(matching_allele_length), + str(s_length), + str(new_sequence_length), + mismatch, + r_gapopen, + sstart, + send, + new_sseq, + complete_predicted_seq, + ] # (recuento tags para plots) - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["total"] += 1 for count_class in count_dict[sample_name]: if count_class in allele_quality: count_dict[sample_name][count_class] += 1 - #if "bad_quality" in allele_quality: - # count_dict[sample_name]['bad_quality'] += 1 - - elif tag == 'PLOT': - list_tag[core_name][sample_name] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, sstart, send, sseq, new_sseq] + # if "bad_quality" in allele_quality: + # count_dict[sample_name]['bad_quality'] += 1 + + elif tag == "PLOT": + list_tag[core_name][sample_name] = [ + gene_annot, + product_annot, + qseqid, + allele_quality, + sseqid, + bitscore, + sstart, + send, + sseq, + new_sseq, + ] # (recuento tags para plots) - count_dict[sample_name]['total'] += 1 + count_dict[sample_name]["total"] += 1 - else : - if tag == 'ASM': - newsseq_vs_blastseq = 'shorter' - elif tag == 'ALM': - newsseq_vs_blastseq = 'longer' + else: + if tag == "ASM": + newsseq_vs_blastseq = "shorter" + elif tag == "ALM": + newsseq_vs_blastseq = "longer" if len(sseq) < matching_allele_length: - add_info = 'Global effect: DELETION. BLAST sequence length shorter than matching allele sequence length / Net result: ' + tag + '. 
Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length' + add_info = ( + "Global effect: DELETION. BLAST sequence length shorter than matching allele sequence length / Net result: " + + tag + + ". Final gene sequence length " + + newsseq_vs_blastseq + + " than matching allele sequence length" + ) elif len(sseq) == matching_allele_length: - add_info = 'Global effect: BASE SUBSTITUTION. BLAST sequence length equal to matching allele sequence length / Net result: ' + tag + '. Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length' + add_info = ( + "Global effect: BASE SUBSTITUTION. BLAST sequence length equal to matching allele sequence length / Net result: " + + tag + + ". Final gene sequence length " + + newsseq_vs_blastseq + + " than matching allele sequence length" + ) elif len(sseq) > matching_allele_length: - add_info = 'Global effect: INSERTION. BLAST sequence length longer than matching allele sequence length / Net result: ' + tag + '. Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length' - - list_tag[core_name][sample_name][tag_allele] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, str(matching_allele_length), str(s_length), str(new_sequence_length), mismatch , r_gapopen, sstart, send, new_sseq, add_info, complete_predicted_seq] + add_info = ( + "Global effect: INSERTION. BLAST sequence length longer than matching allele sequence length / Net result: " + + tag + + ". Final gene sequence length " + + newsseq_vs_blastseq + + " than matching allele sequence length" + ) + + list_tag[core_name][sample_name][tag_allele] = [ + gene_annot, + product_annot, + qseqid, + allele_quality, + sseqid, + bitscore, + str(matching_allele_length), + str(s_length), + str(new_sequence_length), + mismatch, + r_gapopen, + sstart, + send, + new_sseq, + add_info, + complete_predicted_seq, + ] # (recuento tags para plots) - if tag == 'ASM': - count_dict[sample_name]['total'] += 1 + if tag == "ASM": + count_dict[sample_name]["total"] += 1 for mut_type in count_dict[sample_name]: if mut_type in add_info.lower(): count_dict[sample_name][mut_type] += 1 - elif tag == 'ALM': - count_dict[sample_name]['total'] += 1 + elif tag == "ALM": + count_dict[sample_name]["total"] += 1 for mut_type in count_dict[sample_name]: if mut_type in add_info.lower(): count_dict[sample_name][mut_type] += 1 - if not sseqid in matching_genes_dict[sample_name] : + if not sseqid in matching_genes_dict[sample_name]: matching_genes_dict[sample_name][sseqid] = [] - if sstart > send : - #matching_genes_dict[sample_name][sseqid].append([core_name, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele]) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele]) + if sstart > send: + # matching_genes_dict[sample_name][sseqid].append([core_name, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele]) + matching_genes_dict[sample_name][sseqid].append( + [ + core_name, + qseqid, + str(int(sstart) - new_sequence_length - 1), + sstart, + "-", + tag_allele, + ] + ) else: - #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, str(int(sstart)+ new_sequence_length),'+', tag_allele]) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, str(int(sstart)+ new_sequence_length),'+', tag_allele]) + # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, str(int(sstart)+ 
new_sequence_length),'+', tag_allele]) + matching_genes_dict[sample_name][sseqid].append( + [ + core_name, + qseqid, + sstart, + str(int(sstart) + new_sequence_length), + "+", + tag_allele, + ] + ) ##### informe prodigal ##### - prodigal_report.append([core_name, sample_name, qseqid, tag_allele, sstart, send, start_prodigal, end_prodigal, sseq, complete_predicted_seq]) + prodigal_report.append( + [ + core_name, + sample_name, + qseqid, + tag_allele, + sstart, + send, + start_prodigal, + end_prodigal, + sseq, + complete_predicted_seq, + ] + ) return True @@ -796,24 +1257,41 @@ def inf_asm_alm_tag(core_name, sample_name, tag, blast_values, allele_quality, n # Keep best results info after BLAST using results from previous reference allele BLAST as database VS ALL alleles in locus as query in allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # -def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger) : - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values +def get_blast_results( + sample_name, values, contigs_in_sample_dict, allele_found, logger +): + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values ## Get contig ID and BLAST sequence - sseqid_blast = "_".join(sseqid.split('_')[1:]) - sseq_no_gaps = sseq.replace('-', '') - + sseqid_blast = "_".join(sseqid.split("_")[1:]) + sseq_no_gaps = sseq.replace("-", "") ## Get start and end positions in contig searching for BLAST sequence index in contig sequence # Get contig sequence accession_sequence = contigs_in_sample_dict[sample_name][sseqid_blast] - #for record in sample_contigs: ## parse - #if record.id == sseqid_blast : ## parse - #break ## parse - #accession_sequence = str(record.seq) ## parse + # for record in sample_contigs: ## parse + # if record.id == sseqid_blast : ## parse + # break ## parse + # accession_sequence = str(record.seq) ## parse # Try to get BLAST sequence index in contig. If index -> error because different contig sequence and BLAST sequence # direction, obtain reverse complement BLAST sequence and try again. 
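The lookup described above finds each BLAST hit by searching for the gapless subject sequence inside its contig with str.index; when that raises ValueError, the hit is assumed to lie on the opposite strand, so the reverse complement is tried before giving up. A minimal runnable sketch of that strand-aware lookup, assuming Biopython is available; locate_blast_hit and its return convention are illustrative names, not functions from this module:

from Bio.Seq import Seq

def locate_blast_hit(contig_seq: str, sseq_no_gaps: str):
    # Search the forward strand first; fall back to the reverse complement.
    try:
        start = contig_seq.index(sseq_no_gaps)
        direction = "forward"
    except ValueError:
        rc = str(Seq(sseq_no_gaps).reverse_complement())
        start = contig_seq.index(rc)  # still raises ValueError if truly absent
        direction = "reverse"
    end = start + len(sseq_no_gaps)
    # Return 1-based, inclusive coordinates, comparable to BLAST's sstart/send.
    return start + 1, end, direction

Returning 1-based coordinates keeps the recomputed positions directly comparable with the sstart/send fields taken from the BLAST output.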
@@ -834,26 +1312,55 @@ def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found sstart_new = str(max(sseq_index_1, sseq_index_2)) send_new = str(min(sseq_index_1, sseq_index_2)) - ## Keep BLAST results info discarding subsets allele_is_subset = False - if len(allele_found) > 0 : - for allele_id in allele_found : - min_index = min(int(allele_found[allele_id][9]), int(allele_found[allele_id][10])) - max_index = max(int(allele_found[allele_id][9]), int(allele_found[allele_id][10])) - if int(sstart_new) in range(min_index, max_index + 1) or int(send_new) in range(min_index, max_index + 1): # if both genome locations overlap - if sseqid_blast == allele_found[allele_id][1]: # if both sequences are in the same contig - logger.info('Found allele %s that starts or ends at the same position as %s ' , qseqid, allele_id) + if len(allele_found) > 0: + for allele_id in allele_found: + min_index = min( + int(allele_found[allele_id][9]), int(allele_found[allele_id][10]) + ) + max_index = max( + int(allele_found[allele_id][9]), int(allele_found[allele_id][10]) + ) + if int(sstart_new) in range(min_index, max_index + 1) or int( + send_new + ) in range( + min_index, max_index + 1 + ): # if both genome locations overlap + if ( + sseqid_blast == allele_found[allele_id][1] + ): # if both sequences are in the same contig + logger.info( + "Found allele %s that starts or ends at the same position as %s ", + qseqid, + allele_id, + ) allele_is_subset = True break - if len(allele_found) == 0 or not allele_is_subset : - contig_id_start = str(sseqid_blast + '_'+ sstart_new) + if len(allele_found) == 0 or not allele_is_subset: + contig_id_start = str(sseqid_blast + "_" + sstart_new) # Skip the allele found in the 100% identity and 100% alignment if not contig_id_start in allele_found: - allele_found[contig_id_start] = [qseqid, sseqid_blast, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart_new, send_new, '-', '-', sseq, qseq] + allele_found[contig_id_start] = [ + qseqid, + sseqid_blast, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart_new, + send_new, + "-", + "-", + sseq, + qseq, + ] return True @@ -862,35 +1369,55 @@ def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found # Get SNPs and ADN and protein alignment # # · * · * · * · * · * · * · * · * · * · # -def keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_direction, core_name, sample_name, reward, penalty, gapopen, gapextend, snp_dict, match_alignment_dict, protein_dict, logger): +def keep_snp_alignment_info( + sseq, + new_sseq, + matching_allele_seq, + qseqid, + query_direction, + core_name, + sample_name, + reward, + penalty, + gapopen, + gapextend, + snp_dict, + match_alignment_dict, + protein_dict, + logger, +): ## Check allele sequence direction - if query_direction == 'reverse': + if query_direction == "reverse": matching_allele_seq = str(Seq.Seq(matching_allele_seq).reverse_complement()) else: matching_allele_seq = str(matching_allele_seq) ## Get the SNP information snp_information = get_snp(sseq, matching_allele_seq) - if len(snp_information) > 0 : - if not core_name in snp_dict : + if len(snp_information) > 0: + if not core_name in snp_dict: snp_dict[core_name] = {} - if not sample_name in snp_dict[core_name] : + if not sample_name in snp_dict[core_name]: snp_dict[core_name][sample_name] = {} - snp_dict[core_name][sample_name][qseqid]= snp_information + snp_dict[core_name][sample_name][qseqid] = snp_information ## Get new 
sequence-allele sequence dna alignment - if not core_name in match_alignment_dict : + if not core_name in match_alignment_dict: match_alignment_dict[core_name] = {} - if not sample_name in match_alignment_dict[core_name] : - match_alignment_dict[core_name][sample_name] = get_alignment (new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend) + if not sample_name in match_alignment_dict[core_name]: + match_alignment_dict[core_name][sample_name] = get_alignment( + new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend + ) ## Get new sequence-allele sequence protein alignment - if not core_name in protein_dict : + if not core_name in protein_dict: protein_dict[core_name] = {} - if not sample_name in protein_dict[core_name] : + if not sample_name in protein_dict[core_name]: protein_dict[core_name][sample_name] = [] - protein_dict[core_name][sample_name] = get_alignment (new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend, "protein") + protein_dict[core_name][sample_name] = get_alignment( + new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend, "protein" + ) return True @@ -899,52 +1426,81 @@ def keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_d # Create allele tag summary for each sample # # · * · * · * · * · * · * · * · * · * · * · # -def create_summary (samples_matrix_dict, logger) : +def create_summary(samples_matrix_dict, logger): summary_dict = {} summary_result_list = [] - summary_heading_list = ['Exact match', 'INF', 'ASM', 'ALM', 'LNF', 'TPR', 'NIPH', 'NIPHEM', 'PLOT', 'ERROR'] - summary_result_list.append('File\t' + '\t'.join(summary_heading_list)) - for key in sorted (samples_matrix_dict) : - - summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM':0, 'ALM':0, 'LNF':0, 'TPR':0,'NIPH':0, 'NIPHEM':0, 'PLOT':0, 'ERROR':0} - for values in samples_matrix_dict[key] : - if 'INF_' in values : - summary_dict[key]['INF'] += 1 - elif 'ASM_' in values : - summary_dict[key]['ASM'] += 1 - elif 'ALM_' in values : - summary_dict[key]['ALM'] += 1 - elif 'LNF' in values : - summary_dict[key]['LNF'] += 1 - elif 'TPR' in values : - summary_dict[key]['TPR'] += 1 - elif 'NIPH' == values : - summary_dict[key]['NIPH'] += 1 - elif 'NIPHEM' == values : - summary_dict[key]['NIPHEM'] += 1 - elif 'PLOT' in values : - summary_dict[key]['PLOT'] += 1 - elif 'ERROR' in values : - summary_dict[key]['ERROR'] += 1 + summary_heading_list = [ + "Exact match", + "INF", + "ASM", + "ALM", + "LNF", + "TPR", + "NIPH", + "NIPHEM", + "PLOT", + "ERROR", + ] + summary_result_list.append("File\t" + "\t".join(summary_heading_list)) + for key in sorted(samples_matrix_dict): + summary_dict[key] = { + "Exact match": 0, + "INF": 0, + "ASM": 0, + "ALM": 0, + "LNF": 0, + "TPR": 0, + "NIPH": 0, + "NIPHEM": 0, + "PLOT": 0, + "ERROR": 0, + } + for values in samples_matrix_dict[key]: + if "INF_" in values: + summary_dict[key]["INF"] += 1 + elif "ASM_" in values: + summary_dict[key]["ASM"] += 1 + elif "ALM_" in values: + summary_dict[key]["ALM"] += 1 + elif "LNF" in values: + summary_dict[key]["LNF"] += 1 + elif "TPR" in values: + summary_dict[key]["TPR"] += 1 + elif "NIPH" == values: + summary_dict[key]["NIPH"] += 1 + elif "NIPHEM" == values: + summary_dict[key]["NIPHEM"] += 1 + elif "PLOT" in values: + summary_dict[key]["PLOT"] += 1 + elif "ERROR" in values: + summary_dict[key]["ERROR"] += 1 else: try: number = int(values) - summary_dict[key]['Exact match'] +=1 + summary_dict[key]["Exact match"] += 1 except: - if '_' in values : + if "_" in values: tmp_value = 
values try: number = int(tmp_value[-1]) - summary_dict[key]['Exact match'] +=1 + summary_dict[key]["Exact match"] += 1 except: - logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] ) + logger.debug( + "The value %s, was found when collecting summary information for the %s", + values, + summary_dict[key], + ) else: - logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] ) + logger.debug( + "The value %s, was found when collecting summary information for the %s", + values, + summary_dict[key], + ) summary_sample_list = [] - for item in summary_heading_list : + for item in summary_heading_list: summary_sample_list.append(str(summary_dict[key][item])) - summary_result_list.append(key + '\t' +'\t'.join(summary_sample_list)) + summary_result_list.append(key + "\t" + "\t".join(summary_sample_list)) return summary_result_list @@ -952,24 +1508,55 @@ def create_summary (samples_matrix_dict, logger) : # Get gene and product annotation for core gene using Prokka # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · # -### (tsv para algunos locus? Utils para analyze schema?) -def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegenus, logger) : - name_file = os.path.basename(annotation_file).split('.') - annotation_dir = os.path.join (annotation_dir, 'annotation', name_file[0]) - - if usegenus == 'true': - annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir, - '--genus', genus, '--species', species, '--usegenus', - '--gcode', '11', '--prefix', name_file[0], '--quiet']) - - elif usegenus == 'false': - annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir, - '--genus', genus, '--species', species, - '--gcode', '11', '--prefix', name_file[0], '--quiet']) +### (tsv para algunos locus? Utils para analyze schema?) 
+def get_gene_annotation( + annotation_file, annotation_dir, genus, species, usegenus, logger +): + name_file = os.path.basename(annotation_file).split(".") + annotation_dir = os.path.join(annotation_dir, "annotation", name_file[0]) + + if usegenus == "true": + annotation_result = subprocess.run( + [ + "prokka", + annotation_file, + "--outdir", + annotation_dir, + "--genus", + genus, + "--species", + species, + "--usegenus", + "--gcode", + "11", + "--prefix", + name_file[0], + "--quiet", + ] + ) + + elif usegenus == "false": + annotation_result = subprocess.run( + [ + "prokka", + annotation_file, + "--outdir", + annotation_dir, + "--genus", + genus, + "--species", + species, + "--gcode", + "11", + "--prefix", + name_file[0], + "--quiet", + ] + ) annot_tsv = [] - tsv_path = os.path.join (annotation_dir, name_file[0] + '.tsv') + tsv_path = os.path.join(annotation_dir, name_file[0] + ".tsv") try: with open(tsv_path) as tsvfile: @@ -978,31 +1565,31 @@ def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegen annot_tsv.append(line) if len(annot_tsv) > 1: - gene_index = annot_tsv[0].index("gene") product_index = annot_tsv[0].index("product") try: - if '_' in annot_tsv[1][2]: - gene_annot = annot_tsv[1][gene_index].split('_')[0] + if "_" in annot_tsv[1][2]: + gene_annot = annot_tsv[1][gene_index].split("_")[0] else: gene_annot = annot_tsv[1][gene_index] except: - gene_annot = 'Not found by Prokka' + gene_annot = "Not found by Prokka" try: product_annot = annot_tsv[1][product_index] except: - product_annot = 'Not found by Prokka' + product_annot = "Not found by Prokka" else: - gene_annot = 'Not found by Prokka' - product_annot = 'Not found by Prokka' + gene_annot = "Not found by Prokka" + product_annot = "Not found by Prokka" except: - gene_annot = 'Not found by Prokka' - product_annot = 'Not found by Prokka' + gene_annot = "Not found by Prokka" + product_annot = "Not found by Prokka" return gene_annot, product_annot + """ def get_gene_annotation (annotation_file, annotation_dir, logger) : name_file = os.path.basename(annotation_file).split('.') @@ -1079,7 +1666,7 @@ def get_gene_annotation (annotation_file, annotation_dir, logger) : """ -def analize_annotation_files (in_file, logger) : ## N +def analize_annotation_files(in_file, logger): ## N examiner = GFF.GFFExaminer() file_fh = open(in_file) datos = examiner.available_limits(in_file) @@ -1087,24 +1674,33 @@ def analize_annotation_files (in_file, logger) : ## N return True -def get_inferred_allele_number(core_dict, logger): ## N - #This function will look for the highest locus number and it will return a safe high value +def get_inferred_allele_number(core_dict, logger): ## N + # This function will look for the highest locus number and it will return a safe high value # that will be added to the schema database - logger.debug('running get_inferred_allele_number function') + logger.debug("running get_inferred_allele_number function") int_keys = [] for key in core_dict.keys(): int_keys.append(key) max_value = max(int_keys) digit_length = len(str(max_value)) - return True #str 1 ( #'1'+ '0'*digit_length + 2) + return True # str 1 ( #'1'+ '0'*digit_length + 2) # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # # Get ST profile for each samples based on allele calling results # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # -def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_list_files, sample_list_files, logger): - ## logger + +def get_ST_profile( + outputdir, 
+ profile_csv_path, + exact_dict, + inf_dict, + core_gene_list_files, + sample_list_files, + logger, +): + ## logger csv_read = [] ST_profiles_dict = {} @@ -1118,15 +1714,17 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ for line in csvreader: csv_read.append(line) - profile_header = csv_read[0][1:len(core_gene_list_files) + 1] + profile_header = csv_read[0][1 : len(core_gene_list_files) + 1] for ST_index in range(1, len(csv_read)): ST_profiles_dict[csv_read[ST_index][0]] = {} for core_index in range(len(profile_header)): - ST_profiles_dict[csv_read[ST_index][0]][profile_header[core_index]] = csv_read[ST_index][core_index + 1] + ST_profiles_dict[csv_read[ST_index][0]][ + profile_header[core_index] + ] = csv_read[ST_index][core_index + 1] for sample_file in sample_list_files: - sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1]) + sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1]) st_counter = 0 for ST in ST_profiles_dict: @@ -1140,14 +1738,16 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ if core_name in exact_dict[sample_name]: allele_in_sample = exact_dict[sample_name][core_name][2] - if not '_' in allele_in_ST: - if '_' in allele_in_sample: - allele_in_sample = allele_in_sample.split('_')[1] + if not "_" in allele_in_ST: + if "_" in allele_in_sample: + allele_in_sample = allele_in_sample.split("_")[1] if st_counter == 0: if sample_name not in analysis_profiles_dict: analysis_profiles_dict[sample_name] = {} - analysis_profiles_dict[sample_name][core_name] = allele_in_sample + analysis_profiles_dict[sample_name][ + core_name + ] = allele_in_sample if allele_in_sample == allele_in_ST: core_counter += 1 @@ -1165,14 +1765,16 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ allele_in_sample = inf_dict[sample_name][core_name][2] if sample_name not in analysis_profiles_dict: analysis_profiles_dict[sample_name] = {} - analysis_profiles_dict[sample_name][core_name] = allele_in_sample + analysis_profiles_dict[sample_name][ + core_name + ] = allele_in_sample else: if st_counter == 0: if sample_name not in analysis_profiles_dict: analysis_profiles_dict[sample_name] = {} - if allele_in_ST == 'N' and "allele_in_sample" not in locals(): + if allele_in_ST == "N" and "allele_in_sample" not in locals(): core_counter += 1 st_counter += 1 @@ -1199,10 +1801,12 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ if sample_name in analysis_profiles_dict: if len(analysis_profiles_dict[sample_name]) == len(profile_header): new_st_id = str(len(ST_profiles_dict) + 1) - ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name] + ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[ + sample_name + ] inf_ST[new_st_id] = analysis_profile_dict[sample_name] - samples_profiles_dict[sample_name]=new_st_id + "_INF" + samples_profiles_dict[sample_name] = new_st_id + "_INF" if "New" not in count_st: count_st["New"] = {} @@ -1211,24 +1815,24 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ count_st["New"][new_st_id] += 1 else: - samples_profiles_dict[sample_name] = '-' + samples_profiles_dict[sample_name] = "-" if "Unknown" not in count_st: count_st["Unknown"] = 0 count_st["Unknown"] += 1 else: - samples_profiles_dict[sample_name] = '-' + samples_profiles_dict[sample_name] = "-" if "Unknown" not in count_st: count_st["Unknown"] = 0 count_st["Unknown"] += 1 ## Create ST profile results report 
- save_st_profile_results (outputdir, samples_profiles_dict, logger) + save_st_profile_results(outputdir, samples_profiles_dict, logger) ## Obtain interactive piechart - logger.info('Creating interactive ST results piechart') - create_sunburst_plot_st (outputdir, count_st, logger) + logger.info("Creating interactive ST results piechart") + create_sunburst_plot_st(outputdir, count_st, logger) return True, inf_ST @@ -1237,24 +1841,24 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ # Create ST results report # # · * · * · * · * · * · * # -def save_st_profile_results (outputdir, samples_profiles_dict, logger): - header_stprofile = ['Sample Name', 'ST'] +def save_st_profile_results(outputdir, samples_profiles_dict, logger): + header_stprofile = ["Sample Name", "ST"] - if samples_profiles_dict != '': + if samples_profiles_dict != "": ## Saving ST profile to file - logger.info('Saving ST profile information to file..') - stprofile_file = os.path.join(outputdir, 'stprofile.tsv') - with open (stprofile_file , 'w') as st_fh : - st_fh.write('\t'.join(header_stprofile)+ '\n') + logger.info("Saving ST profile information to file..") + stprofile_file = os.path.join(outputdir, "stprofile.tsv") + with open(stprofile_file, "w") as st_fh: + st_fh.write("\t".join(header_stprofile) + "\n") for sample in sorted(samples_profiles_dict): - st_fh.write(sample + '\t' + samples_profiles_dict[sample] + '\n') + st_fh.write(sample + "\t" + samples_profiles_dict[sample] + "\n") return True -def create_sunburst_plot_st (outputdir, count_st, logger): - ### logger? +def create_sunburst_plot_st(outputdir, count_st, logger): + ### logger? counts = [] st_ids = ["ST"] st_labels = ["ST"] @@ -1263,7 +1867,6 @@ def create_sunburst_plot_st (outputdir, count_st, logger): total_samples = 0 for st_type in count_st: - if type(count_st[st_type]) == dict: total_st_type_count = sum(count_st[st_type].values()) else: @@ -1285,17 +1888,19 @@ def create_sunburst_plot_st (outputdir, count_st, logger): counts.insert(0, total_samples) - fig = go.Figure(go.Sunburst( - ids = st_ids, - labels = st_labels, - parents = st_parents, - values = counts, - branchvalues = "total", - )) + fig = go.Figure( + go.Sunburst( + ids=st_ids, + labels=st_labels, + parents=st_parents, + values=counts, + branchvalues="total", + ) + ) - fig.update_layout(margin = dict(t=0, l=0, r=0, b=0)) + fig.update_layout(margin=dict(t=0, l=0, r=0, b=0)) - plotsdir = os.path.join(outputdir, 'plots', 'samples_st.html') + plotsdir = os.path.join(outputdir, "plots", "samples_st.html") fig.write_html(plotsdir) @@ -1306,32 +1911,38 @@ def create_sunburst_plot_st (outputdir, count_st, logger): # Update ST profile file adding new ST found # # · * · * · * · * · * · * · * · * · * · * · # -def update_st_profile (updateprofile, profile_csv_path, outputdir, inf_ST, core_gene_list_files, logger): +def update_st_profile( + updateprofile, profile_csv_path, outputdir, inf_ST, core_gene_list_files, logger +): ## Create a copy of ST profile file if updateprofile = 'new' - if updateprofile == 'new': + if updateprofile == "new": no_updated_profile_csv_path = profile_csv_path - profile_csv_path_name = os.path.basename(no_updated_profile_csv_path).split('.')[0] - profile_csv_path = os.path.join(outputdir, profile_csv_path_name + '_updated' + '.csv') + profile_csv_path_name = os.path.basename(no_updated_profile_csv_path).split( + "." 
+ )[0] + profile_csv_path = os.path.join( + outputdir, profile_csv_path_name + "_updated" + ".csv" + ) shutil.copyfile(no_updated_profile_csv_path, profile_csv_path) - logger.info('Copying ST profile file to update profiles') + logger.info("Copying ST profile file to update profiles") ## Update ST profile file - logger.info('Updating ST profile file adding new INF ST') + logger.info("Updating ST profile file adding new INF ST") - with open (profile_csv_path, 'r') as csvfile: + with open(profile_csv_path, "r") as csvfile: csvreader = csv.reader(csvfile, delimiter="\t") for line in csvreader: - profile_header = line[0][1:len(core_gene_list_files) + 1] + profile_header = line[0][1 : len(core_gene_list_files) + 1] break - with open (profile_csv_path, 'a') as profile_fh: + with open(profile_csv_path, "a") as profile_fh: for ST in inf_ST: locus_ST_list = [] locus_ST_list.append(ST) for locus in profile_header: locus_ST_list.append(inf_ST[ST][locus]) - profile_fh.write ('\t'.join(locus_ST_list)+ '\n') + profile_fh.write("\t".join(locus_ST_list) + "\n") return True @@ -1340,312 +1951,760 @@ def update_st_profile (updateprofile, profile_csv_path, outputdir, inf_ST, core_ # Create allele calling results reports # # · * · * · * · * · * · * · * · * · * · # -def save_allele_call_results (outputdir, full_gene_list, samples_matrix_dict, exact_dict, paralog_dict, inf_dict, plot_dict, matching_genes_dict, list_asm, list_alm, lnf_tpr_dict, snp_dict, match_alignment_dict, protein_dict, prodigal_report, shorter_seq_coverage, longer_seq_coverage, equal_seq_coverage, shorter_blast_seq_coverage, longer_blast_seq_coverage, equal_blast_seq_coverage, logger): - header_matching_alleles_contig = ['Sample Name', 'Contig', 'Core Gene', 'Allele', 'Contig Start', 'Contig Stop', 'Direction', 'Codification'] - header_exact = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Query length', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence'] - header_paralogs = ['Core Gene','Sample Name', 'Gene Annotation', 'Product Annotation', 'Paralog Tag', 'ID %', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence'] - header_inferred = ['Core Gene','Sample Name', 'INF tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Predicted Sequence'] - header_asm = ['Core Gene', 'Sample Name', 'ASM tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Additional info', 'Predicted Sequence'] - header_alm = ['Core Gene', 'Sample Name', 'ALM tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Additional info', 'Predicted Sequence'] - header_plot = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig','Bitscore', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence'] - header_lnf_tpr = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Tag', 'Allele', 'Allele Quality', 'ID %', 'Blast sequence coverage %', 'New sequence coverage %', 'Query 
length', 'Contig length', 'New sequence length', 'Additional info'] - header_snp = ['Core Gene', 'Sample Name', 'Allele', 'Position', 'Mutation Schema/Sample', 'Codon Schema/Sample','Amino acid in Schema/Sample', 'Mutation type','Annotation Schema/Sample'] - header_protein = ['Core Gene','Sample Name', 'Protein in ' , 'Protein sequence'] - header_match_alignment = ['Core Gene','Sample Name','Alignment', 'Sequence'] - header_stprofile = ['Sample Name', 'ST'] +def save_allele_call_results( + outputdir, + full_gene_list, + samples_matrix_dict, + exact_dict, + paralog_dict, + inf_dict, + plot_dict, + matching_genes_dict, + list_asm, + list_alm, + lnf_tpr_dict, + snp_dict, + match_alignment_dict, + protein_dict, + prodigal_report, + shorter_seq_coverage, + longer_seq_coverage, + equal_seq_coverage, + shorter_blast_seq_coverage, + longer_blast_seq_coverage, + equal_blast_seq_coverage, + logger, +): + header_matching_alleles_contig = [ + "Sample Name", + "Contig", + "Core Gene", + "Allele", + "Contig Start", + "Contig Stop", + "Direction", + "Codification", + ] + header_exact = [ + "Core Gene", + "Sample Name", + "Gene Annotation", + "Product Annotation", + "Allele", + "Allele Quality", + "Contig", + "Query length", + "Contig start", + "Contig end", + "Sequence", + "Predicted Sequence", + ] + header_paralogs = [ + "Core Gene", + "Sample Name", + "Gene Annotation", + "Product Annotation", + "Paralog Tag", + "ID %", + "Allele", + "Allele Quality", + "Contig", + "Bitscore", + "Contig start", + "Contig end", + "Sequence", + "Predicted Sequence", + ] + header_inferred = [ + "Core Gene", + "Sample Name", + "INF tag", + "Gene Annotation", + "Product Annotation", + "Allele", + "Allele Quality", + "Contig", + "Bitscore", + "Query length", + "Contig length", + "New sequence length", + "Mismatch", + "gaps", + "Contig start", + "Contig end", + "New sequence", + "Predicted Sequence", + ] + header_asm = [ + "Core Gene", + "Sample Name", + "ASM tag", + "Gene Annotation", + "Product Annotation", + "Allele", + "Allele Quality", + "Contig", + "Bitscore", + "Query length", + "Contig length", + "New sequence length", + "Mismatch", + "gaps", + "Contig start", + "Contig end", + "New sequence", + "Additional info", + "Predicted Sequence", + ] + header_alm = [ + "Core Gene", + "Sample Name", + "ALM tag", + "Gene Annotation", + "Product Annotation", + "Allele", + "Allele Quality", + "Contig", + "Bitscore", + "Query length", + "Contig length", + "New sequence length", + "Mismatch", + "gaps", + "Contig start", + "Contig end", + "New sequence", + "Additional info", + "Predicted Sequence", + ] + header_plot = [ + "Core Gene", + "Sample Name", + "Gene Annotation", + "Product Annotation", + "Allele", + "Allele Quality", + "Contig", + "Bitscore", + "Contig start", + "Contig end", + "Sequence", + "Predicted Sequence", + ] + header_lnf_tpr = [ + "Core Gene", + "Sample Name", + "Gene Annotation", + "Product Annotation", + "Tag", + "Allele", + "Allele Quality", + "ID %", + "Blast sequence coverage %", + "New sequence coverage %", + "Query length", + "Contig length", + "New sequence length", + "Additional info", + ] + header_snp = [ + "Core Gene", + "Sample Name", + "Allele", + "Position", + "Mutation Schema/Sample", + "Codon Schema/Sample", + "Amino acid in Schema/Sample", + "Mutation type", + "Annotation Schema/Sample", + ] + header_protein = ["Core Gene", "Sample Name", "Protein in ", "Protein sequence"] + header_match_alignment = ["Core Gene", "Sample Name", "Alignment", "Sequence"] + header_stprofile = ["Sample Name", "ST"] # 
Añadido header_prodigal_report para report prodigal -# header_prodigal_report = ['Core gene', 'Sample Name', 'Allele', 'Sequence type', 'BLAST start', 'BLAST end', 'Prodigal start', 'Prodigal end', 'BLAST sequence', 'Prodigal sequence'] + # header_prodigal_report = ['Core gene', 'Sample Name', 'Allele', 'Sequence type', 'BLAST start', 'BLAST end', 'Prodigal start', 'Prodigal end', 'BLAST sequence', 'Prodigal sequence'] # Añadido header_newsseq_coverage_report para determinar coverage threshold a imponer -# header_newsseq_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'New sequence length', 'Locus mean', 'Coverage (new sequence/allele)', 'Coverage (new sequence/locus mean)'] + # header_newsseq_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'New sequence length', 'Locus mean', 'Coverage (new sequence/allele)', 'Coverage (new sequence/locus mean)'] # Añadido header_blast_coverage_report para determinar coverage threshold a imponer -# header_blast_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'Blast sequence length', 'Locus mean', 'Coverage (blast sequence/allele)', 'Coverage (blast sequence/locus mean)'] + # header_blast_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'Blast sequence length', 'Locus mean', 'Coverage (blast sequence/allele)', 'Coverage (blast sequence/locus mean)'] ## Saving the result information to file - print ('Saving results to files \n') - result_file = os.path.join ( outputdir, 'result.tsv') - logger.info('Saving result information to file..') - with open (result_file, 'w') as out_fh: - out_fh.write ('Sample Name\t'+'\t'.join( full_gene_list) + '\n') - for key in sorted (samples_matrix_dict): - out_fh.write (key + '\t' + '\t'.join(samples_matrix_dict[key])+ '\n') + print("Saving results to files \n") + result_file = os.path.join(outputdir, "result.tsv") + logger.info("Saving result information to file..") + with open(result_file, "w") as out_fh: + out_fh.write("Sample Name\t" + "\t".join(full_gene_list) + "\n") + for key in sorted(samples_matrix_dict): + out_fh.write(key + "\t" + "\t".join(samples_matrix_dict[key]) + "\n") ## Saving exact matches to file - logger.info('Saving exact matches information to file..') - exact_file = os.path.join(outputdir, 'exact.tsv') - with open (exact_file , 'w') as exact_fh : - exact_fh.write('\t'.join(header_exact)+ '\n') + logger.info("Saving exact matches information to file..") + exact_file = os.path.join(outputdir, "exact.tsv") + with open(exact_file, "w") as exact_fh: + exact_fh.write("\t".join(header_exact) + "\n") for sample in sorted(exact_dict): for core in sorted(exact_dict[sample]): - exact_fh.write(core + '\t' + sample + '\t' + '\t'.join(exact_dict[sample][core]) + '\n') + exact_fh.write( + core + + "\t" + + sample + + "\t" + + "\t".join(exact_dict[sample][core]) + + "\n" + ) ## Saving paralog alleles to file - logger.info('Saving paralog information to file..') - paralog_file = os.path.join(outputdir, 'paralog.tsv') - with open (paralog_file , 'w') as paralog_fh : - paralog_fh.write('\t'.join(header_paralogs) + '\n') - for sample in sorted (paralog_dict) : - for core in sorted (paralog_dict[sample]): - for paralog in paralog_dict[sample][core] : - paralog_fh.write(core + '\t' + sample + '\t' + '\t'.join (paralog) + '\n') + logger.info("Saving paralog information to file..") + paralog_file = os.path.join(outputdir, "paralog.tsv") + with open(paralog_file, "w") as paralog_fh: + paralog_fh.write("\t".join(header_paralogs) + "\n") + for sample in 
sorted(paralog_dict): + for core in sorted(paralog_dict[sample]): + for paralog in paralog_dict[sample][core]: + paralog_fh.write( + core + "\t" + sample + "\t" + "\t".join(paralog) + "\n" + ) ## Saving inferred alleles to file - logger.info('Saving inferred alleles information to file..') - inferred_file = os.path.join(outputdir, 'inferred_alleles.tsv') - with open (inferred_file , 'w') as infer_fh : - infer_fh.write('\t'.join(header_inferred) + '\n') - for core in sorted (inf_dict) : - for sample in sorted (inf_dict[core]) : + logger.info("Saving inferred alleles information to file..") + inferred_file = os.path.join(outputdir, "inferred_alleles.tsv") + with open(inferred_file, "w") as infer_fh: + infer_fh.write("\t".join(header_inferred) + "\n") + for core in sorted(inf_dict): + for sample in sorted(inf_dict[core]): for inferred in inf_dict[core][sample]: # seq_in_inferred_allele = '\t'.join (inf_dict[sample]) - infer_fh.write(core + '\t' + sample + '\t' + inferred + '\t' + '\t'.join(inf_dict[core][sample][inferred]) + '\n') + infer_fh.write( + core + + "\t" + + sample + + "\t" + + inferred + + "\t" + + "\t".join(inf_dict[core][sample][inferred]) + + "\n" + ) ## Saving PLOTs to file - logger.info('Saving PLOT information to file..') - plot_file = os.path.join(outputdir, 'plot.tsv') - with open (plot_file , 'w') as plot_fh : - plot_fh.write('\t'.join(header_plot) + '\n') - for core in sorted (plot_dict) : - for sample in sorted (plot_dict[core]): - plot_fh.write(core + '\t' + sample + '\t' + '\t'.join(plot_dict[core][sample]) + '\n') + logger.info("Saving PLOT information to file..") + plot_file = os.path.join(outputdir, "plot.tsv") + with open(plot_file, "w") as plot_fh: + plot_fh.write("\t".join(header_plot) + "\n") + for core in sorted(plot_dict): + for sample in sorted(plot_dict[core]): + plot_fh.write( + core + + "\t" + + sample + + "\t" + + "\t".join(plot_dict[core][sample]) + + "\n" + ) ## Saving matching contigs to file - logger.info('Saving matching information to file..') - matching_file = os.path.join(outputdir, 'matching_contigs.tsv') - with open (matching_file , 'w') as matching_fh : - matching_fh.write('\t'.join(header_matching_alleles_contig) + '\n') - for samples in sorted (matching_genes_dict) : - for contigs in matching_genes_dict[samples] : - for contig in matching_genes_dict[samples] [contigs]: - matching_alleles = '\t'.join (contig) - matching_fh.write(samples + '\t' + contigs +'\t' + matching_alleles + '\n') + logger.info("Saving matching information to file..") + matching_file = os.path.join(outputdir, "matching_contigs.tsv") + with open(matching_file, "w") as matching_fh: + matching_fh.write("\t".join(header_matching_alleles_contig) + "\n") + for samples in sorted(matching_genes_dict): + for contigs in matching_genes_dict[samples]: + for contig in matching_genes_dict[samples][contigs]: + matching_alleles = "\t".join(contig) + matching_fh.write( + samples + "\t" + contigs + "\t" + matching_alleles + "\n" + ) ## Saving ASMs to file - logger.info('Saving asm information to file..') - asm_file = os.path.join(outputdir, 'asm.tsv') - with open (asm_file , 'w') as asm_fh : - asm_fh.write('\t'.join(header_asm)+ '\n') - for core in list_asm : - for sample in list_asm[core] : + logger.info("Saving asm information to file..") + asm_file = os.path.join(outputdir, "asm.tsv") + with open(asm_file, "w") as asm_fh: + asm_fh.write("\t".join(header_asm) + "\n") + for core in list_asm: + for sample in list_asm[core]: for asm in list_asm[core][sample]: - asm_fh.write(core + '\t' + 
sample + '\t' + asm + '\t' + '\t'.join(list_asm[core][sample][asm]) + '\n') + asm_fh.write( + core + + "\t" + + sample + + "\t" + + asm + + "\t" + + "\t".join(list_asm[core][sample][asm]) + + "\n" + ) ## Saving ALMs to file - logger.info('Saving alm information to file..') - alm_file = os.path.join(outputdir, 'alm.tsv') - with open (alm_file , 'w') as alm_fh : - alm_fh.write('\t'.join(header_alm)+ '\n') - for core in list_alm : - for sample in list_alm[core] : + logger.info("Saving alm information to file..") + alm_file = os.path.join(outputdir, "alm.tsv") + with open(alm_file, "w") as alm_fh: + alm_fh.write("\t".join(header_alm) + "\n") + for core in list_alm: + for sample in list_alm[core]: for alm in list_alm[core][sample]: - alm_fh.write(core + '\t' + sample + '\t' + alm + '\t' + '\t'.join(list_alm[core][sample][alm]) + '\n') + alm_fh.write( + core + + "\t" + + sample + + "\t" + + alm + + "\t" + + "\t".join(list_alm[core][sample][alm]) + + "\n" + ) ## Saving LNFs to file - logger.info('Saving lnf information to file..') - lnf_file = os.path.join(outputdir, 'lnf_tpr.tsv') - with open (lnf_file , 'w') as lnf_fh : - lnf_fh.write('\t'.join(header_lnf_tpr)+ '\n') - for core in lnf_tpr_dict : - for sample in lnf_tpr_dict[core] : - for lnf in lnf_tpr_dict[core][sample] : - lnf_fh.write(core + '\t' + sample + '\t' + '\t'.join(lnf) + '\n') + logger.info("Saving lnf information to file..") + lnf_file = os.path.join(outputdir, "lnf_tpr.tsv") + with open(lnf_file, "w") as lnf_fh: + lnf_fh.write("\t".join(header_lnf_tpr) + "\n") + for core in lnf_tpr_dict: + for sample in lnf_tpr_dict[core]: + for lnf in lnf_tpr_dict[core][sample]: + lnf_fh.write(core + "\t" + sample + "\t" + "\t".join(lnf) + "\n") ## Saving SNPs information to file - logger.info('Saving SNPs information to file..') - snp_file = os.path.join(outputdir, 'snp.tsv') - with open (snp_file , 'w') as snp_fh : - snp_fh.write('\t'.join(header_snp) + '\n') - for core in sorted (snp_dict) : - for sample in sorted (snp_dict[core]): - for allele_id_snp in snp_dict[core][sample] : - for snp in snp_dict[core][sample][allele_id_snp] : - snp_fh.write(core + '\t' + sample + '\t' + allele_id_snp + '\t' + '\t'.join (snp) + '\n') + logger.info("Saving SNPs information to file..") + snp_file = os.path.join(outputdir, "snp.tsv") + with open(snp_file, "w") as snp_fh: + snp_fh.write("\t".join(header_snp) + "\n") + for core in sorted(snp_dict): + for sample in sorted(snp_dict[core]): + for allele_id_snp in snp_dict[core][sample]: + for snp in snp_dict[core][sample][allele_id_snp]: + snp_fh.write( + core + + "\t" + + sample + + "\t" + + allele_id_snp + + "\t" + + "\t".join(snp) + + "\n" + ) ## Saving DNA sequences alignments to file - logger.info('Saving matching alignment information to files..') - alignment_dir = os.path.join(outputdir,'alignments') - if os.path.exists(alignment_dir) : + logger.info("Saving matching alignment information to files..") + alignment_dir = os.path.join(outputdir, "alignments") + if os.path.exists(alignment_dir): shutil.rmtree(alignment_dir) - logger.info('deleting the alignment files from previous execution') + logger.info("deleting the alignment files from previous execution") os.makedirs(alignment_dir) - for core in sorted(match_alignment_dict) : - for sample in sorted (match_alignment_dict[core]) : - match_alignment_file = os.path.join(alignment_dir, str('match_alignment_' + core + '_' + sample + '.txt')) - with open(match_alignment_file, 'w') as match_alignment_fh : - match_alignment_fh.write( 
'\t'.join(header_match_alignment) + '\n') - for match_align in match_alignment_dict[core][sample] : - match_alignment_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(match_align) + '\n') + for core in sorted(match_alignment_dict): + for sample in sorted(match_alignment_dict[core]): + match_alignment_file = os.path.join( + alignment_dir, str("match_alignment_" + core + "_" + sample + ".txt") + ) + with open(match_alignment_file, "w") as match_alignment_fh: + match_alignment_fh.write("\t".join(header_match_alignment) + "\n") + for match_align in match_alignment_dict[core][sample]: + match_alignment_fh.write( + core + "\t" + sample + "\t" + "\t".join(match_align) + "\n" + ) ## Saving protein sequences alignments to file - logger.info('Saving protein information to files..') - protein_dir = os.path.join(outputdir,'proteins') - if os.path.exists(protein_dir) : + logger.info("Saving protein information to files..") + protein_dir = os.path.join(outputdir, "proteins") + if os.path.exists(protein_dir): shutil.rmtree(protein_dir) - logger.info('deleting the proteins files from previous execution') + logger.info("deleting the proteins files from previous execution") os.makedirs(protein_dir) - for core in sorted(protein_dict) : - for sample in sorted (protein_dict[core]) : - protein_file = os.path.join(protein_dir, str('protein_' + core + '_' + sample + '.txt')) - with open(protein_file, 'w') as protein_fh : - protein_fh.write( '\t'.join(header_protein) + '\n') - for protein in protein_dict[core][sample] : - protein_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(protein) + '\n') + for core in sorted(protein_dict): + for sample in sorted(protein_dict[core]): + protein_file = os.path.join( + protein_dir, str("protein_" + core + "_" + sample + ".txt") + ) + with open(protein_file, "w") as protein_fh: + protein_fh.write("\t".join(header_protein) + "\n") + for protein in protein_dict[core][sample]: + protein_fh.write( + core + "\t" + sample + "\t" + "\t".join(protein) + "\n" + ) ## Saving summary information to file - logger.info('Saving summary information to file..') - summary_result = create_summary (samples_matrix_dict, logger) - summary_file = os.path.join( outputdir, 'summary_result.tsv') - with open (summary_file , 'w') as summ_fh: - for line in summary_result : - summ_fh.write(line + '\n') + logger.info("Saving summary information to file..") + summary_result = create_summary(samples_matrix_dict, logger) + summary_file = os.path.join(outputdir, "summary_result.tsv") + with open(summary_file, "w") as summ_fh: + for line in summary_result: + summ_fh.write(line + "\n") ## Modify the result file to remove the PLOT_ string for creating the file to use in the tree diagram -# logger.info('Saving result information for tree diagram') -# tree_diagram_file = os.path.join ( outputdir, 'result_for_tree_diagram.tsv') -# with open (result_file, 'r') as result_fh: -# with open(tree_diagram_file, 'w') as td_fh: -# for line in result_fh: -# tree_line = line.replace('PLOT_','') -# td_fh.write(tree_line) + # logger.info('Saving result information for tree diagram') + # tree_diagram_file = os.path.join ( outputdir, 'result_for_tree_diagram.tsv') + # with open (result_file, 'r') as result_fh: + # with open(tree_diagram_file, 'w') as td_fh: + # for line in result_fh: + # tree_line = line.replace('PLOT_','') + # td_fh.write(tree_line) ########################################################################################### # Guardando report de prodigal. 
Temporal -# prodigal_report_file = os.path.join (outputdir, 'prodigal_report.tsv') + # prodigal_report_file = os.path.join (outputdir, 'prodigal_report.tsv') # saving prodigal predictions to file -# with open (prodigal_report_file, 'w') as out_fh: -# out_fh.write ('\t'.join(header_prodigal_report)+ '\n') -# for prodigal_result in prodigal_report: -# out_fh.write ('\t'.join(prodigal_result)+ '\n') + # with open (prodigal_report_file, 'w') as out_fh: + # out_fh.write ('\t'.join(header_prodigal_report)+ '\n') + # for prodigal_result in prodigal_report: + # out_fh.write ('\t'.join(prodigal_result)+ '\n') # Guardando coverage de new_sseq para estimar el threshold a establecer. Temporal -# newsseq_coverage_file = os.path.join (outputdir, 'newsseq_coverage_report.tsv') + # newsseq_coverage_file = os.path.join (outputdir, 'newsseq_coverage_report.tsv') # saving the coverage information to file -# with open (newsseq_coverage_file, 'w') as out_fh: -# out_fh.write ('\t' + '\t'.join(header_newsseq_coverage_report)+ '\n') -# for coverage in shorter_seq_coverage: -# out_fh.write ('Shorter new sequence' + '\t' + '\t'.join(coverage)+ '\n') -# for coverage in longer_seq_coverage: -# out_fh.write ('Longer new sequence' + '\t' + '\t'.join(coverage)+ '\n') -# for coverage in equal_seq_coverage: -# out_fh.write ('Same length new sequence' + '\t' + '\t'.join(coverage)+ '\n') + # with open (newsseq_coverage_file, 'w') as out_fh: + # out_fh.write ('\t' + '\t'.join(header_newsseq_coverage_report)+ '\n') + # for coverage in shorter_seq_coverage: + # out_fh.write ('Shorter new sequence' + '\t' + '\t'.join(coverage)+ '\n') + # for coverage in longer_seq_coverage: + # out_fh.write ('Longer new sequence' + '\t' + '\t'.join(coverage)+ '\n') + # for coverage in equal_seq_coverage: + # out_fh.write ('Same length new sequence' + '\t' + '\t'.join(coverage)+ '\n') # Guardando coverage de la sseq obtenida tras blast para estimar el threshold a establecer. 
Temporal -# blast_coverage_file = os.path.join (outputdir, 'blast_coverage_report.tsv') + # blast_coverage_file = os.path.join (outputdir, 'blast_coverage_report.tsv') # saving the result information to file -# with open (blast_coverage_file, 'w') as out_fh: -# out_fh.write ('\t' + '\t'.join(header_blast_coverage_report)+ '\n') -# for coverage in shorter_blast_seq_coverage: -# out_fh.write ('Shorter blast sequence' + '\t' + '\t'.join(coverage)+ '\n') -# for coverage in longer_blast_seq_coverage: -# out_fh.write ('Longer blast sequence' + '\t' + '\t'.join(coverage)+ '\n') -# for coverage in equal_blast_seq_coverage: -# out_fh.write ('Same length blast sequence' + '\t' + '\t'.join(coverage)+ '\n') + # with open (blast_coverage_file, 'w') as out_fh: + # out_fh.write ('\t' + '\t'.join(header_blast_coverage_report)+ '\n') + # for coverage in shorter_blast_seq_coverage: + # out_fh.write ('Shorter blast sequence' + '\t' + '\t'.join(coverage)+ '\n') + # for coverage in longer_blast_seq_coverage: + # out_fh.write ('Longer blast sequence' + '\t' + '\t'.join(coverage)+ '\n') + # for coverage in equal_blast_seq_coverage: + # out_fh.write ('Same length blast sequence' + '\t' + '\t'.join(coverage)+ '\n') ########################################################################################### return True - -def save_allele_calling_plots (outputdir, sample_list_files, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error, logger): - +def save_allele_calling_plots( + outputdir, + sample_list_files, + count_exact, + count_inf, + count_asm, + count_alm, + count_lnf, + count_tpr, + count_plot, + count_niph, + count_niphem, + count_error, + logger, +): ## Create result plots directory - plots_dir = os.path.join(outputdir,'plots') + plots_dir = os.path.join(outputdir, "plots") try: os.makedirs(plots_dir) except: - logger.info('Deleting the results plots directory for a previous execution without cleaning up') - shutil.rmtree(os.path.join(outputdir, 'plots')) + logger.info( + "Deleting the results plots directory for a previous execution without cleaning up" + ) + shutil.rmtree(os.path.join(outputdir, "plots")) try: os.makedirs(plots_dir) - logger.info ('Results plots folder %s has been created again', plots_dir) + logger.info("Results plots folder %s has been created again", plots_dir) except: - logger.info('Unable to create again the results plots directory %s', plots_dir) - print('Cannot create Results plots directory on ', plots_dir) + logger.info( + "Unable to create again the results plots directory %s", plots_dir + ) + print("Cannot create Results plots directory on ", plots_dir) exit(0) for sample_file in sample_list_files: - sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1]) + sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1]) ## Obtain interactive piechart - logger.info('Creating interactive results piecharts') - create_sunburst_allele_call (outputdir, sample_name, count_exact[sample_name], count_inf[sample_name], count_asm[sample_name], count_alm[sample_name], count_lnf[sample_name], count_tpr[sample_name], count_plot[sample_name], count_niph[sample_name], count_niphem[sample_name], count_error[sample_name]) + logger.info("Creating interactive results piecharts") + create_sunburst_allele_call( + outputdir, + sample_name, + count_exact[sample_name], + count_inf[sample_name], + count_asm[sample_name], + count_alm[sample_name], + count_lnf[sample_name], + count_tpr[sample_name], + 
count_plot[sample_name], + count_niph[sample_name], + count_niphem[sample_name], + count_error[sample_name], + ) return True - -def create_sunburst_allele_call (outputdir, sample_name, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error): - ### logger - - total_locus = count_exact['total'] + count_inf['total'] + count_asm['total'] + count_alm['total'] + count_lnf['total'] + count_tpr['total'] + count_plot['total'] \ - + count_niph['total'] + count_niphem['total'] + count_error['total'] - - tag_counts = [total_locus, count_exact['total'], count_exact['good_quality'], count_exact['bad_quality'], count_exact['no_start'], count_exact['no_start_stop'], - count_exact['no_stop'], count_exact['multiple_stop'], count_inf['total'], count_inf['good_quality'], count_inf['bad_quality'], count_inf['no_start'], - count_inf['no_start_stop'], count_inf['no_stop'], count_inf['multiple_stop'], count_asm['total'], count_asm['insertion'], count_asm['deletion'], - count_asm['substitution'], count_alm['total'], count_alm['insertion'], count_alm['deletion'], count_alm['substitution'], count_plot['total'], - count_niph['total'], count_niph['good_quality'], count_niph['bad_quality'], count_niph['no_start'], count_niph['no_start_stop'], count_niph['no_stop'], - count_niph['multiple_stop'], count_niphem['total'], count_niphem['good_quality'], count_niphem['bad_quality'], count_niphem['no_start'], - count_niphem['no_start_stop'], count_niphem['no_stop'], count_niphem['multiple_stop'], count_lnf['total'], count_lnf['not_found'], count_lnf['low_id'], - count_lnf['low_coverage'], count_tpr['total'], count_tpr['good_quality'], count_tpr['bad_quality'], count_tpr['no_start'], count_tpr['no_start_stop'], - count_tpr['no_stop'], count_tpr['multiple_stop'], count_error['total'], count_error['good_quality'], count_error['bad_quality'], count_error['no_start'], - count_error['no_start_stop'], count_error['no_stop'], count_error['multiple_stop']] - - fig=go.Figure(go.Sunburst( - ids=[ - sample_name, "Exact Match", "Good Quality - Exact Match", "Bad Quality - Exact Match", - "No start - Bad Quality - Exact Match", "No start-stop - Bad Quality - Exact Match", - "No stop - Bad Quality - Exact Match", "Multiple stop - Bad Quality - Exact Match", - "INF", "Good Quality - INF", "Bad Quality - INF", "No start - Bad Quality - INF", - "No start-stop - Bad Quality - INF", "No stop - Bad Quality - INF", "Multiple stop - Bad Quality - INF", - "ASM", "Insertion - ASM", "Deletion - ASM", "Substitution - ASM", "ALM", "Insertion - ALM", - "Deletion - ALM", "Substitution - ALM", "PLOT", "NIPH", "Good Quality - NIPH", - "Bad Quality - NIPH", "No start - Bad Quality - NIPH", "No start-stop - Bad Quality - NIPH", - "No stop - Bad Quality - NIPH", "Multiple stop - Bad Quality - NIPH", "NIPHEM", - "Good Quality - NIPHEM", "Bad Quality - NIPHEM", "No start - Bad Quality - NIPHEM", - "No start-stop - Bad Quality - NIPHEM", "No stop - Bad Quality - NIPHEM", - "Multiple stop - Bad Quality - NIPHEM", "LNF", "Not found", - "Low ID", "Low coverage", "TPR", "Good Quality - TPR", "Bad Quality - TPR", - "No start - Bad Quality - TPR", "No start-stop - Bad Quality - TPR", "No stop - Bad Quality - TPR", - "Multiple stop - Bad Quality - TPR", "Error", "Good Quality - Error", "Bad Quality - Error", - "No start - Bad Quality - Error", "No start-stop - Bad Quality - Error", - "No stop - Bad Quality - Error", "Multiple stop - Bad Quality - Error" - ], - labels= [ - sample_name, "Exact
Match", "Good
Quality", "Bad
Quality", - "No
start", "No
start-stop", "No
stop", "Multiple
stop", - "INF", "Good
Quality", "Bad
Quality", "No
start", - "No
start-stop", "No
stop", "Multiple
stop", "ASM", "Insertion", - "Deletion", "Substitution", "ALM", "Insertion", "Deletion", - "Substitution", "PLOT", "NIPH", "Good
Quality", "Bad
Quality", - "No
start", "No
start-stop", "No
stop", "Multiple
stop", - "NIPHEM", "Good
Quality", "Bad
Quality", "No
start", - "No
start-stop", "No
stop", "Multiple
stop", "LNF", "Not
found", - "Low
ID", "Low
coverage", "TPR", "Good
Quality", "Bad
Quality", - "No
start", "No
start-stop", "No
stop", "Multiple
stop", - "Error", "Good
Quality", "Bad
Quality","No
start", - "No
start-stop", "No
stop", "Multiple
stop" - ], - parents=[ - "", sample_name, "Exact Match", "Exact Match", "Bad Quality - Exact Match", - "Bad Quality - Exact Match", "Bad Quality - Exact Match", "Bad Quality - Exact Match", - sample_name, "INF", "INF", "Bad Quality - INF", "Bad Quality - INF", "Bad Quality - INF", - "Bad Quality - INF", sample_name, "ASM", "ASM", "ASM", sample_name, "ALM", "ALM", "ALM", sample_name, - sample_name, "NIPH", "NIPH", "Bad Quality - NIPH", "Bad Quality - NIPH", "Bad Quality - NIPH", - "Bad Quality - NIPH", sample_name, "NIPHEM", "NIPHEM", "Bad Quality - NIPHEM", - "Bad Quality - NIPHEM", "Bad Quality - NIPHEM", "Bad Quality - NIPHEM", sample_name, "LNF", - "LNF", "LNF", sample_name, "TPR", "TPR", "Bad Quality - TPR", "Bad Quality - TPR", - "Bad Quality - TPR", "Bad Quality - TPR", sample_name, "Error", "Error", "Bad Quality - Error", - "Bad Quality - Error", "Bad Quality - Error", "Bad Quality - Error" - ], - values=tag_counts, - branchvalues="total", - )) - - fig.update_layout(margin = dict(t=0, l=0, r=0, b=0)) - - plotsdir = os.path.join(outputdir, 'plots', sample_name + '.html') +def create_sunburst_allele_call( + outputdir, + sample_name, + count_exact, + count_inf, + count_asm, + count_alm, + count_lnf, + count_tpr, + count_plot, + count_niph, + count_niphem, + count_error, +): + ### logger + + total_locus = ( + count_exact["total"] + + count_inf["total"] + + count_asm["total"] + + count_alm["total"] + + count_lnf["total"] + + count_tpr["total"] + + count_plot["total"] + + count_niph["total"] + + count_niphem["total"] + + count_error["total"] + ) + + tag_counts = [ + total_locus, + count_exact["total"], + count_exact["good_quality"], + count_exact["bad_quality"], + count_exact["no_start"], + count_exact["no_start_stop"], + count_exact["no_stop"], + count_exact["multiple_stop"], + count_inf["total"], + count_inf["good_quality"], + count_inf["bad_quality"], + count_inf["no_start"], + count_inf["no_start_stop"], + count_inf["no_stop"], + count_inf["multiple_stop"], + count_asm["total"], + count_asm["insertion"], + count_asm["deletion"], + count_asm["substitution"], + count_alm["total"], + count_alm["insertion"], + count_alm["deletion"], + count_alm["substitution"], + count_plot["total"], + count_niph["total"], + count_niph["good_quality"], + count_niph["bad_quality"], + count_niph["no_start"], + count_niph["no_start_stop"], + count_niph["no_stop"], + count_niph["multiple_stop"], + count_niphem["total"], + count_niphem["good_quality"], + count_niphem["bad_quality"], + count_niphem["no_start"], + count_niphem["no_start_stop"], + count_niphem["no_stop"], + count_niphem["multiple_stop"], + count_lnf["total"], + count_lnf["not_found"], + count_lnf["low_id"], + count_lnf["low_coverage"], + count_tpr["total"], + count_tpr["good_quality"], + count_tpr["bad_quality"], + count_tpr["no_start"], + count_tpr["no_start_stop"], + count_tpr["no_stop"], + count_tpr["multiple_stop"], + count_error["total"], + count_error["good_quality"], + count_error["bad_quality"], + count_error["no_start"], + count_error["no_start_stop"], + count_error["no_stop"], + count_error["multiple_stop"], + ] + + fig = go.Figure( + go.Sunburst( + ids=[ + sample_name, + "Exact Match", + "Good Quality - Exact Match", + "Bad Quality - Exact Match", + "No start - Bad Quality - Exact Match", + "No start-stop - Bad Quality - Exact Match", + "No stop - Bad Quality - Exact Match", + "Multiple stop - Bad Quality - Exact Match", + "INF", + "Good Quality - INF", + "Bad Quality - INF", + "No start - Bad Quality - INF", + "No start-stop - Bad 
Quality - INF", + "No stop - Bad Quality - INF", + "Multiple stop - Bad Quality - INF", + "ASM", + "Insertion - ASM", + "Deletion - ASM", + "Substitution - ASM", + "ALM", + "Insertion - ALM", + "Deletion - ALM", + "Substitution - ALM", + "PLOT", + "NIPH", + "Good Quality - NIPH", + "Bad Quality - NIPH", + "No start - Bad Quality - NIPH", + "No start-stop - Bad Quality - NIPH", + "No stop - Bad Quality - NIPH", + "Multiple stop - Bad Quality - NIPH", + "NIPHEM", + "Good Quality - NIPHEM", + "Bad Quality - NIPHEM", + "No start - Bad Quality - NIPHEM", + "No start-stop - Bad Quality - NIPHEM", + "No stop - Bad Quality - NIPHEM", + "Multiple stop - Bad Quality - NIPHEM", + "LNF", + "Not found", + "Low ID", + "Low coverage", + "TPR", + "Good Quality - TPR", + "Bad Quality - TPR", + "No start - Bad Quality - TPR", + "No start-stop - Bad Quality - TPR", + "No stop - Bad Quality - TPR", + "Multiple stop - Bad Quality - TPR", + "Error", + "Good Quality - Error", + "Bad Quality - Error", + "No start - Bad Quality - Error", + "No start-stop - Bad Quality - Error", + "No stop - Bad Quality - Error", + "Multiple stop - Bad Quality - Error", + ], + labels=[ + sample_name, + "Exact
Match", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + "INF", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + "ASM", + "Insertion", + "Deletion", + "Substitution", + "ALM", + "Insertion", + "Deletion", + "Substitution", + "PLOT", + "NIPH", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + "NIPHEM", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + "LNF", + "Not
found", + "Low
ID", + "Low
coverage", + "TPR", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + "Error", + "Good
Quality", + "Bad
Quality", + "No
start", + "No
start-stop", + "No
stop", + "Multiple
stop", + ], + parents=[ + "", + sample_name, + "Exact Match", + "Exact Match", + "Bad Quality - Exact Match", + "Bad Quality - Exact Match", + "Bad Quality - Exact Match", + "Bad Quality - Exact Match", + sample_name, + "INF", + "INF", + "Bad Quality - INF", + "Bad Quality - INF", + "Bad Quality - INF", + "Bad Quality - INF", + sample_name, + "ASM", + "ASM", + "ASM", + sample_name, + "ALM", + "ALM", + "ALM", + sample_name, + sample_name, + "NIPH", + "NIPH", + "Bad Quality - NIPH", + "Bad Quality - NIPH", + "Bad Quality - NIPH", + "Bad Quality - NIPH", + sample_name, + "NIPHEM", + "NIPHEM", + "Bad Quality - NIPHEM", + "Bad Quality - NIPHEM", + "Bad Quality - NIPHEM", + "Bad Quality - NIPHEM", + sample_name, + "LNF", + "LNF", + "LNF", + sample_name, + "TPR", + "TPR", + "Bad Quality - TPR", + "Bad Quality - TPR", + "Bad Quality - TPR", + "Bad Quality - TPR", + sample_name, + "Error", + "Error", + "Bad Quality - Error", + "Bad Quality - Error", + "Bad Quality - Error", + "Bad Quality - Error", + ], + values=tag_counts, + branchvalues="total", + ) + ) + + fig.update_layout(margin=dict(t=0, l=0, r=0, b=0)) + + plotsdir = os.path.join(outputdir, "plots", sample_name + ".html") fig.write_html(plotsdir) @@ -1656,11 +2715,19 @@ def create_sunburst_allele_call (outputdir, sample_name, count_exact, count_inf, # Update core genes schema adding new inferred alleles found for each locus in allele calling analysis # # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * # -def update_schema (updateschema, schemadir, outputdir, core_gene_list_files, inferred_alleles_dict, alleles_in_locus_dict, logger): +def update_schema( + updateschema, + schemadir, + outputdir, + core_gene_list_files, + inferred_alleles_dict, + alleles_in_locus_dict, + logger, +): if len(inferred_alleles_dict) > 0: ## Create a copy of core genes schema if updateschema = 'new' / 'New' - if updateschema == 'new': + if updateschema == "new": no_updated_schemadir = schemadir ##schemadir_name = os.path.dirname(no_updated_schemadir) ---> se puede usar si guardo finalmente el nuevo esquema en el mismo directorio que el antiguo esquema, pero para ello debo verificar si termina o no el path en / para eliminarlo o no del path antes de hacer el dirname schemadir_name = no_updated_schemadir.split("/") @@ -1669,52 +2736,82 @@ def update_schema (updateschema, schemadir, outputdir, core_gene_list_files, inf else: schemadir_name = schemadir_name[-1] - schemadir = os.path.join(outputdir, schemadir_name + '_updated') + schemadir = os.path.join(outputdir, schemadir_name + "_updated") try: shutil.copytree(no_updated_schemadir, schemadir) - logger.info ('Schema copy %s has been created to update schema', schemadir) + logger.info( + "Schema copy %s has been created to update schema", schemadir + ) except: - logger.info('Deleting preexisting directory') + logger.info("Deleting preexisting directory") shutil.rmtree(schemadir) try: shutil.copytree(no_updated_schemadir, schemadir) - logger.info ('Schema copy %s has been created to update schema', schemadir) + logger.info( + "Schema copy %s has been created to update schema", schemadir + ) except: - logger.info('Unable to create schema copy %s', schemadir) - print('Cannot create schema copy on ', schemadir) + logger.info("Unable to create schema copy %s", schemadir) + print("Cannot create schema copy on ", schemadir) exit(0) ## Get INF alleles for each core gene and update each locus fasta file for core_file in core_gene_list_files: - core_name = 
 
+
 """
 def update_schema (updateschema, schemadir, storedir, core_gene_list_files, inferred_alleles_dict, alleles_in_locus_dict, logger):
@@ -1751,32 +2848,67 @@ def update_schema (updateschema, schemadir, storedir, core_gene_list_files, infe
 
 # Allele calling analysis to find each core gene in schema and its variants in samples #
 # · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in_locus_dict, contigs_in_sample_dict, query_directory, reference_alleles_directory, blast_db_directory, prodigal_directory, blast_results_seq_directory, blast_results_db_directory, inputdir, outputdir, cpus, percentlength, coverage, evalue, perc_identity_ref, perc_identity_loc, reward, penalty, gapopen, gapextend, max_target_seqs, max_hsps, num_threads, flankingnts, schema_variability, schema_statistics, schema_quality, annotation_core_dict, profile_csv_path, logger ):
-    prodigal_report = [] # TEMPORAL. prodigal_report to check the sequences obtained with prodigal vs blast and the sstart and send positions
+def allele_call_nucleotides(
+    core_gene_list_files,
+    sample_list_files,
+    alleles_in_locus_dict,
+    contigs_in_sample_dict,
+    query_directory,
+    reference_alleles_directory,
+    blast_db_directory,
+    prodigal_directory,
+    blast_results_seq_directory,
+    blast_results_db_directory,
+    inputdir,
+    outputdir,
+    cpus,
+    percentlength,
+    coverage,
+    evalue,
+    perc_identity_ref,
+    perc_identity_loc,
+    reward,
+    penalty,
+    gapopen,
+    gapextend,
+    max_target_seqs,
+    max_hsps,
+    num_threads,
+    flankingnts,
+    schema_variability,
+    schema_statistics,
+    schema_quality,
+    annotation_core_dict,
+    profile_csv_path,
+    logger,
+):
+    prodigal_report = (
+        []
+    )  # TEMPORAL. prodigal_report to check the sequences obtained with prodigal vs blast and the sstart and send positions
 
     # lists added to compute the mean coverage of new_sseq relative to the allele, to set a minimum coverage below which to classify as LNF
-    shorter_seq_coverage = [] # TEMPORAL
-    longer_seq_coverage = [] # TEMPORAL
-    equal_seq_coverage = [] # TEMPORAL
+    shorter_seq_coverage = []  # TEMPORAL
+    longer_seq_coverage = []  # TEMPORAL
+    equal_seq_coverage = []  # TEMPORAL
 
     # lists added to compute the mean coverage of sseq relative to the allele after blast, to set a minimum coverage below which to classify as LNF
-    shorter_blast_seq_coverage = [] # TEMPORAL
-    longer_blast_seq_coverage = [] # TEMPORAL
-    equal_blast_seq_coverage = [] # TEMPORAL
+    shorter_blast_seq_coverage = []  # TEMPORAL
+    longer_blast_seq_coverage = []  # TEMPORAL
+    equal_blast_seq_coverage = []  # TEMPORAL
 
     full_gene_list = []
-    samples_matrix_dict = {} # to keep allele number
-    matching_genes_dict = {} # to keep start and stop positions
-    exact_dict = {} # c/m: to keep exact matches found for each sample
-    inferred_alleles_dict = {} # to keep track of the new inferred alleles
-    inf_dict = {} # to keep inferred alleles found for each sample
-    paralog_dict = {} # to keep paralogs found for each sample
-    asm_dict = {} # c/m: to keep track of asm
-    alm_dict = {} # c/m: to keep track of alm
-    list_asm = {} # c/m: to keep asm found for each sample
-    list_alm = {} # c/m: to keep alm found for each sample
-    lnf_tpr_dict = {} # c/m: to keep locus not found for each sample
-    plot_dict = {} # c/m: to keep plots for each sample
-    snp_dict = {} # c/m: to keep snp information for each sample
+    samples_matrix_dict = {}  # to keep allele number
+    matching_genes_dict = {}  # to keep start and stop positions
+    exact_dict = {}  # c/m: to keep exact matches found for each sample
+    inferred_alleles_dict = {}  # to keep track of the new inferred alleles
+    inf_dict = {}  # to keep inferred alleles found for each sample
+    paralog_dict = {}  # to keep paralogs found for each sample
+    asm_dict = {}  # c/m: to keep track of asm
+    alm_dict = {}  # c/m: to keep track of alm
+    list_asm = {}  # c/m: to keep asm found for each sample
+    list_alm = {}  # c/m: to keep alm found for each sample
+    lnf_tpr_dict = {}  # c/m: to keep locus not found for each sample
+    plot_dict = {}  # c/m: to keep plots for each sample
+    snp_dict = {}  # c/m: to keep snp information for each sample
 
     protein_dict = {}
     match_alignment_dict = {}
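Before the per-locus loop in the next hunk, it may help to see the tag cascade that the two threshold pairs drive: length_thresholds() is called once with percentlength (the INF window) and once with coverage (the LNF/ASM/ALM window). A schematic distillation of that branch structure, illustrative only and not a function used by the code:

    def classify_by_length(new_len, min_len, max_len, min_cov, max_cov):
        # min_len/max_len come from percentlength; min_cov/max_cov from coverage
        if min_len <= new_len <= max_len:
            return "INF"  # new inferred allele, within schema length variability
        if min_cov <= new_len < min_len:
            return "ASM"  # shorter than expected, but coverage still acceptable
        if max_len < new_len <= max_cov:
            return "ALM"  # longer than expected, but coverage still acceptable
        return "TPR"      # outside the coverage window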
@@ -1794,71 +2926,138 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
 
     blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
 
-    print('Allele calling starts')
-    pbar = ProgressBar ()
-
+    print("Allele calling starts")
+    pbar = ProgressBar()
 
     ## # # # # # # # # # # # # # # # # # # # # # # # # ##
     ## Processing the search for each schema core gene  ##
     ## # # # # # # # # # # # # # # # # # # # # # # # # ##
-    for core_file in pbar(core_gene_list_files) :
-        core_name = os.path.basename(core_file).split('.')[0]
+    for core_file in pbar(core_gene_list_files):
+        core_name = os.path.basename(core_file).split(".")[0]
         full_gene_list.append(core_name)
-        logger.info('Processing core gene file %s ', core_file)
+        logger.info("Processing core gene file %s ", core_file)
 
         # Get path to this locus fasta file
-        locus_alleles_path = os.path.join(query_directory, str(core_name + '.fasta'))
+        locus_alleles_path = os.path.join(query_directory, str(core_name + ".fasta"))
 
         # Get path to reference allele fasta file for this locus
-        core_reference_allele_path = os.path.join(reference_alleles_directory, core_name + '.fasta')
+        core_reference_allele_path = os.path.join(
+            reference_alleles_directory, core_name + ".fasta"
+        )
 
         # Get length thresholds for INF, ASM and ALM classification
-        max_length_threshold, min_length_threshold = length_thresholds(core_name, schema_statistics, percentlength)
+        max_length_threshold, min_length_threshold = length_thresholds(
+            core_name, schema_statistics, percentlength
+        )
 
         # Get length thresholds for LNF, ASM and ALM classification
-        max_coverage_threshold, min_coverage_threshold = length_thresholds(core_name, schema_statistics, coverage)
+        max_coverage_threshold, min_coverage_threshold = length_thresholds(
+            core_name, schema_statistics, coverage
+        )
 
         ## # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ##
         ## Processing the search for each schema core gene in each sample  ##
         ## # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ##
         for sample_file in sample_list_files:
-            logger.info('Processing sample file %s ', sample_file)
+            logger.info("Processing sample file %s ", sample_file)
 
-            sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1])
+            sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1])
 
             # (tag counts for the plots)
             if sample_name not in count_exact:
-                count_exact[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+                count_exact[sample_name] = {
+                    "good_quality": 0,
+                    "bad_quality": 0,
+                    "no_start": 0,
+                    "no_start_stop": 0,
+                    "no_stop": 0,
+                    "multiple_stop": 0,
+                    "total": 0,
+                }
             if sample_name not in count_inf:
-                count_inf[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+                count_inf[sample_name] = {
+                    "good_quality": 0,
+                    "bad_quality": 0,
+                    "no_start": 0,
+                    "no_start_stop": 0,
+                    "no_stop": 0,
+                    "multiple_stop": 0,
+                    "total": 0,
+                }
             if sample_name not in count_asm:
-                count_asm[sample_name] = {"insertion" : 0, "deletion" : 0, "substitution" : 0, "total" : 0}
+                count_asm[sample_name] = {
+                    "insertion": 0,
+                    "deletion": 0,
+                    "substitution": 0,
+                    "total": 0,
+                }
             if sample_name not in count_alm:
-                count_alm[sample_name] = {"insertion" : 0, "deletion" : 0, "substitution" : 0, "total" : 0}
+                count_alm[sample_name] = {
+                    "insertion": 0,
+                    "deletion": 0,
+                    "substitution": 0,
+                    "total": 0,
+                }
             if sample_name not in count_lnf:
-                count_lnf[sample_name] = {"not_found" : 0, "low_id" : 0, "low_coverage" : 0, "total" : 0}
+                count_lnf[sample_name] = {
+                    "not_found": 0,
+                    "low_id": 0,
+                    "low_coverage": 0,
+                    "total": 0,
+
} if sample_name not in count_tpr: - count_tpr[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0} + count_tpr[sample_name] = { + "good_quality": 0, + "bad_quality": 0, + "no_start": 0, + "no_start_stop": 0, + "no_stop": 0, + "multiple_stop": 0, + "total": 0, + } if sample_name not in count_plot: - count_plot[sample_name] = {"total" : 0} + count_plot[sample_name] = {"total": 0} if sample_name not in count_niph: - count_niph[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0} + count_niph[sample_name] = { + "good_quality": 0, + "bad_quality": 0, + "no_start": 0, + "no_start_stop": 0, + "no_stop": 0, + "multiple_stop": 0, + "total": 0, + } if sample_name not in count_niphem: - count_niphem[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0} + count_niphem[sample_name] = { + "good_quality": 0, + "bad_quality": 0, + "no_start": 0, + "no_start_stop": 0, + "no_stop": 0, + "multiple_stop": 0, + "total": 0, + } if sample_name not in count_error: - count_error[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0} - + count_error[sample_name] = { + "good_quality": 0, + "bad_quality": 0, + "no_start": 0, + "no_start_stop": 0, + "no_stop": 0, + "multiple_stop": 0, + "total": 0, + } # Initialize the sample list to add the number of alleles and the start, stop positions if not sample_name in samples_matrix_dict: @@ -1872,7 +3071,20 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # Sample contigs VS reference allele(s) BLAST for locus detection in sample # # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # - cline = NcbiblastnCommandline(db=blast_db_name, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path) + cline = NcbiblastnCommandline( + db=blast_db_name, + evalue=evalue, + perc_identity=perc_identity_ref, + reward=reward, + penalty=penalty, + gapopen=gapopen, + gapextend=gapextend, + outfmt=blast_parameters, + max_target_seqs=max_target_seqs, + max_hsps=max_hsps, + num_threads=num_threads, + query=core_reference_allele_path, + ) out, err = cline() out_lines = out.splitlines() @@ -1881,49 +3093,113 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # ······························································ # # LNF if there are no BLAST results for this gene in this sample # # ······························································ # - if len (out_lines) == 0: - + if len(out_lines) == 0: # Trying to get the allele number to avoid that a bad quality assembly impact on the tree diagram - cline = NcbiblastnCommandline(db=blast_db_name, evalue=evalue, perc_identity = 70, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=1, max_hsps=1, num_threads=1, query=core_reference_allele_path) + cline = NcbiblastnCommandline( + db=blast_db_name, + evalue=evalue, + perc_identity=70, + reward=reward, + penalty=penalty, + gapopen=gapopen, + gapextend=gapextend, + outfmt=blast_parameters, + max_target_seqs=1, 
+ max_hsps=1, + num_threads=1, + query=core_reference_allele_path, + ) out, err = cline() out_lines = out.splitlines() - if len (out_lines) > 0 : - - for line in out_lines : - values = line.split('\t') - if float(values[8]) > bigger_bitscore: - qseqid , sseqid , pident , qlen , s_length , mismatch , r_gapopen , r_evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq = values + if len(out_lines) > 0: + for line in out_lines: + values = line.split("\t") + if float(values[8]) > bigger_bitscore: + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values bigger_bitscore = float(bitscore) # Keep LNF info - lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, '-', '-', perc_identity_ref, '-', schema_quality, annotation_core_dict, count_lnf, logger) + lnf_tpr_tag( + core_name, + sample_name, + alleles_in_locus_dict, + samples_matrix_dict, + lnf_tpr_dict, + schema_statistics, + locus_alleles_path, + qseqid, + pident, + "-", + "-", + perc_identity_ref, + "-", + schema_quality, + annotation_core_dict, + count_lnf, + logger, + ) else: # Keep LNF info - lnf_tpr_tag(core_name, sample_name, '-', samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, '-', '-', '-', '-', '-', '-', schema_quality, annotation_core_dict, count_lnf, logger) + lnf_tpr_tag( + core_name, + sample_name, + "-", + samples_matrix_dict, + lnf_tpr_dict, + schema_statistics, + locus_alleles_path, + "-", + "-", + "-", + "-", + "-", + "-", + schema_quality, + annotation_core_dict, + count_lnf, + logger, + ) continue ## Continue classification process if the core gene has been detected in sample after BLAST search - if len (out_lines) > 0: - + if len(out_lines) > 0: # Parse contigs for this sample - #contig_file = os.path.join(inputdir, sample_name + ".fasta") ## parse - #records = list(SeqIO.parse(contig_file, "fasta")) ## parse + # contig_file = os.path.join(inputdir, sample_name + ".fasta") ## parse + # records = list(SeqIO.parse(contig_file, "fasta")) ## parse ## Keep BLAST results after locus detection in sample using reference allele # Path to BLAST results fasta file - path_to_blast_seq = os.path.join(blast_results_seq_directory, sample_name, core_name + "_blast.fasta") + path_to_blast_seq = os.path.join( + blast_results_seq_directory, sample_name, core_name + "_blast.fasta" + ) - with open (path_to_blast_seq, 'w') as outblast_fh: + with open(path_to_blast_seq, "w") as outblast_fh: seq_number = 1 - for line in out_lines : - values = line.split('\t') + for line in out_lines: + values = line.split("\t") qseqid = values[0] if values[1] not in contigs_in_sample_dict[sample_name]: - sseqid = '|'.join(values[1].split('|')[1:-1]) + sseqid = "|".join(values[1].split("|")[1:-1]) else: sseqid = values[1] sstart = values[9] @@ -1932,10 +3208,10 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # Get flanked BLAST sequences from contig for correct allele tagging accession_sequence = contigs_in_sample_dict[sample_name][sseqid] - #for record in records: ## parse - #if record.id == sseqid : ## parse - #break ## parse - #accession_sequence = str(record.seq) ## parse + # for record in records: ## parse + # if record.id == sseqid : ## parse + # break ## parse + # accession_sequence = str(record.seq) ## parse if int(send) > int(sstart): max_index = int(send) @@ -1946,94 +3222,188 @@ def 
allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in if (flankingnts + 1) <= min_index: if flankingnts <= (len(accession_sequence) - max_index): - flanked_sseq = accession_sequence[ min_index -1 -flankingnts : max_index + flankingnts ] + flanked_sseq = accession_sequence[ + min_index + - 1 + - flankingnts : max_index + + flankingnts + ] else: - flanked_sseq = accession_sequence[ min_index -1 -flankingnts : ] + flanked_sseq = accession_sequence[ + min_index - 1 - flankingnts : + ] else: - flanked_sseq = accession_sequence[ : max_index + flankingnts ] - - seq_id = str(seq_number) + '_' + sseqid - outblast_fh.write('>' + seq_id + ' # ' + ' # '.join(values[0:13]) + '\n' + flanked_sseq + '\n' + '\n' ) + flanked_sseq = accession_sequence[: max_index + flankingnts] + + seq_id = str(seq_number) + "_" + sseqid + outblast_fh.write( + ">" + + seq_id + + " # " + + " # ".join(values[0:13]) + + "\n" + + flanked_sseq + + "\n" + + "\n" + ) seq_number += 1 ## Create local BLAST database for BLAST results after locus detection in sample using reference allele db_name = os.path.join(blast_results_db_directory, sample_name) - if not create_blastdb(path_to_blast_seq, db_name, 'nucl', logger): - print('Error when creating the blastdb for blast results file for locus %s at sample %s. Check log file for more information. \n ', core_name, sample_name) + if not create_blastdb(path_to_blast_seq, db_name, "nucl", logger): + print( + "Error when creating the blastdb for blast results file for locus %s at sample %s. Check log file for more information. \n ", + core_name, + sample_name, + ) return False # Path to local BLAST database for BLAST results after locus detection in sample using reference allele - locus_blast_db_name = os.path.join(blast_results_db_directory, sample_name, os.path.basename(core_name) + '_blast', os.path.basename(core_name) + '_blast') - + locus_blast_db_name = os.path.join( + blast_results_db_directory, + sample_name, + os.path.basename(core_name) + "_blast", + os.path.basename(core_name) + "_blast", + ) # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # # BLAST result sequences VS ALL alleles in locus BLAST for allele identification detection in sample # # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * # - cline = NcbiblastnCommandline(db=locus_blast_db_name, evalue=evalue, perc_identity=perc_identity_loc, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt = blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=locus_alleles_path) + cline = NcbiblastnCommandline( + db=locus_blast_db_name, + evalue=evalue, + perc_identity=perc_identity_loc, + reward=reward, + penalty=penalty, + gapopen=gapopen, + gapextend=gapextend, + outfmt=blast_parameters, + max_target_seqs=max_target_seqs, + max_hsps=max_hsps, + num_threads=num_threads, + query=locus_alleles_path, + ) out, err = cline() out_lines = out.splitlines() - allele_found = {} # To keep filtered BLAST results + allele_found = {} # To keep filtered BLAST results ## Check if there is any BLAST result with ID = 100 ## for line in out_lines: - - values = line.split('\t') + values = line.split("\t") pident = values[2] if float(pident) == 100: - - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + 
r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values # Parse core gene fasta file to get matching allele sequence and length - #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse - #for allele in alleles_in_locus : ## parse - #if allele.id == qseqid : ## parse - #break ## comentar parse - #matching_allele_seq = str(allele.seq) ## parse - #matching_allele_length = len(matching_allele_seq) ## parse + # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse + # for allele in alleles_in_locus : ## parse + # if allele.id == qseqid : ## parse + # break ## comentar parse + # matching_allele_seq = str(allele.seq) ## parse + # matching_allele_length = len(matching_allele_seq) ## parse matching_allele_seq = alleles_in_locus_dict[core_name][qseqid] matching_allele_length = len(matching_allele_seq) # Keep BLAST results with ID = 100 and same length as matching allele if int(s_length) == matching_allele_length: - #get_blast_results (values, records, allele_found, logger) - get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger) + # get_blast_results (values, records, allele_found, logger) + get_blast_results( + sample_name, + values, + contigs_in_sample_dict, + allele_found, + logger, + ) # ·································································································································· # # NIPHEM (paralog) if there are multiple BLAST results with ID = 100 and same length as matching allele for this gene in this sample # # ·································································································································· # if len(allele_found) > 1: - # Keep NIPHEM info - paralog_exact_tag(sample_name, core_name, 'NIPHEM', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niphem, logger) + paralog_exact_tag( + sample_name, + core_name, + "NIPHEM", + schema_quality, + matching_genes_dict, + samples_matrix_dict, + allele_found, + paralog_dict, + prodigal_report, + prodigal_directory, + blast_parameters, + annotation_core_dict, + count_niphem, + logger, + ) continue ## Check for possible paralogs with ID < 100 if there is only one BLAST result with ID = 100 and same length as matching allele - elif len(allele_found) == 1 : + elif len(allele_found) == 1: + for line in out_lines: + values = line.split("\t") - for line in out_lines : - values = line.split('\t') - - sseq_no_gaps = values[13].replace('-', '') + sseq_no_gaps = values[13].replace("-", "") s_length_no_gaps = len(sseq_no_gaps) # Keep BLAST result if its coverage is within min and max thresholds - if min_length_threshold <= s_length_no_gaps <= max_length_threshold: - #get_blast_results (values, records, allele_found, logger) - get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger) + if ( + min_length_threshold + <= s_length_no_gaps + <= max_length_threshold + ): + # get_blast_results (values, records, allele_found, logger) + get_blast_results( + sample_name, + values, + contigs_in_sample_dict, + allele_found, + logger, + ) # ································································ # # EXACT MATCH if there is any paralog for this gene in this sample # # ································································ # - if len(allele_found) == 1 : - - paralog_exact_tag(sample_name, core_name, 'EXACT', schema_quality, 
matching_genes_dict, samples_matrix_dict, allele_found, exact_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_exact, logger) + if len(allele_found) == 1: + paralog_exact_tag( + sample_name, + core_name, + "EXACT", + schema_quality, + matching_genes_dict, + samples_matrix_dict, + allele_found, + exact_dict, + prodigal_report, + prodigal_directory, + blast_parameters, + annotation_core_dict, + count_exact, + logger, + ) continue @@ -2041,111 +3411,267 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # NIPH if there there are paralogs with ID < 100 for this gene in this sample # # ··········································································· # else: - - paralog_exact_tag(sample_name, core_name, 'NIPH', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niph, logger) + paralog_exact_tag( + sample_name, + core_name, + "NIPH", + schema_quality, + matching_genes_dict, + samples_matrix_dict, + allele_found, + paralog_dict, + prodigal_report, + prodigal_directory, + blast_parameters, + annotation_core_dict, + count_niph, + logger, + ) continue ## Look for the best BLAST result if there are no results with ID = 100 ## elif len(allele_found) == 0: - bigger_bitscore_seq_values = [] - for line in out_lines : - values = line.split('\t') + for line in out_lines: + values = line.split("\t") - if float(values[8]) > bigger_bitscore: - s_length_no_gaps = len(values[13].replace('-', '')) + if float(values[8]) > bigger_bitscore: + s_length_no_gaps = len(values[13].replace("-", "")) # Keep BLAST result if its coverage is within min and max thresholds and its bitscore is bigger than the one previously kept - if min_coverage_threshold <= s_length_no_gaps <= max_coverage_threshold: + if ( + min_coverage_threshold + <= s_length_no_gaps + <= max_coverage_threshold + ): bigger_bitscore_seq_values = values bigger_bitscore = float(bigger_bitscore_seq_values[8]) ## Check if best BLAST result out of coverage thresholds is a possible PLOT or LNF due to low coverage ## - #if len(allele_found) == 0: + # if len(allele_found) == 0: if len(bigger_bitscore_seq_values) == 0: - # Look for best bitscore BLAST result out of coverage thresholds to check possible PLOT or reporting LNF due to low coverage - for line in out_lines : - values = line.split('\t') - - if float(values[8]) > bigger_bitscore: - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values - bigger_bitscore_seq_values_out_cov = values ### + for line in out_lines: + values = line.split("\t") + + if float(values[8]) > bigger_bitscore: + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values + bigger_bitscore_seq_values_out_cov = values ### bigger_bitscore = float(bitscore) # Get BLAST values relatives to contig for bigger bitscore result - lnf_plot_found = {} ### - - get_blast_results (sample_name, bigger_bitscore_seq_values_out_cov, contigs_in_sample_dict, lnf_plot_found, logger) ### - - allele_id = str(list(lnf_plot_found.keys())[0]) ### - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = lnf_plot_found[allele_id] + lnf_plot_found = {} ### + + get_blast_results( + sample_name, + 
bigger_bitscore_seq_values_out_cov, + contigs_in_sample_dict, + lnf_plot_found, + logger, + ) ### + + allele_id = str(list(lnf_plot_found.keys())[0]) ### + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = lnf_plot_found[allele_id] # Get contig sequence and length for best bitscore BLAST result ID - #for record in records: ## parse - #if record.id == sseqid : ## parse - #break ## parse - #accession_sequence = record.seq ## parse - #length_sseqid = len(accession_sequence) ## parse + # for record in records: ## parse + # if record.id == sseqid : ## parse + # break ## parse + # accession_sequence = record.seq ## parse + # length_sseqid = len(accession_sequence) ## parse accession_sequence = contigs_in_sample_dict[sample_name][sseqid] length_sseqid = len(accession_sequence) # Check if best BLAST result out of coverage thresholds is a possible PLOT. If so, keep result info for later PLOT classification - if int(sstart) == length_sseqid or int(send) == length_sseqid or int(sstart) == 1 or int(send) == 1: - bigger_bitscore_seq_values = bigger_bitscore_seq_values_out_cov ### + if ( + int(sstart) == length_sseqid + or int(send) == length_sseqid + or int(sstart) == 1 + or int(send) == 1 + ): + bigger_bitscore_seq_values = ( + bigger_bitscore_seq_values_out_cov ### + ) # ·············································································································································· # # LNF if there are no BLAST results within coverage thresholds for this gene in this sample and best out threshold result is not a possible PLOT # # ·············································································································································· # else: # Get sequence length - s_length_no_gaps = len(bigger_bitscore_seq_values_out_cov[13].replace('-', '')) + s_length_no_gaps = len( + bigger_bitscore_seq_values_out_cov[13].replace("-", "") + ) # Keep LNF info - lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length_no_gaps, '-', '-', coverage, schema_quality, annotation_core_dict, count_lnf, logger) + lnf_tpr_tag( + core_name, + sample_name, + alleles_in_locus_dict, + samples_matrix_dict, + lnf_tpr_dict, + schema_statistics, + locus_alleles_path, + qseqid, + pident, + s_length_no_gaps, + "-", + "-", + coverage, + schema_quality, + annotation_core_dict, + count_lnf, + logger, + ) ## Keep result with bigger bitscore in allele_found dict and look for possible paralogs ## if len(bigger_bitscore_seq_values) > 0: - - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = bigger_bitscore_seq_values - - #get_blast_results (bigger_bitscore_seq_values, records, allele_found, logger) - get_blast_results (sample_name, bigger_bitscore_seq_values, contigs_in_sample_dict, allele_found, logger) + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = bigger_bitscore_seq_values + + # get_blast_results (bigger_bitscore_seq_values, records, allele_found, logger) + get_blast_results( + sample_name, + bigger_bitscore_seq_values, + contigs_in_sample_dict, + allele_found, + logger, + ) # Possible paralogs search - for line in out_lines : - values = line.split('\t') - - qseqid, sseqid, pident, 
qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values - sseq_no_gaps = sseq.replace('-', '') + for line in out_lines: + values = line.split("\t") + + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = values + sseq_no_gaps = sseq.replace("-", "") s_length_no_gaps = len(sseq_no_gaps) - if min_length_threshold <= s_length_no_gaps <= max_length_threshold: - - #get_blast_results (values, records, allele_found, logger) - get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger) + if ( + min_length_threshold + <= s_length_no_gaps + <= max_length_threshold + ): + # get_blast_results (values, records, allele_found, logger) + get_blast_results( + sample_name, + values, + contigs_in_sample_dict, + allele_found, + logger, + ) # ····························································· # # NIPH if there there are paralogs for this gene in this sample # # ····························································· # - if len(allele_found) > 1 : - - paralog_exact_tag(sample_name, core_name, 'NIPH', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niph, logger) + if len(allele_found) > 1: + paralog_exact_tag( + sample_name, + core_name, + "NIPH", + schema_quality, + matching_genes_dict, + samples_matrix_dict, + allele_found, + paralog_dict, + prodigal_report, + prodigal_directory, + blast_parameters, + annotation_core_dict, + count_niph, + logger, + ) continue ## Continue classification if there are no paralogs ## - elif len(allele_found) == 1 : - + elif len(allele_found) == 1: allele_id = str(list(allele_found.keys())[0]) - qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = allele_found[allele_id] - - sseq_no_gaps = sseq.replace('-', '') + ( + qseqid, + sseqid, + pident, + qlen, + s_length, + mismatch, + r_gapopen, + r_evalue, + bitscore, + sstart, + send, + qstart, + qend, + sseq, + qseq, + ) = allele_found[allele_id] + + sseq_no_gaps = sseq.replace("-", "") s_length_no_gaps = len(sseq_no_gaps) # Get matching allele quality @@ -2153,48 +3679,99 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # Get matching allele sequence and length - #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse - #for allele in alleles_in_locus : ## parse - #if allele.id == qseqid : ## parse - #break ## parse - #matching_allele_seq = allele.seq ## parse - #matching_allele_length = len(matching_allele_seq) ## parse + # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse + # for allele in alleles_in_locus : ## parse + # if allele.id == qseqid : ## parse + # break ## parse + # matching_allele_seq = allele.seq ## parse + # matching_allele_length = len(matching_allele_seq) ## parse - matching_allele_seq = alleles_in_locus_dict [core_name][qseqid] + matching_allele_seq = alleles_in_locus_dict[core_name][ + qseqid + ] matching_allele_length = len(matching_allele_seq) # Get contig sequence and length for ID found in BLAST - #for record in records: ## parse - #if record.id == sseqid : ## parse - #break ## parse - #accession_sequence = record.seq ## parse - #length_sseqid = len(accession_sequence) ## parse + # for record in records: ## parse + # if record.id == sseqid : ## 
parse + # break ## parse + # accession_sequence = record.seq ## parse + # length_sseqid = len(accession_sequence) ## parse - accession_sequence = contigs_in_sample_dict[sample_name][sseqid] + accession_sequence = contigs_in_sample_dict[sample_name][ + sseqid + ] length_sseqid = len(accession_sequence) # ········································································································· # # PLOT if found sequence is shorter than matching allele and it is located on the edge of the sample contig # # ········································································································· # - if int(sstart) == length_sseqid or int(send) == length_sseqid or int(sstart) == 1 or int(send) == 1: + if ( + int(sstart) == length_sseqid + or int(send) == length_sseqid + or int(sstart) == 1 + or int(send) == 1 + ): if int(s_length) < matching_allele_length: - ### sacar sec prodigal para PLOT? # Get prodigal predicted sequence if matching allele quality is "bad quality" - if 'bad_quality' in allele_quality: - complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq_no_gaps, sseqid, prodigal_directory, sample_name, blast_parameters, logger) + if "bad_quality" in allele_quality: + ( + complete_predicted_seq, + start_prodigal, + end_prodigal, + ) = get_prodigal_sequence( + sseq_no_gaps, + sseqid, + prodigal_directory, + sample_name, + blast_parameters, + logger, + ) # Keep info for prodigal report - prodigal_report.append([core_name, sample_name, qseqid, 'PLOT', sstart, send, start_prodigal, end_prodigal, sseq_no_gaps, complete_predicted_seq]) + prodigal_report.append( + [ + core_name, + sample_name, + qseqid, + "PLOT", + sstart, + send, + start_prodigal, + end_prodigal, + sseq_no_gaps, + complete_predicted_seq, + ] + ) else: - complete_predicted_seq = '-' - start_prodigal = '-' - end_prodigal = '-' + complete_predicted_seq = "-" + start_prodigal = "-" + end_prodigal = "-" # Keep PLOT info - inf_asm_alm_tag(core_name, sample_name, 'PLOT', allele_found[allele_id], allele_quality, '-', matching_allele_length, '-', plot_dict, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_plot, logger) + inf_asm_alm_tag( + core_name, + sample_name, + "PLOT", + allele_found[allele_id], + allele_quality, + "-", + matching_allele_length, + "-", + plot_dict, + samples_matrix_dict, + matching_genes_dict, + prodigal_report, + start_prodigal, + end_prodigal, + complete_predicted_seq, + annotation_core_dict, + count_plot, + logger, + ) continue @@ -2203,121 +3780,345 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # * * * * * * * * * * * * * * * * * * * * # ## Get Prodigal predicted sequence ## - complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq_no_gaps, sseqid, prodigal_directory, sample_name, blast_parameters, logger) + ( + complete_predicted_seq, + start_prodigal, + end_prodigal, + ) = get_prodigal_sequence( + sseq_no_gaps, + sseqid, + prodigal_directory, + sample_name, + blast_parameters, + logger, + ) ## Search for new codon stop using contig sequence info ## # Check matching allele sequence direction - query_direction = check_sequence_order(matching_allele_seq, logger) + query_direction = check_sequence_order( + matching_allele_seq, logger + ) # Get extended BLAST sequence for stop codon search - if query_direction == 'reverse': - if int(send) > int (sstart): - sample_gene_sequence = accession_sequence[ : 
int(send) ] - sample_gene_sequence = str(Seq.Seq(sample_gene_sequence).reverse_complement()) + if query_direction == "reverse": + if int(send) > int(sstart): + sample_gene_sequence = accession_sequence[ + : int(send) + ] + sample_gene_sequence = str( + Seq.Seq( + sample_gene_sequence + ).reverse_complement() + ) else: - sample_gene_sequence = accession_sequence[ int(send) -1 : ] + sample_gene_sequence = accession_sequence[ + int(send) - 1 : + ] else: - if int(sstart) > int (send): - sample_gene_sequence = accession_sequence[ : int(sstart) ] - sample_gene_sequence = str(Seq.Seq(sample_gene_sequence).reverse_complement()) + if int(sstart) > int(send): + sample_gene_sequence = accession_sequence[ + : int(sstart) + ] + sample_gene_sequence = str( + Seq.Seq( + sample_gene_sequence + ).reverse_complement() + ) else: - sample_gene_sequence = accession_sequence[ int(sstart) -1 : ] + sample_gene_sequence = accession_sequence[ + int(sstart) - 1 : + ] # Get new stop codon index stop_index = get_stop_codon_index(sample_gene_sequence) ## Classification of final new sequence if it is found ## if stop_index != False: - new_sequence_length = stop_index +3 - new_sseq = str(sample_gene_sequence[0:new_sequence_length]) + new_sequence_length = stop_index + 3 + new_sseq = str( + sample_gene_sequence[0:new_sequence_length] + ) ######################################################################################################################### ### c/m: introducido para determinar qué umbral de coverage poner. TEMPORAL - new_sseq_coverage = new_sequence_length/matching_allele_length ### introduciendo coverage new_sseq /// debería ser con respecto a la media? + new_sseq_coverage = ( + new_sequence_length / matching_allele_length + ) ### introduciendo coverage new_sseq /// debería ser con respecto a la media? 
if new_sseq_coverage < 1: - shorter_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])]) + shorter_seq_coverage.append( + [ + core_name, + sample_name, + str(matching_allele_length), + str(new_sequence_length), + str(schema_statistics[core_name][0]), + str(new_sseq_coverage), + str( + new_sequence_length + / schema_statistics[core_name][0] + ), + ] + ) elif new_sseq_coverage > 1: - longer_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])]) + longer_seq_coverage.append( + [ + core_name, + sample_name, + str(matching_allele_length), + str(new_sequence_length), + str(schema_statistics[core_name][0]), + str(new_sseq_coverage), + str( + new_sequence_length + / schema_statistics[core_name][0] + ), + ] + ) elif new_sseq_coverage == 1: - equal_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])]) + equal_seq_coverage.append( + [ + core_name, + sample_name, + str(matching_allele_length), + str(new_sequence_length), + str(schema_statistics[core_name][0]), + str(new_sseq_coverage), + str( + new_sequence_length + / schema_statistics[core_name][0] + ), + ] + ) ######################################################################################################################### # Get and keep SNP and DNA and protein alignment - keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_direction, core_name, sample_name, reward, penalty, gapopen, gapextend, snp_dict, match_alignment_dict, protein_dict, logger) + keep_snp_alignment_info( + sseq, + new_sseq, + matching_allele_seq, + qseqid, + query_direction, + core_name, + sample_name, + reward, + penalty, + gapopen, + gapextend, + snp_dict, + match_alignment_dict, + protein_dict, + logger, + ) # ····································································································· # # INF if final new sequence length is within min and max length thresholds for this gene in this sample # # ····································································································· # - if min_length_threshold <= new_sequence_length <= max_length_threshold: - + if ( + min_length_threshold + <= new_sequence_length + <= max_length_threshold + ): # Keep INF info - inf_asm_alm_tag(core_name, sample_name, 'INF', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, inferred_alleles_dict, inf_dict, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_inf, logger) ### introducido start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal + inf_asm_alm_tag( + core_name, + sample_name, + "INF", + allele_found[allele_id], + allele_quality, + new_sseq, + matching_allele_length, + inferred_alleles_dict, + inf_dict, + samples_matrix_dict, + matching_genes_dict, + prodigal_report, + start_prodigal, + end_prodigal, + complete_predicted_seq, + annotation_core_dict, + count_inf, + logger, + ) ### introducido start_prodigal, end_prodigal, 
complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal # ············································································································································ # # ASM if final new sequence length is under min length threshold but its coverage is above min coverage threshold for this gene in this sample # # ············································································································································ # - elif min_coverage_threshold <= new_sequence_length < min_length_threshold: - + elif ( + min_coverage_threshold + <= new_sequence_length + < min_length_threshold + ): # Keep ASM info - inf_asm_alm_tag(core_name, sample_name, 'ASM', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, asm_dict, list_asm, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_asm, logger) + inf_asm_alm_tag( + core_name, + sample_name, + "ASM", + allele_found[allele_id], + allele_quality, + new_sseq, + matching_allele_length, + asm_dict, + list_asm, + samples_matrix_dict, + matching_genes_dict, + prodigal_report, + start_prodigal, + end_prodigal, + complete_predicted_seq, + annotation_core_dict, + count_asm, + logger, + ) # ············································································································································ # # ALM if final new sequence length is above max length threshold but its coverage is under max coverage threshold for this gene in this sample # # ············································································································································ # - elif max_length_threshold < new_sequence_length <= max_coverage_threshold: - + elif ( + max_length_threshold + < new_sequence_length + <= max_coverage_threshold + ): # Keep ALM info - inf_asm_alm_tag(core_name, sample_name, 'ALM', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, alm_dict, list_alm, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_alm, logger) ### introducido start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal + inf_asm_alm_tag( + core_name, + sample_name, + "ALM", + allele_found[allele_id], + allele_quality, + new_sseq, + matching_allele_length, + alm_dict, + list_alm, + samples_matrix_dict, + matching_genes_dict, + prodigal_report, + start_prodigal, + end_prodigal, + complete_predicted_seq, + annotation_core_dict, + count_alm, + logger, + ) ### introducido start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal # ························································································· # # TPR if final new sequence coverage is not within thresholds for this gene in this sample # # ························································································· # else: - # Keep TPR info - lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length_no_gaps, new_sequence_length, '-', coverage, schema_quality, annotation_core_dict, count_tpr, logger) + lnf_tpr_tag( + core_name, + sample_name, + alleles_in_locus_dict, + 
samples_matrix_dict, + lnf_tpr_dict, + schema_statistics, + locus_alleles_path, + qseqid, + pident, + s_length_no_gaps, + new_sequence_length, + "-", + coverage, + schema_quality, + annotation_core_dict, + count_tpr, + logger, + ) # ········································ # # ERROR if final new sequence is not found # # ········································ # else: - logger.error('ERROR : Stop codon was not found for the core %s and the sample %s', core_name, sample_name) - samples_matrix_dict[sample_name].append('ERROR not stop codon') - if not sseqid in matching_genes_dict[sample_name] : + logger.error( + "ERROR : Stop codon was not found for the core %s and the sample %s", + core_name, + sample_name, + ) + samples_matrix_dict[sample_name].append( + "ERROR not stop codon" + ) + if not sseqid in matching_genes_dict[sample_name]: matching_genes_dict[sample_name][sseqid] = [] - if sstart > send : - #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', 'ERROR']) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'-', 'ERROR']) + if sstart > send: + # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', 'ERROR']) + matching_genes_dict[sample_name][sseqid].append( + [core_name, qseqid, sstart, send, "-", "ERROR"] + ) else: - #matching_genes_dict[sample_name][sseqid].append([core_name, sstart,send,'+', 'ERROR']) - matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'+', 'ERROR']) + # matching_genes_dict[sample_name][sseqid].append([core_name, sstart,send,'+', 'ERROR']) + matching_genes_dict[sample_name][sseqid].append( + [core_name, qseqid, sstart, send, "+", "ERROR"] + ) # (recuento tags para plot) - count_error[sample_name]['total'] += 1 + count_error[sample_name]["total"] += 1 for count_class in count_error[sample_name]: if count_class in allele_quality: - if "no_start_stop" not in count_class and "no_start_stop" in allele_quality: + if ( + "no_start_stop" not in count_class + and "no_start_stop" in allele_quality + ): if count_class == "bad_quality": - count_error[sample_name][count_class] += 1 + count_error[sample_name][ + count_class + ] += 1 else: count_error[sample_name][count_class] += 1 - ## Save results and create reports - if not save_allele_call_results (outputdir, full_gene_list, samples_matrix_dict, exact_dict, paralog_dict, inf_dict, plot_dict, matching_genes_dict, list_asm, list_alm, lnf_tpr_dict, snp_dict, match_alignment_dict, protein_dict, prodigal_report, shorter_seq_coverage, longer_seq_coverage, equal_seq_coverage, shorter_blast_seq_coverage, longer_blast_seq_coverage, equal_blast_seq_coverage, logger): - print('There is an error while saving the allele calling results. Check the log file to get more information \n') - # exit(0) - + if not save_allele_call_results( + outputdir, + full_gene_list, + samples_matrix_dict, + exact_dict, + paralog_dict, + inf_dict, + plot_dict, + matching_genes_dict, + list_asm, + list_alm, + lnf_tpr_dict, + snp_dict, + match_alignment_dict, + protein_dict, + prodigal_report, + shorter_seq_coverage, + longer_seq_coverage, + equal_seq_coverage, + shorter_blast_seq_coverage, + longer_blast_seq_coverage, + equal_blast_seq_coverage, + logger, + ): + print( + "There is an error while saving the allele calling results. 
Check the log file to get more information \n" + ) + # exit(0) ## Saving sample results plots - if not save_allele_calling_plots (outputdir, sample_list_files, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error, logger): - print('There is an error while saving the allele calling results plots. Check the log file to get more information \n') - + if not save_allele_calling_plots( + outputdir, + sample_list_files, + count_exact, + count_inf, + count_asm, + count_alm, + count_lnf, + count_tpr, + count_plot, + count_niph, + count_niphem, + count_error, + logger, + ): + print( + "There is an error while saving the allele calling results plots. Check the log file to get more information \n" + ) return True, inferred_alleles_dict, inf_dict, exact_dict @@ -2326,8 +4127,9 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in # Processing gene by gene allele calling # # * * * * * * * * * * * * * * * * * * * # -def processing_allele_calling (arguments) : - ''' + +def processing_allele_calling(arguments): + """ Description: This is the main function for allele calling. With the support of additional functions it will create the output files @@ -2340,93 +4142,145 @@ def processing_allele_calling (arguments) : ???? Return: ???? - ''' + """ start_time = datetime.now() - print('Start the execution at :', start_time ) + print("Start the execution at :", start_time) # Open log file - logger = open_log ('taranis_wgMLST.log') - #print('Checking the pre-requisites.') + logger = open_log("taranis_wgMLST.log") + # print('Checking the pre-requisites.') ############################################################ ## Check additional programs are installed in your system ## ############################################################ - #pre_requisites_list = [['blastp', '2.9'], ['makeblastdb', '2.9']] - #if not check_prerequisites (pre_requisites_list, logger): + # pre_requisites_list = [['blastp', '2.9'], ['makeblastdb', '2.9']] + # if not check_prerequisites (pre_requisites_list, logger): # print ('your system does not fulfill the pre-requistes to run the script ') # exit(0) ###################################################### ## Check that given directories contain fasta files ## ###################################################### - print('Validating schema fasta files in ' , arguments.coregenedir , '\n') + print("Validating schema fasta files in ", arguments.coregenedir, "\n") valid_core_gene_files = get_fasta_file_list(arguments.coregenedir, logger) - if not valid_core_gene_files : - print ('There are not valid fasta files in ', arguments.coregenedir , ' directory. Check log file for more information ') + if not valid_core_gene_files: + print( + "There are not valid fasta files in ", + arguments.coregenedir, + " directory. Check log file for more information ", + ) exit(0) - print('Validating reference alleles fasta files in ' , arguments.refalleles , '\n') + print("Validating reference alleles fasta files in ", arguments.refalleles, "\n") valid_reference_alleles_files = get_fasta_file_list(arguments.refalleles, logger) - if not valid_reference_alleles_files : - print ('There are not valid reference alleles fasta files in ', arguments.refalleles, ' directory. Check log file for more information ') + if not valid_reference_alleles_files: + print( + "There are not valid reference alleles fasta files in ", + arguments.refalleles, + " directory. 
Check log file for more information ", + ) exit(0) - print('Validating sample fasta files in ' , arguments.inputdir , '\n') + print("Validating sample fasta files in ", arguments.inputdir, "\n") valid_sample_files = get_fasta_file_list(arguments.inputdir, logger) - if not valid_sample_files : - print ('There are not valid fasta files in ', arguments.inputdir , ' directory. Check log file for more information ') + if not valid_sample_files: + print( + "There are not valid fasta files in ", + arguments.inputdir, + " directory. Check log file for more information ", + ) exit(0) ################################# ## Prepare the coreMLST schema ## ################################# - tmp_core_gene_dir = os.path.join(arguments.outputdir,'tmp','cgMLST') + tmp_core_gene_dir = os.path.join(arguments.outputdir, "tmp", "cgMLST") try: os.makedirs(tmp_core_gene_dir) except: - logger.info('Deleting the temporary directory for a previous execution without cleaning up') - shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) + logger.info( + "Deleting the temporary directory for a previous execution without cleaning up" + ) + shutil.rmtree(os.path.join(arguments.outputdir, "tmp")) try: os.makedirs(tmp_core_gene_dir) - logger.info ('Temporary folder %s has been created again', tmp_core_gene_dir) + logger.info( + "Temporary folder %s has been created again", tmp_core_gene_dir + ) except: - logger.info('Unable to create again the temporary directory %s', tmp_core_gene_dir) - print('Cannot create temporary directory on ', tmp_core_gene_dir) + logger.info( + "Unable to create again the temporary directory %s", tmp_core_gene_dir + ) + print("Cannot create temporary directory on ", tmp_core_gene_dir) exit(0) - alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.genus, arguments.species, str(arguments.usegenus).lower(), logger) - #alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.outputdir, logger) + ( + alleles_in_locus_dict, + annotation_core_dict, + schema_variability, + schema_statistics, + schema_quality, + ) = prepare_core_gene( + valid_core_gene_files, + tmp_core_gene_dir, + arguments.refalleles, + arguments.genus, + arguments.species, + str(arguments.usegenus).lower(), + logger, + ) + # alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.outputdir, logger) if not alleles_in_locus_dict: - print('There is an error while processing the schema preparation phase. Check the log file to get more information \n') - logger.info('Deleting the temporary directory to clean up the temporary files created') - shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) + print( + "There is an error while processing the schema preparation phase. 
Check the log file to get more information \n" + ) + logger.info( + "Deleting the temporary directory to clean up the temporary files created" + ) + shutil.rmtree(os.path.join(arguments.outputdir, "tmp")) exit(0) ############################### ## Prepare the samples files ## ############################### - tmp_samples_dir = os.path.join(arguments.outputdir,'tmp','samples') + tmp_samples_dir = os.path.join(arguments.outputdir, "tmp", "samples") try: os.makedirs(tmp_samples_dir) except: - logger.info('Deleting the temporary directory for a previous execution without properly cleaning up') + logger.info( + "Deleting the temporary directory for a previous execution without properly cleaning up" + ) shutil.rmtree(tmp_samples_dir) try: os.makedirs(tmp_samples_dir) - logger.info('Temporary folder %s has been created again', tmp_samples_dir) + logger.info("Temporary folder %s has been created again", tmp_samples_dir) except: - logger.info('Unable to create again the temporary directory %s', tmp_samples_dir) - shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) - logger.info('Cleaned up temporary directory ', ) - print('Cannot create temporary directory on ', tmp_samples_dir, 'Check the log file to get more information \n') + logger.info( + "Unable to create again the temporary directory %s", tmp_samples_dir + ) + shutil.rmtree(os.path.join(arguments.outputdir, "tmp")) + logger.info( + "Cleaned up temporary directory ", + ) + print( + "Cannot create temporary directory on ", + tmp_samples_dir, + "Check the log file to get more information \n", + ) exit(0) - contigs_in_sample_dict = prepare_samples(valid_sample_files, tmp_samples_dir, arguments.refgenome, logger) - if not contigs_in_sample_dict : - print('There is an error while processing the saving temporary files. Check the log file to get more information \n') - logger.info('Deleting the temporary directory to clean up the temporary files created') - shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) + contigs_in_sample_dict = prepare_samples( + valid_sample_files, tmp_samples_dir, arguments.refgenome, logger + ) + if not contigs_in_sample_dict: + print( + "There is an error while processing the saving temporary files. 
Check the log file to get more information \n" + ) + logger.info( + "Deleting the temporary directory to clean up the temporary files created" + ) + shutil.rmtree(os.path.join(arguments.outputdir, "tmp")) exit(0) ################################## @@ -2434,51 +4288,126 @@ def processing_allele_calling (arguments) : ################################## query_directory = arguments.coregenedir reference_alleles_directory = arguments.refalleles - blast_db_directory = os.path.join(tmp_samples_dir,'blastdb') - prodigal_directory = os.path.join(tmp_samples_dir,'prodigal') - blast_results_seq_directory = os.path.join(tmp_samples_dir,'blast_results', 'blast_results_seq') ### path to the directory where sequences found by blast against the reference allele are stored - blast_results_db_directory = os.path.join(tmp_samples_dir,'blast_results', 'blast_results_db') ### path to the directory where the db of sequences found by blast against the reference allele is stored - - complete_allele_call, inferred_alleles_dict, inf_dict, exact_dict = allele_call_nucleotides(valid_core_gene_files, valid_sample_files, alleles_in_locus_dict, contigs_in_sample_dict, query_directory, reference_alleles_directory, blast_db_directory, prodigal_directory, blast_results_seq_directory, blast_results_db_directory, arguments.inputdir, arguments.outputdir, int(arguments.cpus), arguments.percentlength, arguments.coverage, float(arguments.evalue), int(arguments.perc_identity_ref), int(arguments.perc_identity_loc), int(arguments.reward), int(arguments.penalty), int(arguments.gapopen), int(arguments.gapextend), int(arguments.max_target_seqs), int(arguments.max_hsps), int(arguments.num_threads), int(arguments.flankingnts), schema_variability, schema_statistics, schema_quality, annotation_core_dict, arguments.profile, logger) + blast_db_directory = os.path.join(tmp_samples_dir, "blastdb") + prodigal_directory = os.path.join(tmp_samples_dir, "prodigal") + blast_results_seq_directory = os.path.join( + tmp_samples_dir, "blast_results", "blast_results_seq" + ) ### path to the directory where sequences found by blast against the reference allele are stored + blast_results_db_directory = os.path.join( + tmp_samples_dir, "blast_results", "blast_results_db" + ) ### path to the directory where the db of sequences found by blast against the reference allele is stored + + ( + complete_allele_call, + inferred_alleles_dict, + inf_dict, + exact_dict, + ) = allele_call_nucleotides( + valid_core_gene_files, + valid_sample_files, + alleles_in_locus_dict, + contigs_in_sample_dict, + query_directory, + reference_alleles_directory, + blast_db_directory, + prodigal_directory, + blast_results_seq_directory, + blast_results_db_directory, + arguments.inputdir, + arguments.outputdir, + int(arguments.cpus), + arguments.percentlength, + arguments.coverage, + float(arguments.evalue), + int(arguments.perc_identity_ref), + int(arguments.perc_identity_loc), + int(arguments.reward), + int(arguments.penalty), + int(arguments.gapopen), + int(arguments.gapextend), + int(arguments.max_target_seqs), + int(arguments.max_hsps), + int(arguments.num_threads), + int(arguments.flankingnts), + schema_variability, + schema_statistics, + schema_quality, + annotation_core_dict, + arguments.profile, + logger, + ) if not complete_allele_call: - print('There is an error while processing the allele calling. Check the log file to get more information \n') + print( + "There is an error while processing the allele calling. 
Check the log file to get more information \n" + ) exit(0) ######################################################### ## Update core gene schema adding new inferred alleles ## ######################################################### if inferred_alleles_dict: - if str(arguments.updateschema).lower() == 'true' or str(arguments.updateschema).lower() == 'new': - if not update_schema (str(arguments.updateschema).lower(), arguments.coregenedir, arguments.outputdir, valid_core_gene_files, inferred_alleles_dict, alleles_in_locus_dict, logger): - print('There is an error adding new inferred alleles found to the core genes schema. Check the log file to get more information \n') + if ( + str(arguments.updateschema).lower() == "true" + or str(arguments.updateschema).lower() == "new" + ): + if not update_schema( + str(arguments.updateschema).lower(), + arguments.coregenedir, + arguments.outputdir, + valid_core_gene_files, + inferred_alleles_dict, + alleles_in_locus_dict, + logger, + ): + print( + "There is an error adding new inferred alleles found to the core genes schema. Check the log file to get more information \n" + ) exit(0) - if str(arguments.profile).lower() != 'false': - + if str(arguments.profile).lower() != "false": ############################ ## Get ST for each sample ## ############################ - complete_ST, inf_ST = get_ST_profile(arguments.outputdir, arguments.profile, exact_dict, inf_dict, valid_core_gene_files, valid_sample_files, logger) + complete_ST, inf_ST = get_ST_profile( + arguments.outputdir, + arguments.profile, + exact_dict, + inf_dict, + valid_core_gene_files, + valid_sample_files, + logger, + ) if not complete_ST: - print('There is an error while processing ST analysis. Check the log file to get more information \n') + print( + "There is an error while processing ST analysis. Check the log file to get more information \n" + ) exit(0) ########################################### ## Update ST profile file adding new STs ## ########################################### - if str(arguments.updateprofile).lower() == 'true' or str(arguments.updateprofile).lower() == 'new': + if ( + str(arguments.updateprofile).lower() == "true" + or str(arguments.updateprofile).lower() == "new" + ): if len(inf_ST) > 0: - if not update_st_profile (str(arguments.updateprofile).lower(), arguments.profile, arguments.outputdir, inf_ST, valid_core_gene_files, logger): - print('There is an error adding new STs found to the ST profile file. Check the log file to get more information \n') + if not update_st_profile( + str(arguments.updateprofile).lower(), + arguments.profile, + arguments.outputdir, + inf_ST, + valid_core_gene_files, + logger, + ): + print( + "There is an error adding new STs found to the ST profile file. 
Check the log file to get more information \n" + ) exit(0) - shutil.rmtree(os.path.join(arguments.outputdir, 'tmp')) + shutil.rmtree(os.path.join(arguments.outputdir, "tmp")) end_time = datetime.now() - print('completed execution at :', end_time ) + print("completed execution at :", end_time) return True - - - diff --git a/taranis/analyze_schema.py b/taranis/analyze_schema.py index cf218bf..c17fd99 100644 --- a/taranis/analyze_schema.py +++ b/taranis/analyze_schema.py @@ -4,9 +4,12 @@ import rich.console import statistics from pathlib import Path +import Bio.Data.CodonTable from Bio import SeqIO -from Bio.SeqRecord import SeqRecord + +# from Bio.SeqRecord import SeqRecord +from collections import OrderedDict import taranis.utils @@ -43,66 +46,89 @@ def __init__( self.species = species self.usegenus = usegenus - def check_allele_quality(self): - a_quality = {} + def check_allele_quality(self, prokka_annotation): + a_quality = OrderedDict() allele_seq = {} bad_quality_record = [] with open(self.schema_allele) as fh: for record in SeqIO.parse(self.schema_allele, "fasta"): - a_quality[record.id] = {"quality": "Good quality", "reason": "-"} + try: + prokka_ann = prokka_annotation[record.id] + except: + prokka_ann = "Not found in prokka" + a_quality[record.id] = { + "allele_name": self.allele_name, + "quality": "Good quality", + "reason": "-", + "direction": "forward", + "start_codon_alt": "standard", + "protein_seq": "", + "cds_coding": prokka_ann, + } allele_seq[record.id] = str(record.seq) - a_quality[record.id]["length"] = len(str(record.seq)) - if len(record.seq) % 3 != 0: - a_quality[record.id]["quality"] = "Bad quality" - a_quality[record.id]["reason"] = "Can not be converted to protein" - a_quality[record.id]["order"] = "-" - else: - sequence_order = taranis.utils.check_sequence_order(str(record.seq)) - if sequence_order == "Error": - a_quality[record.id]["quality"] = "Bad quality" - a_quality[record.id]["reason"] = "Start or end codon not found" - a_quality[record.id]["order"] = "-" - elif sequence_order == "reverse": - record_sequence = str(record.seq.reverse_complement()) + a_quality[record.id]["length"] = str(len(str(record.seq))) + a_quality[record.id]["dna_seq"] = str(record.seq) + sequence_direction = taranis.utils.get_seq_direction(str(record.seq)) + + if sequence_direction == "reverse": + record.seq = record.seq.reverse_complement() + a_quality[record.id]["direction"] = sequence_direction + elif sequence_direction == "Error": + a_quality[record.id]["direction"] = "-" + try: + a_quality[record.id]["protein_seq"] = str( + record.seq.translate(table=1, cds=True) + ) + + except Bio.Data.CodonTable.TranslationError as e: + if "not a start codon" in str(e): + try: + # Check if sequence has an alternative start codon + # for protein coding + a_quality[record.id]["protein_seq"] = str( + record.seq.translate(table=2, cds=True) + ) + a_quality[record.id]["start_codon_alt"] = "alternative" + except Bio.Data.CodonTable.TranslationError as e_2: + if "stop" in str(e_2): + a_quality[record.id]["reason"] = str(e_2).replace( + "'", "" + ) + else: + a_quality[record.id]["reason"] = str(e).replace("'", "") + a_quality[record.id]["quality"] = "Bad quality" else: - record_sequence = str(record.seq) - a_quality[record.id]["order"] = sequence_order - if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD: a_quality[record.id]["quality"] = "Bad quality" - a_quality[record.id]["reason"] = "Start codon not found" - elif record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD: - 
a_quality[record.id]["quality"] = "Bad quality" - a_quality[record.id]["reason"] = "Stop codon not found" - - elif taranis.utils.find_multiple_stop_codons(record_sequence): - a_quality[record.id]["quality"] = "Bad quality" - a_quality[record.id]["reason"] = "Multiple stop codons found" + a_quality[record.id]["reason"] = str(e).replace("'", "") + if ( self.remove_no_cds and a_quality[record.id]["quality"] == "Bad quality" ): bad_quality_record.append(record.id) - if self.remove_duplicated: - # get the unique sequences and compare the length with all sequences - unique_seq = list(set(list(allele_seq.values()))) - if len(unique_seq) < len(allele_seq): - tmp_dict = {} - for rec_id, seq_value in allele_seq.items(): - if seq_value not in tmp_dict: - tmp_dict[seq_value] = 0 - else: - bad_quality_record.append(rec_id) - a_quality[rec_id]["quality"] ="Bad quality" - a_quality[rec_id]["reason"] ="Duplicate allele" - if self.remove_subset: - unique_seq = list(set(list(allele_seq.values()))) + # check if there are duplicated alleles + # get the unique sequences and compare the length with all sequences + unique_seq = list(set(list(allele_seq.values()))) + if len(unique_seq) < len(allele_seq): + tmp_dict = {} for rec_id, seq_value in allele_seq.items(): - unique_seq.remove(seq_value) - if seq_value in unique_seq: + if seq_value not in tmp_dict: + tmp_dict[seq_value] = 0 + else: + a_quality[rec_id]["quality"] = "Bad quality" + a_quality[rec_id]["reason"] = "Duplicate allele" + if self.remove_duplicated: + bad_quality_record.append(rec_id) + + for rec_id, seq_value in allele_seq.items(): + unique_seq.remove(seq_value) + if seq_value in unique_seq: + a_quality[rec_id]["quality"] = "Bad quality" + a_quality[rec_id]["reason"] = "Sub set allele" + if self.remove_subset: bad_quality_record.append(rec_id) - a_quality[rec_id]["quality"] ="Bad quality" - a_quality[rec_id]["reason"] ="Sub set allele" + new_schema_folder = os.path.join(self.output, "new_schema") _ = taranis.utils.create_new_folder(new_schema_folder) new_schema_file = os.path.join(new_schema_folder, self.allele_name + ".fasta") @@ -116,22 +142,38 @@ def check_allele_quality(self): SeqIO.write(record, fo, "fasta") # update the schema allele with the new file self.schema_allele = new_schema_file + + """ + if self.output_allele_annot: + # dump allele annotation to file + ann_heading = ["gene", "allele", "allele direction","nucleotide sequence", "protein sequence", "nucleotide sequence length", "star codon", "CDS coding", "allele quality", "bad quality reason" ] + ann_fields = ["direction", "dna_seq", "protein_seq", "length", "start_codon_alt","cds_coding", "quality", "reason"] + f_name = os.path.join(self.output, self.allele_name +"_allele_annotation.csv") + with open (f_name, "w") as fo: + fo.write(",".join(ann_heading) + "\n") + for allele in a_quality.keys(): + data_field = [a_quality[allele][field] for field in ann_fields] + fo.write(self.allele_name + "," + allele + "," + ",".join(data_field) + "\n") + """ + return a_quality def fetch_statistics_from_alleles(self, a_quality): - possible_bad_quality = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"] + # POSIBLE_BAD_QUALITY = ["not a start codon", "not a stop codon", "Extra in frame stop codon", "is not a multiple of three", "Duplicate allele", "Sub set allele"] record_data = {} bad_quality_reason = {} a_length = [] bad_quality_counter = 0 for record_id in a_quality.keys(): 
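
The rewritten check_allele_quality leans on Biopython's CDS-aware translation: translate(cds=True) enforces a valid start codon, a single terminal stop codon and a length that is a multiple of three, and raises a TranslationError naming the violated rule, which becomes the recorded bad-quality reason. A minimal sketch of that core idea (simplified: the code above additionally retries with translation table 2 when the only failure is an alternative start codon, and records direction and Prokka annotation):

    import Bio.Data.CodonTable
    from Bio.Seq import Seq

    def classify_allele(dna):
        # translate as a CDS; any violation raises TranslationError
        result = {"quality": "Good quality", "reason": "-", "protein_seq": ""}
        try:
            result["protein_seq"] = str(Seq(dna).translate(table=1, cds=True))
        except Bio.Data.CodonTable.TranslationError as e:
            result["quality"] = "Bad quality"
            result["reason"] = str(e).replace("'", "")
        return result

    classify_allele("ATGAAATAA")  # good quality, protein_seq "MK"
    classify_allele("ATGAAA")     # bad quality: final codon is not a stop codon
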
record_data["allele_name"] = self.allele_name - a_length.append(a_quality[record_id]["length"]) + a_length.append(int(a_quality[record_id]["length"])) if a_quality[record_id]["quality"] == "Bad quality": bad_quality_counter += 1 - bad_quality_reason[a_quality[record_id]["reason"]] = ( - bad_quality_reason.get(a_quality[record_id]["reason"], 0) + 1 - ) + for reason in taranis.utils.POSIBLE_BAD_QUALITY: + if reason in a_quality[record_id]["reason"]: + bad_quality_reason[reason] = ( + bad_quality_reason.get(reason, 0) + 1 + ) total_alleles = len(a_length) record_data["min_length"] = min(a_length) record_data["max_length"] = max(a_length) @@ -140,25 +182,26 @@ def fetch_statistics_from_alleles(self, a_quality): record_data["good_percent"] = round( 100 * (total_alleles - bad_quality_counter) / total_alleles, 2 ) - for item in possible_bad_quality: - record_data[item] = bad_quality_reason[item] if item in bad_quality_reason else 0 - # record_data["bad_quality_reason"] = bad_quality_reason + for item in taranis.utils.POSIBLE_BAD_QUALITY: + record_data[item] = ( + bad_quality_reason[item] if item in bad_quality_reason else 0 + ) + return record_data def analyze_allele_in_schema(self): allele_data = {} - # Perform quality - a_quality = self.check_allele_quality() # run annotations prokka_folder = os.path.join(self.output, "prokka", self.allele_name) anotation_files = taranis.utils.create_annotation_files( self.schema_allele, prokka_folder, self.allele_name ) - allele_data["annotation_gene"] = taranis.utils.read_annotation_file( - anotation_files + ".tsv", self.allele_name - ).get(self.allele_name) - allele_data.update(self.fetch_statistics_from_alleles(a_quality)) - return allele_data + prokka_annotation = taranis.utils.read_annotation_file(anotation_files + ".gff") + + # Perform quality + a_quality = self.check_allele_quality(prokka_annotation) + allele_data = self.fetch_statistics_from_alleles(a_quality) + return [allele_data, a_quality] def parallel_execution( @@ -184,17 +227,30 @@ def parallel_execution( return schema_obj.analyze_allele_in_schema() - -def collect_statistics(stat_data, out_folder): +def collect_statistics(data, out_folder, output_allele_annot): def stats_graphics(stats_folder): - print(out_folder) + allele_range = [0, 300, 600, 1000, 1500] + graphic_folder = os.path.join(stats_folder, "graphics") _ = taranis.utils.create_new_folder(graphic_folder) # create graphic for alleles/number of genes - genes_alleles_df = stats_df["num_alleles"].value_counts().rename_axis("alleles").sort_index().reset_index(name="genes") - _ = taranis.utils.create_graphic(graphic_folder, "num_genes_per_allele.png", "lines", genes_alleles_df["alleles"].to_list(), genes_alleles_df["genes"].to_list(), ["Allele", "number of genes"],"title") + # genes_alleles_df = stats_df["num_alleles"].value_counts().rename_axis("alleles").sort_index().reset_index(name="genes") + group_alleles_df = stats_df.groupby( + pd.cut(stats_df["num_alleles"], allele_range) + ).count() + _ = taranis.utils.create_graphic( + graphic_folder, + "num_genes_per_allele.png", + "bar", + allele_range[1:], + group_alleles_df["num_alleles"].to_list(), + ["Allele", "number of genes"], + "title", + ) + # _ = taranis.utils.create_graphic(graphic_folder, "num_genes_per_allele.png", "lines", genes_alleles_df["alleles"].to_list(), genes_alleles_df["genes"].to_list(), ["Allele", "number of genes"],"title") # create pie graph for good quality - + + """ good_percent = [round(stats_df["good_percent"].mean(),2)] good_percent.append(100 - good_percent[0]) 
labels = ["Good quality", "Bad quality"] @@ -202,25 +258,90 @@ def stats_graphics(stats_folder): _ = taranis.utils.create_graphic(graphic_folder, "quality_of_locus.png", "pie", good_percent, "", labels, "Quality of locus") # create pie graph for bad quality reason. This is optional if there are # bad quality alleles + """ + sum_all_alleles = stats_df["num_alleles"].sum() + labels = [] values = [] for item in taranis.utils.POSIBLE_BAD_QUALITY: labels.append(item) values.append(stats_df[item].sum()) - if sum(values) > 0: - _ = taranis.utils.create_graphic(graphic_folder, "bad_quality_reason.png", "pie", values, "", labels, "Bad quality reason") - # create pie graph for not found gene name + labels.append("Good quality") + values.append(sum_all_alleles - sum(values)) + _ = taranis.utils.create_graphic( + graphic_folder, + "quality_percent.png", + "pie", + values, + "", + labels, + "Quality percent", + ) + # create box plot for allele length variability + _ = taranis.utils.create_graphic( + graphic_folder, + "allele_variability.png", + "box", + "", + stats_df["mean_length"].to_list(), + "", + "Allele variability", + ) + + summary_data = [] + a_quality = [] + for idx in range(len(data)): # pdb.set_trace() - times_not_found_gene = len(stats_df[stats_df["annotation_gene"] == "Not found by Prokka"]) - if times_not_found_gene > 0: - gene_not_found = [times_not_found_gene, len(stat_data)] - labels = ["Not found gene name", "Number of alleles"] - _ = taranis.utils.create_graphic(graphic_folder, "gene_not_found.png", "pie", gene_not_found, "", labels, "Quality of locus") - - stats_df = pd.DataFrame(stat_data) + summary_data.append(data[idx][0]) + a_quality.append(data[idx][1]) + + stats_df = pd.DataFrame(summary_data) + # a_quality = data[1] stats_folder = os.path.join(out_folder, "statistics") _ = taranis.utils.create_new_folder(stats_folder) _ = taranis.utils.write_data_to_file(stats_folder, "statistics.csv", stats_df) + # pdb.set_trace() stats_graphics(stats_folder) - print(stats_df) + if output_allele_annot: + # dump allele annotation to file + ann_heading = [ + "gene", + "allele", + "allele direction", + "nucleotide sequence", + "protein sequence", + "nucleotide sequence length", + "star codon", + "CDS coding", + "allele quality", + "bad quality reason", + ] + ann_fields = [ + "direction", + "dna_seq", + "protein_seq", + "length", + "start_codon_alt", + "cds_coding", + "quality", + "reason", + ] + # f_name = os.path.join(self.output, self.allele_name +"_allele_annotation.csv") + ann_data = ",".join(ann_heading) + "\n" + for gene in a_quality: + for allele in gene.keys(): + data_field = [gene[allele][field] for field in ann_fields] + ann_data += ( + gene[allele]["allele_name"] + + "," + + allele + + "," + + ",".join(data_field) + + "\n" + ) + + _ = taranis.utils.write_data_to_compress_filed( + out_folder, "allele_annotation.csv", ann_data + ) + return diff --git a/taranis/blast.py b/taranis/blast.py index e4b17f0..923b640 100644 --- a/taranis/blast.py +++ b/taranis/blast.py @@ -17,29 +17,57 @@ ) -class Blast(): +class Blast: def __init__(self, db_type): self.db_type = db_type - def create_blastdb (self, file_name, blast_dir): + def create_blastdb(self, file_name, blast_dir): self.f_name = Path(file_name).stem - db_dir = os.path.join(blast_dir,self.f_name) + db_dir = os.path.join(blast_dir, self.f_name) self.out_blast_dir = os.path.join(db_dir, self.f_name) - blast_command = ["makeblastdb" , "-in" , file_name , "-parse_seqids", "-dbtype", self.db_type, "-out" , self.out_blast_dir] + blast_command = 
[ + "makeblastdb", + "-in", + file_name, + "-parse_seqids", + "-dbtype", + self.db_type, + "-out", + self.out_blast_dir, + ] try: - _ = subprocess.run(blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + _ = subprocess.run( + blast_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) except Exception as e: log.error("Unable to create blast db for %s ", self.f_name) log.error(e) - stderr.print(f"[red] Unable to create blast database for sample %s", self.f_name) + stderr.print( + f"[red] Unable to create blast database for sample %s", self.f_name + ) exit(1) return - - def run_blast(self, query, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, max_target_seqs=10, max_hsps=10, num_threads=1): + + def run_blast( + self, + query, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + max_target_seqs=10, + max_hsps=10, + num_threads=1, + ): """_summary_ blastn -outfmt "6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq" -query /media/lchapado/Reference_data/proyectos_isciii/taranis/documentos_antiguos/pasteur_schema/lmo0002.fasta -db /media/lchapado/Reference_data/proyectos_isciii/taranis/test/blastdb/RA-L2073_R1/RA-L2073_R1 -evalue 0.001 -penalty -2 -reward 1 -gapopen 1 -gapextend 1 -perc_identity 100 > /media/lchapado/Reference_data/proyectos_isciii/taranis/test/blast_sample_locus002.txt - + Args: query (_type_): _description_ evalue (float, optional): _description_. Defaults to 0.001. @@ -54,14 +82,28 @@ def run_blast(self, query, evalue=0.001, perc_identity=90, reward=1, penalty=-2, """ blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' pdb.set_trace() - #db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path) - cline = NcbiblastnCommandline(db=self.out_blast_dir, evalue=evalue, perc_identity=perc_identity, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=query) + # db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path) + cline = NcbiblastnCommandline( + db=self.out_blast_dir, + evalue=evalue, + perc_identity=perc_identity, + reward=reward, + penalty=penalty, + gapopen=gapopen, + gapextend=gapextend, + outfmt=blast_parameters, + max_target_seqs=max_target_seqs, + max_hsps=max_hsps, + num_threads=num_threads, + query=query, + ) try: out, _ = cline() except Exception as e: log.error("Unable to run blast for %s ", self.out_blast_dir) log.error(e) - stderr.print(f"[red] Unable to run blast for database %s", self.out_blast_dir) + stderr.print( + f"[red] Unable to run blast for database %s", self.out_blast_dir + ) exit(1) return out.splitlines() - \ No newline at end of file diff --git a/taranis/prediction.py b/taranis/prediction.py index 1706853..da2a395 100644 --- a/taranis/prediction.py +++ b/taranis/prediction.py @@ -15,7 +15,7 @@ ) -class 
Prediction(): +class Prediction: def __init__(self, genome_ref, sample_file, out_dir): self.genome_ref = genome_ref self.sample_file = sample_file @@ -33,33 +33,54 @@ def __init__(self, genome_ref, sample_file, out_dir): except OSError as e: log.error("Cannot create %s directory", self.out_dir) log.error(e) - stderr.print (f"[red] Unable to create {self.out_dir} folder") + stderr.print(f"[red] Unable to create {self.out_dir} folder") exit(1) def training(self): - prodigal_command = ["prodigal" , "-i", self.genome_ref, "-t", self.train] + prodigal_command = ["prodigal", "-i", self.genome_ref, "-t", self.train] try: - _ = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + _ = subprocess.run( + prodigal_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) except Exception as e: log.error("Unable to execute prodigal command for training") log.error(e) - stderr.print (f"[red] Unable to run prodigal commmand. ERROR {e} ") + stderr.print(f"[red] Unable to run prodigal command. ERROR {e} ") exit(1) return - - def prediction(self): - - prodigal_command = ["prodigal" , "-i", self.sample_file , "-t", self.train, "-f", "gff", "-o", self.pred_coord, "-a", self.pred_protein, "-d", self.pred_gene] + prodigal_command = [ + "prodigal", + "-i", + self.sample_file, + "-t", + self.train, + "-f", + "gff", + "-o", + self.pred_coord, + "-a", + self.pred_protein, + "-d", + self.pred_gene, + ] try: - _ = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + _ = subprocess.run( + prodigal_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) except Exception as e: log.error("Unable to execute prodigal command for prediction") log.error(e) - stderr.print (f"[red] Unable to run prodigal commmand. ERROR {e} ") + stderr.print(f"[red] Unable to run prodigal command. 
ERROR {e} ") exit(1) return def get_sequence(self): - return \ No newline at end of file + return diff --git a/taranis/pruebas.py b/taranis/pruebas.py index 4481b72..0303bf2 100644 --- a/taranis/pruebas.py +++ b/taranis/pruebas.py @@ -25,7 +25,7 @@ locus_list = [] for line in lines: line = line.strip() - if line == "#Cluster 5" : + if line == "#Cluster 5": if alleles_found == False: alleles_found = True continue @@ -37,7 +37,9 @@ # import pdb; pdb.set_trace() rand_locus = random.choice(locus_list) schema_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/taranis_testing_data/listeria_testing_schema/lmo0002.fasta" -new_schema_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/cluster_lmo0002.fasta" +new_schema_file = ( + "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/cluster_lmo0002.fasta" +) q_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/q_file.fasta" with open(schema_file) as fh: with open(new_schema_file, "w") as fo: @@ -52,15 +54,39 @@ if record.id == rand_locus: SeqIO.write(record, fo, "fasta") break -print ("Selected locus: " , rand_locus) -db_name ="/media/lchapado/Reference_data/proyectos_isciii/taranis/test/testing_clster/lmo0002" -blast_command = ['makeblastdb' , '-in' , new_schema_file , '-parse_seqids', '-dbtype', "nucl", '-out' , db_name] -blast_result = subprocess.run(blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) +print("Selected locus: ", rand_locus) +db_name = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/testing_clster/lmo0002" +blast_command = [ + "makeblastdb", + "-in", + new_schema_file, + "-parse_seqids", + "-dbtype", + "nucl", + "-out", + db_name, +] +blast_result = subprocess.run( + blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE +) blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' # pdb.set_trace() -#db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path) -cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=q_file) +# db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path) +cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=q_file, +) try: out, _ = cline() @@ -69,4 +95,4 @@ b_lines = out.splitlines() print("longitud del cluster = ", len(locus_list)) print("numero de matches = ", len(b_lines)) -# pdb.set_trace() \ No newline at end of file +# pdb.set_trace() diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py index db20a61..13819af 100644 --- a/taranis/reference_alleles.py +++ b/taranis/reference_alleles.py @@ -21,6 +21,7 @@ force_terminal=taranis.utils.rich_force_colors(), ) + class ReferenceAlleles: def __init__(self, fasta_file, output): self.fasta_file = fasta_file @@ -45,7 
+46,7 @@ def check_locus_quality(self): # Check if multiple stop codon by translating to protein and # comparing length locus_prot = Seq(record.seq).translate() - if len(locus_prot) == int(len(seq)/3): + if len(locus_prot) == int(len(seq) / 3): self.locus_quality[record.id] = "good quality" self.selected_locus[record.id] = seq else: @@ -59,7 +60,7 @@ def check_locus_quality(self): # Matched reverse start codon if s_codon_f.group(1) in STOP_CODONS_REVERSE: locus_prot = Seq(record.seq).reverse_complement().translate() - if len(locus_prot) == int(len(record.seq)/3): + if len(locus_prot) == int(len(record.seq) / 3): self.locus_quality[record.id] = "good quality" self.selected_locus[record.id] = seq else: @@ -73,92 +74,119 @@ def check_locus_quality(self): def create_matrix_distance(self): # f_name = os.path.basename(self.fasta_file).split('.')[0] f_name = os.path.basename(self.fasta_file) - mash_folder = os.path.join(self.output, "mash" ) + mash_folder = os.path.join(self.output, "mash") # _ = taranis.utils.write_fasta_file(mash_folder, self.selected_locus, multiple_files=True, extension=False) # save directory to return after mash working_dir = os.getcwd() os.chdir(mash_folder) # run mash sketch command - sketch_file = "reference.msh" + sketch_file = "reference.msh" mash_sketch_command = ["mash", "sketch", "-i", "-o", sketch_file, f_name] # mash sketch -i -o prueba.msh lmo0003.fasta # mash_sketch_command += list(self.selected_locus.keys()) - - mash_sketch_result = subprocess.run(mash_sketch_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + mash_sketch_result = subprocess.run( + mash_sketch_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) # Get pairwise allele sequences mash distances # mash_distance_command = ["mash", "dist", sketch_path, sketch_path] - mash_distance_command = ["mash", "triangle", "-i", "reference.msh"] - mash_distance_result = subprocess.Popen(mash_distance_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mash_distance_command = ["mash", "triangle", "-i", "reference.msh"] + mash_distance_result = subprocess.Popen( + mash_distance_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) # pdb.set_trace() out, err = mash_distance_result.communicate() - with open ("matrix_distance.tsv", "w") as fo: + with open("matrix_distance.tsv", "w") as fo: # adding alleles to create a heading # the value are not required to be in order, just only any name and the right length - fo.write( "alleles\t" + "\t".join(list(self.selected_locus.keys())) + "\n") + fo.write("alleles\t" + "\t".join(list(self.selected_locus.keys())) + "\n") fo.write(out.decode("UTF-8")) import pandas as pd + locus_num = len(self.selected_locus) # pdb.set_trace() matrix_df = pd.read_csv("matrix_distance.tsv", sep="\t").fillna(value=0) # remove the first line of the matrix that contain only the number of alleles matrix_df = matrix_df.drop(0) - locus_list = matrix_df.iloc[0:locus_num,0] - matrix_np = matrix_df.iloc[:,1:].to_numpy() + locus_list = matrix_df.iloc[0:locus_num, 0] + matrix_np = matrix_df.iloc[:, 1:].to_numpy() # convert the triangular matrix to mirror up triangular part t_matrix_np = matrix_np.transpose() - matrix_np = t_matrix_np + matrix_np + matrix_np = t_matrix_np + matrix_np # values_np = matrix_df.iloc[:,2].to_numpy() - + # matrix_np = values_np.reshape(locus_num, locus_num) # out = out.decode('UTF-8').split('\n') from sklearn.cluster import AgglomerativeClustering - clusterer = AgglomerativeClustering(n_clusters=7, metric="precomputed", linkage="average", 
distance_threshold=None) + + clusterer = AgglomerativeClustering( + n_clusters=7, + metric="precomputed", + linkage="average", + distance_threshold=None, + ) clusters = clusterer.fit_predict(matrix_np) # clustering = AgglomerativeClustering(affinity="precomputed").fit(matrix_np) - mean_distance =np.mean(matrix_np, 0) + mean_distance = np.mean(matrix_np, 0) std = np.std(matrix_np) min_mean = min(mean_distance) mean_all_alleles = np.mean(mean_distance) - max_mean= max(mean_distance) + max_mean = max(mean_distance) # find the index with the minimum mean distance - min_mean_idx= np.where(mean_distance==float(min_mean))[0][0] + min_mean_idx = np.where(mean_distance == float(min_mean))[0][0] # create fasta file with the allele min_allele = self.selected_locus[locus_list[min_mean_idx]] record_allele_folder = os.path.join(os.getcwd(), f_name.split(".")[0]) - min_allele_file = taranis.utils.write_fasta_file(record_allele_folder,min_allele, locus_list[min_mean_idx]) + min_allele_file = taranis.utils.write_fasta_file( + record_allele_folder, min_allele, locus_list[min_mean_idx] + ) # pdb.set_trace() # find the index whose value is closest to the overall mean - mean_all_closser_value = taranis.utils.find_nearest_numpy_value(mean_distance, mean_all_alleles) - mean_all_alleles_idx= np.where(mean_distance==float(mean_all_closser_value))[0][0] + mean_all_closser_value = taranis.utils.find_nearest_numpy_value( + mean_distance, mean_all_alleles + ) + mean_all_alleles_idx = np.where(mean_distance == float(mean_all_closser_value))[ + 0 + ][0] # create fasta file with the allele mean_allele = self.selected_locus[locus_list[mean_all_alleles_idx]] # record_allele_folder = os.path.join(mash_folder, f_name) - mean_allele_file = taranis.utils.write_fasta_file(record_allele_folder,mean_allele, locus_list[mean_all_alleles_idx]) - + mean_allele_file = taranis.utils.write_fasta_file( + record_allele_folder, mean_allele, locus_list[mean_all_alleles_idx] + ) + # find the index with the largest mean distance - max_mean_idx= np.where(mean_distance==float(max_mean))[0][0] + max_mean_idx = np.where(mean_distance == float(max_mean))[0][0] # create fasta file with the allele max_allele = self.selected_locus[locus_list[max_mean_idx]] - max_allele_file = taranis.utils.write_fasta_file(record_allele_folder,max_allele, locus_list[max_mean_idx]) - + max_allele_file = taranis.utils.write_fasta_file( + record_allele_folder, max_allele, locus_list[max_mean_idx] + ) # Pick an outlier, lmo0002_185, to look at the distance outlier_allele = self.selected_locus[locus_list[184]] - outlier_allele_file = taranis.utils.write_fasta_file(record_allele_folder,outlier_allele, locus_list[184]) + outlier_allele_file = taranis.utils.write_fasta_file( + record_allele_folder, outlier_allele, locus_list[184] + ) - # pick a second outlier, lmo0002_95, which belongs to cluster 1 + # pick a second outlier, lmo0002_95, which belongs to cluster 1 outlier2_allele = self.selected_locus[locus_list[95]] - outlier2_allele_file = taranis.utils.write_fasta_file(record_allele_folder,outlier2_allele, locus_list[95]) - - # pick a third outlier, lmo0002_185, which belongs to cluster 4 + outlier2_allele_file = taranis.utils.write_fasta_file( + record_allele_folder, outlier2_allele, locus_list[95] + ) + + # pick a third outlier, lmo0002_185, which belongs to cluster 4 + outlier3_allele = self.selected_locus[locus_list[185]] - outlier3_allele_file = 
taranis.utils.write_fasta_file( + record_allele_folder, outlier3_allele, locus_list[185] + ) # get a list of how many times each value is repeated np.bincount(clusters) blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"' from Bio.Blast.Applications import NcbiblastnCommandline + # Create local BLAST database for all alleles in the locus db_name = "/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/blast/locus_db" # db_name = os.path.join("blast", 'locus_blastdb') @@ -169,16 +197,29 @@ def create_matrix_distance(self): # taranis.utils.create_blastdb(fasta_file, db_name, 'nucl', logger): # locus_db_name = os.path.join(db_name, f_name[0], f_name[0]) # query_data= self.selected_locus["lmo0002_1"] - # All alleles in locus VS reference allele chosen (centroid) BLAST - + # All alleles in locus VS reference allele chosen (centroid) BLAST + # ref_query_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/query.fasta" # cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=100, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=0, max_hsps=0, num_threads=4, query=ref_query_file) - # minimum distance + # minimum distance # min_dist_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_610" # pdb.set_trace() min_dist_file = os.path.join(record_allele_folder, min_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=min_dist_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=min_dist_file, + ) out, err = cline() min_dist_lines = out.splitlines() min_dist_alleles = [] @@ -187,11 +228,24 @@ def create_matrix_distance(self): min_np = np.array(min_dist_alleles) # pdb.set_trace() print("matches with min distance: ", len(min_dist_lines)) - print("Not coverage using as reference" , np.setdiff1d(locus_list, min_np)) + print("Not coverage using as reference", np.setdiff1d(locus_list, min_np)) # mean distance. 
Yields 133 matches # mean_dist_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_870" - mean_dist_file = os.path.join(record_allele_folder, mean_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=mean_dist_file) + mean_dist_file = os.path.join(record_allele_folder, mean_allele_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=mean_dist_file, + ) out, err = cline() mean_dist_lines = out.splitlines() mean_dist_alleles = [] @@ -199,12 +253,25 @@ mean_dist_alleles.append(mean_dist.split("\t")[1]) mean_np = np.array(mean_dist_alleles) print("matches with mean distance: ", len(mean_dist_lines)) - print("Not coverage using as reference" , np.setdiff1d(locus_list, mean_np)) - - # maximum distance, + print("Not coverage using as reference", np.setdiff1d(locus_list, mean_np)) + + # maximum distance, # ref_query_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_216" - max_dist_file = os.path.join(record_allele_folder, max_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=max_dist_file) + max_dist_file = os.path.join(record_allele_folder, max_allele_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=max_dist_file, + ) out, err = cline() max_dist_lines = out.splitlines() max_dist_alleles = [] @@ -212,12 +279,25 @@ max_dist_alleles.append(max_dist.split("\t")[1]) max_np = np.array(max_dist_alleles) print("matches with max distance: ", len(max_dist_lines)) - print("Not coverage using as reference" , np.setdiff1d(locus_list, max_np)) - - # choosing one of the outliers, + print("Not coverage using as reference", np.setdiff1d(locus_list, max_np)) + + # choosing one of the outliers, # outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183" - outlier_file = os.path.join(record_allele_folder, outlier_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier_file) + outlier_file = os.path.join(record_allele_folder, outlier_allele_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=outlier_file, + ) out, err = cline() outlier_lines = out.splitlines() outlier_alleles = [] @@ -226,14 +306,27 @@ outlier_np = np.array(outlier_alleles) print("matches with outlier distance: ", len(outlier_lines)) - print("Alleles added using outlier as reference" , outlier_np) + print("Alleles added using outlier as 
reference", outlier_np) new_ref_np = np.unique(np.concatenate((min_np, outlier_np), axis=0)) print("\n", "remaining alleles ", np.setdiff1d(locus_list, new_ref_np)) - # choosing the second of the outliers, + # choosing the second of the outliers, # outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183" - outlier2_file = os.path.join(record_allele_folder, outlier2_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier2_file) + outlier2_file = os.path.join(record_allele_folder, outlier2_allele_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=outlier2_file, + ) out, err = cline() outlier2_lines = out.splitlines() outlier2_alleles = [] @@ -243,12 +336,29 @@ print("matches with second outlier distance: ", len(outlier2_lines)) # print("Alleles added using second outlier as reference" , outlier2_np) upd_new_ref_np = np.unique(np.concatenate((new_ref_np, outlier2_np), axis=0)) - print("\n", "remaining alleles after second outlier", np.setdiff1d(locus_list, upd_new_ref_np)) + print( + "\n", + "remaining alleles after second outlier", + np.setdiff1d(locus_list, upd_new_ref_np), + ) - # choosing the third of the outliers, + # choosing the third of the outliers, # outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183" - outlier3_file = os.path.join(record_allele_folder, outlier3_allele_file) - cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier3_file) + outlier3_file = os.path.join(record_allele_folder, outlier3_allele_file) + cline = NcbiblastnCommandline( + db=db_name, + evalue=0.001, + perc_identity=90, + reward=1, + penalty=-2, + gapopen=1, + gapextend=1, + outfmt=blast_parameters, + max_target_seqs=1100, + max_hsps=1000, + num_threads=4, + query=outlier3_file, + ) out, err = cline() outlier3_lines = out.splitlines() outlier3_alleles = [] @@ -257,11 +367,16 @@ outlier3_np = np.array(outlier3_alleles) print("matches with third outlier distance: ", len(outlier3_lines)) # print("Alleles added using second outlier as reference" , outlier2_np) - upd2_new_ref_np = np.unique(np.concatenate((upd_new_ref_np, outlier3_np), axis=0)) - print("\n", "remaining alleles after second outlier", np.setdiff1d(locus_list, upd2_new_ref_np)) - - print("\n Still missing " ,len( np.setdiff1d(locus_list, upd2_new_ref_np))) + upd2_new_ref_np = np.unique( + np.concatenate((upd_new_ref_np, outlier3_np), axis=0) + ) + print( + "\n", + "remaining alleles after third outlier", + np.setdiff1d(locus_list, upd2_new_ref_np), + ) + print("\n Still missing ", len(np.setdiff1d(locus_list, upd2_new_ref_np))) pdb.set_trace() @@ -270,13 +385,11 @@ def create_matrix_distance(self): # X = np.array([[0, 2, 3], [2, 0, 3], [3, 3, 0]]) # clustering = AgglomerativeClustering(affinity="precomputed").fit(X) - def create_ref_alleles(self): self.records = taranis.utils.read_fasta_file(self.fasta_file) _ = self.check_locus_quality() # 
pdb.set_trace() # Prepare data to use mash to create the distance matrix _ = self.create_matrix_distance() - - pass \ No newline at end of file + pass diff --git a/taranis/utils.py b/taranis/utils.py index f41f297..bd1b09a 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -4,16 +4,17 @@ """ import glob - +import io import logging import numpy as np import questionary import os import plotly.graph_objects as go +import re import rich.console import shutil import subprocess - +import tarfile import sys @@ -22,8 +23,10 @@ from Bio.SeqRecord import SeqRecord import pdb + log = logging.getLogger(__name__) + def rich_force_colors(): """ Check if any environment variables are set to force Rich to use coloured output @@ -35,6 +38,8 @@ def rich_force_colors(): ): return True return None + + stderr = rich.console.Console( stderr=True, style="dim", @@ -42,39 +47,73 @@ def rich_force_colors(): force_terminal=rich_force_colors(), ) -START_CODON_FORWARD= ['ATG','ATA','ATT','GTG', 'TTG'] -start_codon_reverse= ['CAT', 'TAT','AAT','CAC','CAA'] +START_CODON_FORWARD = ["ATG", "ATA", "ATT", "GTG", "TTG"] +START_CODON_REVERSE = ["CAT", "TAT", "AAT", "CAC", "CAA"] + +STOP_CODON_FORWARD = ["TAA", "TAG", "TGA"] +STOP_CODON_REVERSE = ["TTA", "CTA", "TCA"] -STOP_CODON_FORWARD = ['TAA', 'TAG','TGA'] -stop_codon_reverse = ['TTA', 'CTA','TCA'] +POSIBLE_BAD_QUALITY = [ + "not a start codon", + "not a stop codon", + "Extra in frame stop codon", + "is not a multiple of three", + "Duplicate allele", + "Sub set allele", +] -POSIBLE_BAD_QUALITY = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"] -def check_sequence_order(allele_sequence): +def get_seq_direction(allele_sequence): # check direction - if allele_sequence[0:3] in START_CODON_FORWARD or allele_sequence[-3:] in STOP_CODON_FORWARD: - return 'forward' - if allele_sequence[-3:] in start_codon_reverse or allele_sequence[0:3] in stop_codon_reverse: - return 'reverse' + if ( + allele_sequence[0:3] in START_CODON_FORWARD + or allele_sequence[-3:] in STOP_CODON_FORWARD + ): + return "forward" + if ( + allele_sequence[-3:] in START_CODON_REVERSE + or allele_sequence[0:3] in STOP_CODON_REVERSE + ): + return "reverse" return "Error" -def create_annotation_files(fasta_file, annotation_dir, prefix, genus="Genus", species="species", usegenus=False): + +def create_annotation_files( + fasta_file, + annotation_dir, + prefix, + genus="Genus", + species="species", + usegenus=False, + cpus=3, +): try: - _ = subprocess.run (['prokka', fasta_file, '--force', '--outdir', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet']) + _ = subprocess.run( + [ + "prokka", + fasta_file, + "--force", + "--outdir", + annotation_dir, + "--genus", + genus, + "--species", + species, + "--usegenus", + str(usegenus), + "--gcode", + "11", + "--prefix", + prefix, + "--cpus", + str(cpus), + "--quiet", + ] + ) except Exception as e: - log.error("Unable to run prokka. Error message: %s ", e ) + log.error("Unable to run prokka. Error message: %s ", e) stderr.print("[red] Unable to run prokka. 
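
Note that get_seq_direction classifies orientation purely from the terminal codons, so it is cheap but heuristic: a sequence that merely happens to end in TAA/TAG/TGA is still called forward. Illustrative calls, assuming the constants defined above:

    get_seq_direction("ATGAAATAA")  # "forward": starts with the start codon ATG
    get_seq_direction("TTATTTCAT")  # "reverse": reverse complement of the above
    get_seq_direction("CCCCCCCCC")  # "Error": no recognisable start/stop codon
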
Given error; " + e) sys.exit(1) - # Check that prokka store files in the requested folder - # if prokka results are not found in the requested folder then move from the - # running directory to the right one - if not folder_exists(annotation_dir): - try: - shutil.move(prefix, annotation_dir) - except Exception as e: - log.error("Unable to move prokka result folder to %s ", e ) - stderr.print("[red] Unable to move result prokka folder. Error; " + e) - sys.exit(1) return os.path.join(annotation_dir, prefix) @@ -88,14 +127,19 @@ def create_new_folder(folder_name): return -def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title ): +def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title): fig = go.Figure() # pdb.set_trace() if mode == "lines": fig.add_trace(go.Scatter(x=x_data, y=y_data, mode=mode, name=title)) elif mode == "pie": fig.add_trace(go.Pie(labels=labels, values=x_data)) - fig.update_layout(title_text= title) + elif mode == "bar": + fig.add_trace(go.Bar(x=x_data, y=y_data)) + elif mode == "box": + fig.add_trace(go.Box(y=y_data)) + + fig.update_layout(title_text=title) fig.write_image(os.path.join(out_folder, f_name)) @@ -106,7 +150,7 @@ def get_files_in_folder(folder, extension=None): Args: folder (string): folder path extension (string, optional): extension for filtering. Defaults to None. - + Returns: list: list of files which match the condition """ @@ -116,14 +160,16 @@ def get_files_in_folder(folder, extension=None): sys.exit(1) if extension is None: extension = "*" - folder_files = os.path.join(folder , "*." + extension) + folder_files = os.path.join(folder, "*." + extension) files_in_folder = glob.glob(folder_files) if len(files_in_folder) == 0: - log.error("Folder %s does not have any file which the extension %s", folder, extension) + log.error( + "Folder %s does not have any file which the extension %s", folder, extension + ) stderr.print("[red] Folder does not have any file which match your request") sys.exit(1) return files_in_folder - + def file_exists(file_to_check): """Checks if input file exists @@ -133,28 +179,18 @@ def file_exists(file_to_check): Returns: boolean: True if exists - """ + """ if os.path.isfile(file_to_check): return True return False -def find_multiple_stop_codons(seq) : - stop_codons = ['TAA', 'TAG','TGA'] - c_index = [] - for idx in range (0, len(seq) -2, 3) : - c_seq = seq[idx : idx + 3] - if c_seq in stop_codons : - c_index.append(idx) - if len(c_index) == 1: - return False - return True - def find_nearest_numpy_value(array, value): array = np.asarray(array) idx = (np.abs(array - value)).argmin() return array[idx] + def folder_exists(folder_to_check): """Checks if input folder exists @@ -163,15 +199,17 @@ def folder_exists(folder_to_check): Returns: boolean: True if exists - """ + """ if os.path.isdir(folder_to_check): return True return False + def prompt_text(msg): source = questionary.text(msg).unsafe_ask() return source + def query_user_yes_no(question, default): """Query the user to choose yes or no for the query question @@ -180,8 +218,8 @@ def query_user_yes_no(question, default): default (string): default option to be used: yes or no Returns: - user select: True continue with code - """ + user select: True continue with code + """ valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} if default is None: prompt = " [y/n] " @@ -204,36 +242,37 @@ def query_user_yes_no(question, default): else: sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") -def 
@@ -204,36 +242,37 @@
     else:
         sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
 
-def read_annotation_file(ann_file, allele_name, only_first_line=True):
-    """ example of annotation file
-    locus_tag	ftype	length_bp	gene	EC_number	COG	product
-    IEKBEMEO_00001	CDS	1344	yeeO_1		COG0534	putative FMN/FAD exporter YeeO
-    IEKBEMEO_00002	CDS	1344	yeeO_2		COG0534	putative FMN/FAD exporter YeeO
+
+def read_annotation_file(ann_file):
+    """example of annotation file
+
+    lmo0002_782	Prodigal:002006	CDS	1	1146	.	+	0	ID=OJGEGONH_00782;Name=dnaN_782;db_xref=COG:COG0592;gene=dnaN_782;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P05649;locus_tag=OJGEGONH_00782;product=Beta sliding clamp
+    lmo0002_783	Prodigal:002006	CDS	1	1146	.	+	0	ID=OJGEGONH_00783;Name=dnaN_783;db_xref=COG:COG0592;gene=dnaN_783;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P05649;locus_tag=OJGEGONH_00783;product=Beta sliding clamp
+    lmo0049_3	Prodigal:002006	CDS	1	162	.	+	0	ID=CODOCEEL_00001;inference=ab initio prediction:Prodigal:002006;locus_tag=CODOCEEL_00001;product=hypothetical protein
+    lmo0049_6	Prodigal:002006	CDS	1	162	.	+	0	ID=CODOCEEL_00002;inference=ab initio prediction:Prodigal:002006;locus_tag=CODOCEEL_00002;product=hypothetical protein
     """
     ann_data = {}
-    with open (ann_file, "r") as fh:
+    with open(ann_file, "r") as fh:
         lines = fh.readlines()
-    heading = lines[0].split("\t")
-    locus_tag_idx = heading.index("locus_tag")
-    gene_idx = heading.index("gene")
-    if only_first_line:
-        first_line = lines[1].split("\t")
-        ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka"
-    else:
-        # Return all annotation lines
-        for line in lines[1:]:
-            s_line = line.strip().split("\t")
-            allele_key = allele_name + "_" + s_line[locus_tag_idx].split("_")[1]
-            ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka"
-    return ann_data
+    for line in lines:
+        if "Prodigal" in line:
+            gene_match = re.search(r"(.*)[\t]Prodigal.*gene=(\w+)_.*", line)
+            if gene_match:
+                ann_data[gene_match.group(1)] = gene_match.group(2)
+            else:
+                pred_match = re.search(r"(.*)[\t]Prodigal.*product=(\w+)_.*", line)
+                if pred_match:
+                    ann_data[pred_match.group(1)] = pred_match.group(2).strip()
+        if "fasta" in line:
+            break
+    return ann_data
 
 def read_fasta_file(fasta_file):
     return SeqIO.parse(fasta_file, "fasta")
-    
+
 
 def write_fasta_file(out_folder, seq_data, allele_name=None, f_name=None):
     try:
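The rewritten read_annotation_file drops the old Prokka .tsv layout (and its allele_name and only_first_line parameters): it now scans the annotation file for Prodigal CDS lines, capturing the gene name, or failing that the product, with a regex, and stops once it hits the trailing fasta section. A sketch of what the first pattern captures, using a tab-separated line adapted (and shortened) from the docstring above:

import re

line = (
    "lmo0002_782\tProdigal:002006\tCDS\t1\t1146\t.\t+\t0\t"
    "ID=OJGEGONH_00782;Name=dnaN_782;gene=dnaN_782;"
    "locus_tag=OJGEGONH_00782;product=Beta sliding clamp"
)
gene_match = re.search(r"(.*)[\t]Prodigal.*gene=(\w+)_.*", line)
if gene_match:
    # group(1): allele id before the tab; group(2): gene name without its numeric suffix
    print(gene_match.group(1), "->", gene_match.group(2))  # lmo0002_782 -> dnaN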
@@ -246,22 +285,55 @@
             # use the fasta name as file name
             f_name = key + ".fasta"
             f_path_name = os.path.join(out_folder, f_name)
-            with open (f_path_name, "w") as fo:
+            with open(f_path_name, "w") as fo:
                 fo.write(">" + key + "\n")
                 fo.write(seq)
     else:
         if f_name is None:
             f_name = allele_name
         f_path_name = os.path.join(out_folder, f_name)
-        with open (f_path_name, "w") as fo:
+        with open(f_path_name, "w") as fo:
             fo.write(">" + allele_name + "\n")
             fo.write(seq_data)
     return f_name
 
-def write_data_to_file(out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"):
-    f_path_name = os.path.join(out_folder,f_name)
+
+def write_data_to_compress_filed(out_folder, f_name, dump_data):
+    with io.BytesIO() as buffer:
+        with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
+            # Add data to the tar archive
+            tarinfo = tarfile.TarInfo(f_name)
+            # Write the dumped string as the single member of the archive
+            data_bytes = dump_data.encode("utf-8")
+            tarinfo.size = len(data_bytes)
+            tar.addfile(tarinfo, io.BytesIO(data_bytes))
+
+        # Get the content of the in-memory tar.gz file
+        buffer.seek(0)
+        tar_data = buffer.read()
+    file_path_name = os.path.join(out_folder, Path(f_name).stem + ".tar.gz")
+    with open(file_path_name, "wb") as fo:
+        fo.write(tar_data)
+
+
+def write_data_to_file(
+    out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"
+):
+    f_path_name = os.path.join(out_folder, f_name)
     if data_type == "pandas":
-        data.to_csv(f_path_name, sep=",",header=include_header)
+        data.to_csv(f_path_name, sep=",", header=include_header)
     return
+
+
+"""
+def find_multiple_stop_codons(seq) :
+    stop_codons = ['TAA', 'TAG','TGA']
+    c_index = []
+    for idx in range (0, len(seq) -2, 3) :
+        c_seq = seq[idx : idx + 3]
+        if c_seq in stop_codons :
+            c_index.append(idx)
+    if len(c_index) == 1:
+        return False
+    return True
+"""
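The new write_data_to_compress_filed builds the gzip-compressed tar archive entirely in memory and only writes it to disk at the end, so no temporary member file is created; the archive takes the stem of f_name while the member keeps the full name. A minimal round-trip sketch, assuming this branch of taranis is importable (the file name and CSV payload are invented):

import os
import tarfile
import tempfile

from taranis.utils import write_data_to_compress_filed

with tempfile.TemporaryDirectory() as tmp:
    write_data_to_compress_filed(tmp, "allele_annotation.csv", "locus,gene\nlmo0002,dnaN\n")
    # archive named after Path(f_name).stem; the member inside keeps the .csv name
    with tarfile.open(os.path.join(tmp, "allele_annotation.tar.gz"), "r:gz") as tar:
        print(tar.extractfile("allele_annotation.csv").read().decode("utf-8"))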