diff --git a/.github/workflows/dockerhub_push_release.yml b/.github/workflows/dockerhub_push_release.yml
deleted file mode 100644
index e8b6638..0000000
--- a/.github/workflows/dockerhub_push_release.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: deploy release
-# This builds the docker image and pushes it to DockerHub
-on:
- release:
- types: [published]
-jobs:
- push_dockerhub:
- name: Push new Docker image to Docker Hub (release)
- runs-on: ubuntu-latest
- # Only run for the official repo, for releases and merged PRs
- if: ${{ github.repository == 'BU-ISCIII/taranis' }}
- env:
- DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
- DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
- steps:
- - name: Check out pipeline code
- uses: actions/checkout@v2
-
- - name: Build new docker image
- run: docker build --no-cache . -t buisciii/taranis:${{ github.event.release.tag_name }}
-
- - name: Push Docker image to DockerHub (develop)
- run: |
- echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
- docker push buisciii/taranis:${{ github.event.release.tag_name }}
diff --git a/.github/workflows/python_lint.yml b/.github/workflows/python_lint.yml
new file mode 100644
index 0000000..9d043bb
--- /dev/null
+++ b/.github/workflows/python_lint.yml
@@ -0,0 +1,35 @@
+name: python_lint
+
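+# run flake8 and Black checks whenever Python sources change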
+on:
+ push:
+ paths:
+ - '**.py'
+ pull_request:
+ paths:
+ - '**.py'
+
+jobs:
+ flake8_py3:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Setup Python
+ uses: actions/setup-python@v1
+ with:
+ python-version: 3.9.x
+ architecture: x64
+      - name: Checkout repository
+ uses: actions/checkout@master
+ - name: Install flake8
+ run: pip install flake8
+ - name: Run flake8
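+        # E501, W503 and E203 clash with Black's formatting; W605 flags invalid escape sequences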
+ run: flake8 --ignore E501,W503,E203,W605
+
+ black_lint:
+ runs-on: ubuntu-latest
+ steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Install black with jupyter support
+        run: pip install "black[jupyter]"
+ - name: Check code lints with Black
+ uses: psf/black@stable
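+        # the stable action defaults to running Black in check mode on the repository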
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
deleted file mode 100644
index ed66541..0000000
--- a/.github/workflows/tests.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: tests ci
-# This workflow runs the pipeline with the minimal test dataset to check that it completes any errors
-on:
- push:
- branches: [develop]
- pull_request_target:
- branches: [develop]
- release:
- types: [published]
-
-jobs:
- push_dockerhub:
- name: Push new Docker image to Docker Hub (dev)
- runs-on: ubuntu-latest
- # Only run for the official repo, for releases and merged PRs
- if: ${{ github.repository == 'BU-ISCIII/taranis' }}
- env:
- DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
- DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
- steps:
- - name: Check out pipeline code
- uses: actions/checkout@v2
-
- - name: Build new docker image
- run: docker build --no-cache . -t buisciii/taranis:dev
-
- - name: Push Docker image to DockerHub (develop)
- run: |
- echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
- docker push buisciii/taranis:dev
- run-tests:
- name: Run tests
- needs: push_dockerhub
- runs-on: ubuntu-latest
- steps:
- - name: Run pipeline with test data
- run: |
- docker run buisciii/taranis:dev bash -c /opt/taranis/test/test.sh
diff --git a/setup.py b/setup.py
index 4b6fad4..eda9691 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
from setuptools import setup, find_packages
-version = "2.2.0"
+version = "3.0.0"
with open("README.md") as f:
readme = f.read()
diff --git a/taranis/__main__.py b/taranis/__main__.py
index d08cb47..a281777 100644
--- a/taranis/__main__.py
+++ b/taranis/__main__.py
@@ -52,7 +52,7 @@ def run_taranis():
)
# stderr.print("[green] `._,._,'\n", highlight=False)
- __version__ = "2.1.0"
+ __version__ = "3.0.0"
stderr.print(
"\n" "[grey39] Taranis version {}".format(__version__), highlight=False
)
@@ -166,6 +166,12 @@ def taranis_cli(verbose, log_file):
default=False,
help="Remove no CDS alleles from the schema.",
)
+@click.option(
+ "--output-allele-annot/--no-output-allele-annot",
+ required=False,
+ default=True,
+ help="get extension annotation for all alleles in locus",
+)
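+# the paired "--opt/--no-opt" form makes click expose both enable and disable switches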
@click.option(
"--genus",
required=False,
@@ -184,29 +190,41 @@ def taranis_cli(verbose, log_file):
default="Genus",
help="Use genus-specific BLAST databases for Prokka schema genes annotation (needs --genus). Default is False.",
)
+@click.option(
+ "--cpus",
+ required=False,
+ multiple=False,
+ type=int,
+ default=1,
+ help="Number of cpus used for execution",
+)
def analyze_schema(
inputdir,
output,
remove_subset,
remove_duplicated,
remove_no_cds,
+ output_allele_annot,
genus,
species,
usegenus,
+ cpus,
):
schema_files = taranis.utils.get_files_in_folder(inputdir, "fasta")
"""
- schema_analyze = {}
+ schema_analyze = []
for schema_file in schema_files:
schema_obj = taranis.analyze_schema.AnalyzeSchema(schema_file, output, remove_subset, remove_duplicated, remove_no_cds, genus, species, usegenus)
- schema_analyze.update(schema_obj.analyze_allele_in_schema())
-
- """
+ schema_analyze.append(schema_obj.analyze_allele_in_schema())
+ import pdb; pdb.set_trace()
+ _ = taranis.analyze_schema.collect_statistics(schema_analyze, output, output_allele_annot)
+ sys.exit(0)
# for schema_file in schema_files:
+ """
results = []
start = time.perf_counter()
- with concurrent.futures.ProcessPoolExecutor() as executor:
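+    # bound the worker pool by the user-supplied --cpus value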
+ with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor:
futures = [
executor.submit(
taranis.analyze_schema.parallel_execution,
@@ -224,10 +242,11 @@ def analyze_schema(
# Collect results as they complete
for future in concurrent.futures.as_completed(futures):
results.append(future.result())
- _ = taranis.analyze_schema.collect_statistics(results, output)
+ _ = taranis.analyze_schema.collect_statistics(results, output, output_allele_annot)
finish = time.perf_counter()
print(f"Schema analyze finish in {round((finish-start)/60, 2)} minutes")
+
# Reference alleles
@taranis_cli.command(help_priority=2)
@click.option(
diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py
index 20ef08c..d9a1e99 100644
--- a/taranis/allele_calling.py
+++ b/taranis/allele_calling.py
@@ -5,12 +5,14 @@
import taranis.utils
import taranis.blast
+
# import numpy
import pandas as pd
from pathlib import Path
import pdb
+
log = logging.getLogger(__name__)
stderr = rich.console.Console(
stderr=True,
@@ -19,6 +21,7 @@
force_terminal=taranis.utils.rich_force_colors(),
)
+
class AlleleCalling:
def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
self.prediction = prediction
@@ -27,9 +30,25 @@ def __init__(self, prediction, sample_file, schema, reference_alleles, out_folde
self.ref_alleles = reference_alleles
self.out_folder = out_folder
self.s_name = Path(sample_file).stem
- self.blast_dir = os.path.join(out_folder,"blastdb")
+ self.blast_dir = os.path.join(out_folder, "blastdb")
self.blast_sample = os.path.join(self.blast_dir, self.s_name)
- self.blast_heading = ["qseqid", "sseqid", "pident", "qlen", "length", "mismatch", "gapopen", "evalue", "bitscore", "sstart", "send", "qstart", "qend", "sseq", "qseq"]
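+        # column names applied to the tabular BLAST output parsed in search_alleles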
+ self.blast_heading = [
+ "qseqid",
+ "sseqid",
+ "pident",
+ "qlen",
+ "length",
+ "mismatch",
+ "gapopen",
+ "evalue",
+ "bitscore",
+ "sstart",
+ "send",
+ "qstart",
+ "qend",
+ "sseq",
+ "qseq",
+ ]
def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene):
"""_summary_
@@ -39,23 +58,22 @@ def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene)
            allele_name (_type_): name of the locus/reference allele
            sample_contig (_type_): sample contig id where the match was found
            schema_gene (_type_): path to the schema fasta file for the locus
- """
+ """
s_alleles_blast = taranis.blast.Blast("nucl")
ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles")
query_path = os.path.join(self.out_folder, "tmp", allele_name)
- # Write to file the sequence to find out the loci name that fully match
+        # Write the sequence to a file to find the locus name that fully matches
f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name)
query_file = os.path.join(query_path, f_name)
_ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir)
- # Blast with sample sequence to find the allele in the schema
+ # Blast with sample sequence to find the allele in the schema
seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100)
pdb.set_trace()
        if len(seq_blast_match) > 1:
- # allele is named as NIPHEM
-
+ # allele is named as NIPHEM
+
            # Run a BLAST with this query sequence against the allele database
# Create blast db with sample file
-
pass
elif len(seq_blast_match) == 1:
@@ -63,14 +81,13 @@ def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene)
else:
pass
-
- def search_alleles (self, ref_allele):
+ def search_alleles(self, ref_allele):
allele_name = Path(ref_allele).stem
- schema_gene = os.path.join(self.schema, allele_name + ".fasta")
+ schema_gene = os.path.join(self.schema, allele_name + ".fasta")
# run blast with sample as db and reference allele as query
sample_blast_match = self.sample_blast.run_blast(ref_allele)
- if len(sample_blast_match) > 0 :
+ if len(sample_blast_match) > 0:
pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match])
pd_lines.columns = self.blast_heading
pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric)
@@ -84,16 +101,17 @@ def search_alleles (self, ref_allele):
# sel_row = np_lines[mask, :] = np_lines[mask, :]
# query_seq = sel_row[0,14]
sample_contig = sel_max["sseqid"]
- abbr = self.assign_allele_type(query_seq, allele_name, sample_contig, schema_gene)
+ abbr = self.assign_allele_type(
+ query_seq, allele_name, sample_contig, schema_gene
+ )
else:
# Sample does not have a reference allele to be matched
# Keep LNF info
            # see espe's code
- #lnf_tpr_tag()
+ # lnf_tpr_tag()
pass
pdb.set_trace()
-
def analyze_sample(self):
# Create blast db with sample file
self.sample_blast = taranis.blast.Blast("nucl")
@@ -107,4 +125,3 @@ def analyze_sample(self):
pdb.set_trace()
return
-
diff --git a/taranis/allele_calling_old.py b/taranis/allele_calling_old.py
index 72d3294..e8be72f 100644
--- a/taranis/allele_calling_old.py
+++ b/taranis/allele_calling_old.py
@@ -28,16 +28,24 @@
import plotly.graph_objects as go
-def check_blast (reference_allele, sample_files, db_name, logger) : ## N
+def check_blast(reference_allele, sample_files, db_name, logger): ## N
for s_file in sample_files:
- f_name = os.path.basename(s_file).split('.')
+ f_name = os.path.basename(s_file).split(".")
dir_name = os.path.dirname(s_file)
- blast_dir = os.path.join(dir_name, db_name,f_name[0])
- blast_db = os.path.join(blast_dir,f_name[0])
- if not os.path.exists(blast_dir) :
- logger.error('Blast db folder for sample %s does not exist', f_name)
+ blast_dir = os.path.join(dir_name, db_name, f_name[0])
+ blast_db = os.path.join(blast_dir, f_name[0])
+ if not os.path.exists(blast_dir):
+ logger.error("Blast db folder for sample %s does not exist", f_name)
return False
- cline = NcbiblastnCommandline(db=blast_db, evalue=0.001, outfmt=5, max_target_seqs=10, max_hsps=10,num_threads=1, query=reference_allele)
+ cline = NcbiblastnCommandline(
+ db=blast_db,
+ evalue=0.001,
+ outfmt=5,
+ max_target_seqs=10,
+ max_hsps=10,
+ num_threads=1,
+ query=reference_allele,
+ )
out, err = cline()
psiblast_xml = StringIO(out)
@@ -51,16 +59,18 @@ def check_blast (reference_allele, sample_files, db_name, logger) : ## N
alleleMatchid = int((blast_record.query_id.split("_"))[-1])
return True
+
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
# Parse samples and core genes schema fasta files to dictionary #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
-def parsing_fasta_file_to_dict (fasta_file, logger):
+
+def parsing_fasta_file_to_dict(fasta_file, logger):
fasta_dict = {}
fasta_dict_ordered = {}
for contig in SeqIO.parse(fasta_file, "fasta"):
fasta_dict[str(contig.id)] = str(contig.seq.upper())
- logger.debug('file %s parsed to dictionary', fasta_file)
+ logger.debug("file %s parsed to dictionary", fasta_file)
for key in sorted(list(fasta_dict.keys())):
fasta_dict_ordered[key] = fasta_dict[key]
@@ -71,9 +81,11 @@ def parsing_fasta_file_to_dict (fasta_file, logger):
# Get core genes schema info before allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
-#def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, logger):
-def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, species, usegenus, logger):
+# def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, logger):
+def prepare_core_gene(
+ core_gene_file_list, store_dir, ref_alleles_dir, genus, species, usegenus, logger
+):
## Initialize dict for keeping id-allele, quality, length variability, length statistics and annotation info for each schema core gene
alleles_in_locus_dict = {}
schema_quality = {}
@@ -81,13 +93,11 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s
schema_variability = {}
schema_statistics = {}
-
## Process each schema core gene
- blast_dir = os.path.join(store_dir,'blastdb')
- logger.info('start preparation of core genes files')
+ blast_dir = os.path.join(store_dir, "blastdb")
+ logger.info("start preparation of core genes files")
for fasta_file in core_gene_file_list:
-
- f_name = os.path.basename(fasta_file).split('.')
+ f_name = os.path.basename(fasta_file).split(".")
# Parse core gene fasta file and keep id-sequence info in dictionary
fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger)
@@ -96,8 +106,8 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s
alleles_in_locus_dict[f_name[0]] = fasta_file_parsed_dict
# dump fasta file into pickle file
- #with open (file_list[-1],'wb') as f:
- # pickle.dump(fasta_file_parsed_dict, f)
+ # with open (file_list[-1],'wb') as f:
+ # pickle.dump(fasta_file_parsed_dict, f)
# Get core gene alleles quality
locus_quality = check_core_gene_quality(fasta_file, logger)
@@ -106,63 +116,90 @@ def prepare_core_gene (core_gene_file_list, store_dir, ref_alleles_dir, genus, s
schema_quality[f_name[0]] = locus_quality
# Get gene and product annotation for core gene using reference allele(s)
- ref_allele = os.path.join(ref_alleles_dir, f_name[0] + '.fasta')
+ ref_allele = os.path.join(ref_alleles_dir, f_name[0] + ".fasta")
- gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, genus, species, usegenus, logger)
- #gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, logger)
+ gene_annot, product_annot = get_gene_annotation(
+ ref_allele, store_dir, genus, species, usegenus, logger
+ )
+ # gene_annot, product_annot = get_gene_annotation (ref_allele, store_dir, logger)
if f_name[0] not in annotation_core_dict.keys():
annotation_core_dict[f_name[0]] = {}
annotation_core_dict[f_name[0]] = [gene_annot, product_annot]
# Get core gene alleles length to keep length variability and statistics info
alleles_len = []
- for allele in fasta_file_parsed_dict :
+ for allele in fasta_file_parsed_dict:
alleles_len.append(len(fasta_file_parsed_dict[allele]))
- #alleles_in_locus = list (SeqIO.parse(fasta_file, "fasta")) ## parse
- #for allele in alleles_in_locus : ## parse
- #alleles_len.append(len(str(allele.seq))) ## parse
+ # alleles_in_locus = list (SeqIO.parse(fasta_file, "fasta")) ## parse
+ # for allele in alleles_in_locus : ## parse
+ # alleles_len.append(len(str(allele.seq))) ## parse
- schema_variability[f_name[0]]=list(set(alleles_len))
+ schema_variability[f_name[0]] = list(set(alleles_len))
if len(alleles_len) == 1:
stdev = 0
else:
stdev = statistics.stdev(alleles_len)
- schema_statistics[f_name[0]]=[statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)]
-
- return alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality
+ schema_statistics[f_name[0]] = [
+ statistics.mean(alleles_len),
+ stdev,
+ min(alleles_len),
+ max(alleles_len),
+ ]
+
+ return (
+ alleles_in_locus_dict,
+ annotation_core_dict,
+ schema_variability,
+ schema_statistics,
+ schema_quality,
+ )
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
# Get Prodigal training file from reference genome for samples gene prediction #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
-def prodigal_training(reference_genome_file, prodigal_dir, logger):
- f_name = os.path.basename(reference_genome_file).split('.')[0]
- prodigal_train_dir = os.path.join(prodigal_dir, 'training')
+def prodigal_training(reference_genome_file, prodigal_dir, logger):
+ f_name = os.path.basename(reference_genome_file).split(".")[0]
+ prodigal_train_dir = os.path.join(prodigal_dir, "training")
- output_prodigal_train_dir = os.path.join(prodigal_train_dir, f_name + '.trn')
+ output_prodigal_train_dir = os.path.join(prodigal_train_dir, f_name + ".trn")
if not os.path.exists(prodigal_train_dir):
try:
os.makedirs(prodigal_train_dir)
- logger.debug('Created prodigal directory for training file %s', f_name)
+ logger.debug("Created prodigal directory for training file %s", f_name)
except:
- logger.info('Cannot create prodigal directory for training file %s', f_name)
- print ('Error when creating the directory %s for training file', prodigal_train_dir)
+ logger.info("Cannot create prodigal directory for training file %s", f_name)
+            print(
+                f"Error when creating the directory {prodigal_train_dir} for training file"
+            )
exit(0)
- prodigal_command = ['prodigal' , '-i', reference_genome_file, '-t', output_prodigal_train_dir]
- prodigal_result = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
- # if prodigal_result.stderr:
- # logger.error('cannot create training file for %s', f_name)
- # logger.error('prodigal returning error code %s', prodigal_result.stderr)
- # return False
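+    # train Prodigal on the reference genome; -t writes the training file reused for every sample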
+ prodigal_command = [
+ "prodigal",
+ "-i",
+ reference_genome_file,
+ "-t",
+ output_prodigal_train_dir,
+ ]
+ prodigal_result = subprocess.run(
+ prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+
+ # if prodigal_result.stderr:
+ # logger.error('cannot create training file for %s', f_name)
+ # logger.error('prodigal returning error code %s', prodigal_result.stderr)
+ # return False
else:
- logger.info('Skeeping prodigal training file creation for %s, as it has already been created', f_name)
+ logger.info(
+ "Skeeping prodigal training file creation for %s, as it has already been created",
+ f_name,
+ )
return output_prodigal_train_dir
@@ -171,33 +208,59 @@ def prodigal_training(reference_genome_file, prodigal_dir, logger):
# Get Prodigal sample gene prediction #
# · * · * · * · * · * · * · * · * · * #
-def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger):
- f_name = '.'.join(os.path.basename(file_name).split('.')[:-1])
- prodigal_dir_sample = os.path.join(prodigal_dir,f_name)
+def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger):
+ f_name = ".".join(os.path.basename(file_name).split(".")[:-1])
+ prodigal_dir_sample = os.path.join(prodigal_dir, f_name)
- output_prodigal_coord = os.path.join(prodigal_dir_sample, f_name + '_coord.gff') ## no
- output_prodigal_prot = os.path.join(prodigal_dir_sample, f_name + '_prot.faa') ## no
- output_prodigal_dna = os.path.join(prodigal_dir_sample, f_name + '_dna.faa')
+ output_prodigal_coord = os.path.join(
+ prodigal_dir_sample, f_name + "_coord.gff"
+ ) ## no
+ output_prodigal_prot = os.path.join(
+ prodigal_dir_sample, f_name + "_prot.faa"
+ ) ## no
+ output_prodigal_dna = os.path.join(prodigal_dir_sample, f_name + "_dna.faa")
if not os.path.exists(prodigal_dir_sample):
try:
os.makedirs(prodigal_dir_sample)
- logger.debug('Created prodigal directory for Core Gene %s', f_name)
+ logger.debug("Created prodigal directory for Core Gene %s", f_name)
except:
- logger.info('Cannot create prodigal directory for Core Gene %s' , f_name)
- print ('Error when creating the directory %s for prodigal genes prediction', prodigal_dir_sample)
+ logger.info("Cannot create prodigal directory for Core Gene %s", f_name)
+            print(
+                f"Error when creating the directory {prodigal_dir_sample} for prodigal genes prediction"
+            )
exit(0)
- prodigal_command = ['prodigal' , '-i', file_name , '-t', prodigal_train_dir, '-f', 'gff', '-o', output_prodigal_coord, '-a', output_prodigal_prot, '-d', output_prodigal_dna]
- prodigal_result = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
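+    # run Prodigal with the training file: GFF coordinates (-o), proteins (-a) and DNA sequences (-d)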
+ prodigal_command = [
+ "prodigal",
+ "-i",
+ file_name,
+ "-t",
+ prodigal_train_dir,
+ "-f",
+ "gff",
+ "-o",
+ output_prodigal_coord,
+ "-a",
+ output_prodigal_prot,
+ "-d",
+ output_prodigal_dna,
+ ]
+ prodigal_result = subprocess.run(
+ prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
# if prodigal_result.stderr:
- # logger.error('cannot predict genes for %s ', f_name)
- # logger.error('prodigal returning error code %s', prodigal_result.stderr)
- #return False
+ # logger.error('cannot predict genes for %s ', f_name)
+ # logger.error('prodigal returning error code %s', prodigal_result.stderr)
+ # return False
else:
- logger.info('Skeeping prodigal genes prediction for %s, as it has already been made', f_name)
+ logger.info(
+ "Skeeping prodigal genes prediction for %s, as it has already been made",
+ f_name,
+ )
return True
@@ -206,64 +269,111 @@ def prodigal_prediction(file_name, prodigal_dir, prodigal_train_dir, logger):
# Get Prodigal predicted gene sequence equivalent to BLAST result matching bad quality allele or to no Exact Match BLAST result in allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def get_prodigal_sequence(blast_sseq, contig_blast_id, prodigal_directory, sample_name, blast_parameters, logger):
+def get_prodigal_sequence(
+ blast_sseq,
+ contig_blast_id,
+ prodigal_directory,
+ sample_name,
+ blast_parameters,
+ logger,
+):
prodigal_directory_sample = os.path.join(prodigal_directory, sample_name)
- genes_file = os.path.join(prodigal_directory_sample, sample_name + '_dna.faa')
+ genes_file = os.path.join(prodigal_directory_sample, sample_name + "_dna.faa")
## Create directory for storing prodigal genes prediction per contig BLAST databases
- blastdb_per_contig_directory = 'blastdb_per_contig'
- full_path_blastdb_per_contig = os.path.join(prodigal_directory_sample, blastdb_per_contig_directory)
+ blastdb_per_contig_directory = "blastdb_per_contig"
+ full_path_blastdb_per_contig = os.path.join(
+ prodigal_directory_sample, blastdb_per_contig_directory
+ )
if not os.path.exists(full_path_blastdb_per_contig):
try:
os.makedirs(full_path_blastdb_per_contig)
- logger.info('Directory %s has been created', full_path_blastdb_per_contig)
+ logger.info("Directory %s has been created", full_path_blastdb_per_contig)
except:
- print ('Cannot create the directory ', full_path_blastdb_per_contig)
- logger.info('Directory %s cannot be created', full_path_blastdb_per_contig)
- exit (0)
+ print("Cannot create the directory ", full_path_blastdb_per_contig)
+ logger.info("Directory %s cannot be created", full_path_blastdb_per_contig)
+ exit(0)
## Create directory for storing prodigal genes prediction sequences per contig
- prodigal_genes_per_contig_directory = 'prodigal_genes_per_contig'
- full_path_prodigal_genes_per_contig = os.path.join(prodigal_directory_sample, prodigal_genes_per_contig_directory)
+ prodigal_genes_per_contig_directory = "prodigal_genes_per_contig"
+ full_path_prodigal_genes_per_contig = os.path.join(
+ prodigal_directory_sample, prodigal_genes_per_contig_directory
+ )
if not os.path.exists(full_path_prodigal_genes_per_contig):
try:
os.makedirs(full_path_prodigal_genes_per_contig)
- logger.info('Directory %s has been created', full_path_prodigal_genes_per_contig)
+ logger.info(
+ "Directory %s has been created", full_path_prodigal_genes_per_contig
+ )
except:
- print ('Cannot create the directory ', full_path_prodigal_genes_per_contig)
- logger.info('Directory %s cannot be created', full_path_prodigal_genes_per_contig)
- exit (0)
+ print("Cannot create the directory ", full_path_prodigal_genes_per_contig)
+ logger.info(
+ "Directory %s cannot be created", full_path_prodigal_genes_per_contig
+ )
+ exit(0)
## Parse prodigal genes prediction fasta file
predicted_genes = SeqIO.parse(genes_file, "fasta")
## Create fasta file containing Prodigal predicted genes sequences for X contig in sample
- contig_genes_path = os.path.join(full_path_prodigal_genes_per_contig, contig_blast_id + '.fasta')
- with open (contig_genes_path, 'w') as out_fh:
+ contig_genes_path = os.path.join(
+ full_path_prodigal_genes_per_contig, contig_blast_id + ".fasta"
+ )
+ with open(contig_genes_path, "w") as out_fh:
for rec in predicted_genes:
- contig_prodigal_id = '_'.join((rec.id).split("_")[:-1])
+ contig_prodigal_id = "_".join((rec.id).split("_")[:-1])
if contig_prodigal_id == contig_blast_id:
- out_fh.write ('>' + str(rec.description) + '\n' + str(rec.seq) + '\n')
+ out_fh.write(">" + str(rec.description) + "\n" + str(rec.seq) + "\n")
## Create local BLAST database for Prodigal predicted genes sequences for X contig in sample
- if not create_blastdb(contig_genes_path, full_path_blastdb_per_contig, 'nucl', logger):
- print('Error when creating the blastdb for samples files. Check log file for more information. \n ')
+ if not create_blastdb(
+ contig_genes_path, full_path_blastdb_per_contig, "nucl", logger
+ ):
+ print(
+ "Error when creating the blastdb for samples files. Check log file for more information. \n "
+ )
return False
## Local BLAST Prodigal predicted genes sequences database VS BLAST sequence obtained in sample in allele calling analysis
- blast_db_name = os.path.join(full_path_blastdb_per_contig, contig_blast_id, contig_blast_id)
-
- cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 90, outfmt= blast_parameters, max_target_seqs=10, max_hsps=10, num_threads=1)
- out, err = cline(stdin = blast_sseq)
+ blast_db_name = os.path.join(
+ full_path_blastdb_per_contig, contig_blast_id, contig_blast_id
+ )
+
+ cline = NcbiblastnCommandline(
+ db=blast_db_name,
+ evalue=0.001,
+ perc_identity=90,
+ outfmt=blast_parameters,
+ max_target_seqs=10,
+ max_hsps=10,
+ num_threads=1,
+ )
+ out, err = cline(stdin=blast_sseq)
out_lines = out.splitlines()
bigger_bitscore = 0
- if len (out_lines) > 0 :
- for line in out_lines :
- values = line.split('\t')
- if float(values[8]) > bigger_bitscore:
- qseqid , sseqid , pident , qlen , s_length , mismatch , r_gapopen , r_evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq = values
+ if len(out_lines) > 0:
+ for line in out_lines:
+ values = line.split("\t")
+ if float(values[8]) > bigger_bitscore:
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
bigger_bitscore = float(bitscore)
## Get complete Prodigal sequence matching allele calling BLAST sequence using ID
@@ -272,41 +382,51 @@ def get_prodigal_sequence(blast_sseq, contig_blast_id, prodigal_directory, sampl
for rec in predicted_genes_in_contig:
if rec.id == sseqid:
predicted_gene_sequence = str(rec.seq)
- start_prodigal = str(rec.description.split( '#')[1])
- end_prodigal = str(rec.description.split('#')[2])
+ start_prodigal = str(rec.description.split("#")[1])
+ end_prodigal = str(rec.description.split("#")[2])
break
## Sequence not found by Prodigal when there are no BLAST results matching allele calling BLAST sequence
- if len (out_lines) == 0:
- predicted_gene_sequence = 'Sequence not found by Prodigal'
- start_prodigal = '-'
- end_prodigal = '-'
+ if len(out_lines) == 0:
+ predicted_gene_sequence = "Sequence not found by Prodigal"
+ start_prodigal = "-"
+ end_prodigal = "-"
- return predicted_gene_sequence, start_prodigal, end_prodigal ### start_prodigal y end_prodigal para report prodigal
+ return (
+ predicted_gene_sequence,
+ start_prodigal,
+ end_prodigal,
+    ) ### start_prodigal and end_prodigal for the prodigal report
# · * · * · * · * · * · * · * · * · * · * · * · * #
# Get samples info before allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * #
-def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger):
+def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger):
## Initialize dictionary for keeping id-contig
contigs_in_sample_dict = {}
## Paths for samples blastdb, Prodigal genes prediction and BLAST results
- blast_dir = os.path.join(store_dir,'blastdb')
- prodigal_dir = os.path.join(store_dir,'prodigal')
- blast_results_seq_dir = os.path.join(store_dir,'blast_results', 'blast_results_seq')
+ blast_dir = os.path.join(store_dir, "blastdb")
+ prodigal_dir = os.path.join(store_dir, "prodigal")
+ blast_results_seq_dir = os.path.join(
+ store_dir, "blast_results", "blast_results_seq"
+ )
## Get training file for Prodigal genes prediction
- output_prodigal_train_dir = prodigal_training(reference_genome_file, prodigal_dir, logger)
+ output_prodigal_train_dir = prodigal_training(
+ reference_genome_file, prodigal_dir, logger
+ )
if not output_prodigal_train_dir:
- print('Error when creating training file for genes prediction. Check log file for more information. \n ')
+ print(
+ "Error when creating training file for genes prediction. Check log file for more information. \n "
+ )
return False
for fasta_file in sample_file_list:
- f_name = '.'.join(os.path.basename(fasta_file).split('.')[:-1])
+ f_name = ".".join(os.path.basename(fasta_file).split(".")[:-1])
# Get samples id-contig dictionary
fasta_file_parsed_dict = parsing_fasta_file_to_dict(fasta_file, logger)
@@ -315,8 +435,8 @@ def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger):
contigs_in_sample_dict[f_name] = fasta_file_parsed_dict
# dump fasta file into pickle file
- #with open (file_list[-1],'wb') as f: # generación de diccionarios de contigs para cada muestra
- # pickle.dump(fasta_file_parsed_dict, f)
+    # with open (file_list[-1],'wb') as f: # generating contig dictionaries for each sample
+ # pickle.dump(fasta_file_parsed_dict, f)
# Create directory for storing BLAST results using reference allele(s)
blast_results_seq_per_sample_dir = os.path.join(blast_results_seq_dir, f_name)
@@ -324,35 +444,51 @@ def prepare_samples(sample_file_list, store_dir, reference_genome_file, logger):
if not os.path.exists(blast_results_seq_per_sample_dir):
try:
os.makedirs(blast_results_seq_per_sample_dir)
- logger.debug('Created blast results directory for sample %s', f_name)
+ logger.debug("Created blast results directory for sample %s", f_name)
except:
- logger.info('Cannot create blast results directory for sample %s', f_name)
- print ('Error when creating the directory for blast results', blast_results_seq_per_sample_dir)
+ logger.info(
+ "Cannot create blast results directory for sample %s", f_name
+ )
+ print(
+ "Error when creating the directory for blast results",
+ blast_results_seq_per_sample_dir,
+ )
exit(0)
# Prodigal genes prediction for each sample
- if not prodigal_prediction(fasta_file, prodigal_dir, output_prodigal_train_dir, logger):
- print('Error when predicting genes for samples files. Check log file for more information. \n ')
+ if not prodigal_prediction(
+ fasta_file, prodigal_dir, output_prodigal_train_dir, logger
+ ):
+ print(
+ "Error when predicting genes for samples files. Check log file for more information. \n "
+ )
return False
# Create local BLAST db for each sample fasta file
- if not create_blastdb(fasta_file, blast_dir, 'nucl', logger):
- print('Error when creating the blastdb for samples files. Check log file for more information. \n ')
+ if not create_blastdb(fasta_file, blast_dir, "nucl", logger):
+ print(
+ "Error when creating the blastdb for samples files. Check log file for more information. \n "
+ )
return False
return contigs_in_sample_dict
+
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
# Get established length thresholds for allele tagging in allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def length_thresholds(core_name, schema_statistics, percent): ### logger
+def length_thresholds(core_name, schema_statistics, percent): ### logger
locus_mean = int(schema_statistics[core_name][0])
if percent != "SD":
- max_length_threshold = math.ceil(locus_mean + ((locus_mean * float(percent)) / 100))
- min_length_threshold = math.floor(locus_mean - ((locus_mean * float(percent)) / 100))
+ max_length_threshold = math.ceil(
+ locus_mean + ((locus_mean * float(percent)) / 100)
+ )
+ min_length_threshold = math.floor(
+ locus_mean - ((locus_mean * float(percent)) / 100)
+ )
else:
percent = float(schema_statistics[core_name][1])
@@ -366,103 +502,156 @@ def length_thresholds(core_name, schema_statistics, percent): ### logger
# Convert dna sequence to protein sequence #
# · * · * · * · * · * · * · * · * · * · * · #
-def convert_to_protein (sequence) :
+def convert_to_protein(sequence):
seq = Seq.Seq(sequence)
protein = str(seq.translate())
return protein
+
# · * · * · * · * · * · * · * · * · * · * · * · * · * #
# Get SNPs between BLAST sequence and matching allele #
# · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def get_snp (sample, query) :
- prot_annotation = {'S': 'polar' ,'T': 'polar' ,'Y': 'polar' ,'Q': 'polar' ,'N': 'polar' ,'C': 'polar' ,'S': 'polar' ,
- 'F': 'nonpolar' ,'L': 'nonpolar','I': 'nonpolar','M': 'nonpolar','P': 'nonpolar','V': 'nonpolar','A': 'nonpolar','W': 'nonpolar','G': 'nonpolar',
- 'D' : 'acidic', 'E' :'acidic',
- 'H': 'basic' , 'K': 'basic' , 'R' : 'basic',
- '-': '-----', '*' : 'Stop codon'}
+def get_snp(sample, query):
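+    # amino acids grouped by side-chain class, used to label each SNP as synonymous/missense/nonsense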
+ prot_annotation = {
+ "S": "polar",
+ "T": "polar",
+ "Y": "polar",
+ "Q": "polar",
+ "N": "polar",
+ "C": "polar",
+ "S": "polar",
+ "F": "nonpolar",
+ "L": "nonpolar",
+ "I": "nonpolar",
+ "M": "nonpolar",
+ "P": "nonpolar",
+ "V": "nonpolar",
+ "A": "nonpolar",
+ "W": "nonpolar",
+ "G": "nonpolar",
+ "D": "acidic",
+ "E": "acidic",
+ "H": "basic",
+ "K": "basic",
+ "R": "basic",
+ "-": "-----",
+ "*": "Stop codon",
+ }
snp_list = []
- sample = sample.replace('-','')
- #length = max(len(sample), len(query))
+ sample = sample.replace("-", "")
+ # length = max(len(sample), len(query))
length = len(query)
# normalize the length of the sample for the iteration
- if len(sample) < length :
+ if len(sample) < length:
need_to_add = length - len(sample)
- sample = sample + need_to_add * '-'
+ sample = sample + need_to_add * "-"
# convert to Seq class to translate to protein
seq_sample = Seq.Seq(sample)
seq_query = Seq.Seq(query)
for index in range(length):
- if seq_query[index] != seq_sample[index] :
+ if seq_query[index] != seq_sample[index]:
triple_index = index - (index % 3)
codon_seq = seq_sample[triple_index : triple_index + 3]
codon_que = seq_query[triple_index : triple_index + 3]
- if not '-' in str(codon_seq) :
+ if not "-" in str(codon_seq):
prot_seq = str(codon_seq.translate())
prot_que = str(codon_que.translate())
else:
- prot_seq = '-'
- prot_que = str(seq_query[triple_index: ].translate())
- if prot_annotation[prot_que[0]] == prot_annotation[prot_seq[0]] :
- missense_synonym = 'Synonymous'
- elif prot_seq == '*' :
- missense_synonym = 'Nonsense'
+ prot_seq = "-"
+ prot_que = str(seq_query[triple_index:].translate())
+ if prot_annotation[prot_que[0]] == prot_annotation[prot_seq[0]]:
+ missense_synonym = "Synonymous"
+ elif prot_seq == "*":
+ missense_synonym = "Nonsense"
else:
- missense_synonym = 'Missense'
- #snp_list.append([str(index+1),str(seq_sample[index]) + '/' + str(seq_query[index]), str(codon_seq) + '/'+ str(codon_que),
- snp_list.append([str(index+1),str(seq_query[index]) + '/' + str(seq_sample[index]), str(codon_que) + '/'+ str(codon_seq),
- # when one of the sequence ends but not the other we will translate the remain sequence to proteins
- # in that case we will only annotate the first protein. Using [0] as key of the dictionary annotation
- prot_que + '/' + prot_seq, missense_synonym, prot_annotation[prot_que[0]] + ' / ' + prot_annotation[prot_seq[0]]])
- if '-' in str(codon_seq) :
+ missense_synonym = "Missense"
+ # snp_list.append([str(index+1),str(seq_sample[index]) + '/' + str(seq_query[index]), str(codon_seq) + '/'+ str(codon_que),
+ snp_list.append(
+ [
+ str(index + 1),
+ str(seq_query[index]) + "/" + str(seq_sample[index]),
+ str(codon_que) + "/" + str(codon_seq),
+                    # when one of the sequences ends but not the other, we translate the remaining sequence to protein
+                    # in that case we only annotate the first residue, using [0] as the annotation dictionary key
+ prot_que + "/" + prot_seq,
+ missense_synonym,
+ prot_annotation[prot_que[0]] + " / " + prot_annotation[prot_seq[0]],
+ ]
+ )
+ if "-" in str(codon_seq):
break
return snp_list
-def nucleotide_to_protein_alignment (sample_seq, query_seq ) : ### Sustituido por get_alignment
+def nucleotide_to_protein_alignment(
+ sample_seq, query_seq
+): ### Replaced by get_alignment
aligment = []
sample_prot = convert_to_protein(sample_seq)
query_prot = convert_to_protein(query_seq)
minimun_length = min(len(sample_prot), len(query_prot))
for i in range(minimun_length):
- if sample_prot[i] == query_prot[i] :
- aligment.append('|')
+ if sample_prot[i] == query_prot[i]:
+ aligment.append("|")
else:
- aligment.append(' ')
- protein_alignment = [['sample', sample_prot],['match', ''.join(aligment)], ['schema', query_prot]]
+ aligment.append(" ")
+ protein_alignment = [
+ ["sample", sample_prot],
+ ["match", "".join(aligment)],
+ ["schema", query_prot],
+ ]
return protein_alignment
-def get_alignment_for_indels (blast_db_name, qseq) : ### Sustituido por get_alignment
- #match_alignment =[]
- cline = NcbiblastnCommandline(db=blast_db_name, evalue=0.001, perc_identity = 80, outfmt= 5, max_target_seqs=10, max_hsps=10,num_threads=1)
- out, err = cline(stdin = qseq)
+def get_alignment_for_indels(blast_db_name, qseq): ### Replaced by get_alignment
+ # match_alignment =[]
+ cline = NcbiblastnCommandline(
+ db=blast_db_name,
+ evalue=0.001,
+ perc_identity=80,
+ outfmt=5,
+ max_target_seqs=10,
+ max_hsps=10,
+ num_threads=1,
+ )
+ out, err = cline(stdin=qseq)
psiblast_xml = StringIO(out)
blast_records = NCBIXML.parse(psiblast_xml)
for blast_record in blast_records:
for alignment in blast_record.alignments:
for match in alignment.hsps:
- match_alignment = [['sample', match.sbjct],['match', match.match], ['schema',match.query]]
+ match_alignment = [
+ ["sample", match.sbjct],
+ ["match", match.match],
+ ["schema", match.query],
+ ]
return match_alignment
-def get_alignment_for_deletions (sample_seq, query_seq): ### Sustituido por get_alignment
+def get_alignment_for_deletions(
+ sample_seq, query_seq
+): ### Replaced by get_alignment
index_found = False
alignments = pairwise2.align.globalxx(sample_seq, query_seq)
- for index in range(len(alignments)) :
- if alignments[index][4] == len(query_seq) :
+ for index in range(len(alignments)):
+ if alignments[index][4] == len(query_seq):
index_found = True
break
- if not index_found :
+ if not index_found:
index = 0
- values = format_alignment(*alignments[index]).split('\n')
- match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]]
+ values = format_alignment(*alignments[index]).split("\n")
+ match_alignment = [
+ ["sample", values[0]],
+ ["match", values[1]],
+ ["schema", values[2]],
+ ]
return match_alignment
@@ -470,8 +659,10 @@ def get_alignment_for_deletions (sample_seq, query_seq): ### Sustituido por get_
# Get DNA and protein alignment between the final sequence found in the sample and the matching allele #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, seq_type = "dna"):
+def get_alignment(
+ sample_seq, query_seq, reward, penalty, gapopen, gapextend, seq_type="dna"
+):
## If sequences alignment type desired is "protein" convert dna sequences to protein
if seq_type == "protein":
sample_seq = convert_to_protein(sample_seq)
@@ -479,9 +670,15 @@ def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, s
## Get dna/protein alignment between final sequence found and matching allele
    # arguments pairwise2.align.localms: match, mismatch, gap opening, gap extending
- alignments = pairwise2.align.localms(sample_seq, query_seq, reward, penalty, -gapopen, -gapextend)
- values = format_alignment(*alignments[0]).split('\n')
- match_alignment = [['sample', values[0]],['match', values[1]], ['schema',values[2]]]
+ alignments = pairwise2.align.localms(
+ sample_seq, query_seq, reward, penalty, -gapopen, -gapextend
+ )
+ values = format_alignment(*alignments[0]).split("\n")
+ match_alignment = [
+ ["sample", values[0]],
+ ["match", values[1]],
+ ["schema", values[2]],
+ ]
return match_alignment
@@ -490,102 +687,135 @@ def get_alignment (sample_seq, query_seq, reward, penalty, gapopen, gapextend, s
# Tag LNF cases and keep LNF info #
# · * · * · * · * · * · * · * · * #
-def lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length, new_sequence_length, perc_identity_ref, coverage, schema_quality, annotation_core_dict, count_dict, logger):
+def lnf_tpr_tag(
+ core_name,
+ sample_name,
+ alleles_in_locus_dict,
+ samples_matrix_dict,
+ lnf_tpr_dict,
+ schema_statistics,
+ locus_alleles_path,
+ qseqid,
+ pident,
+ s_length,
+ new_sequence_length,
+ perc_identity_ref,
+ coverage,
+ schema_quality,
+ annotation_core_dict,
+ count_dict,
+ logger,
+):
gene_annot, product_annot = annotation_core_dict[core_name]
- if qseqid == '-':
- samples_matrix_dict[sample_name].append('LNF')
- tag_report = 'LNF'
- matching_allele_length = '-'
+ if qseqid == "-":
+ samples_matrix_dict[sample_name].append("LNF")
+ tag_report = "LNF"
+ matching_allele_length = "-"
else:
- if new_sequence_length == '-':
- samples_matrix_dict[sample_name].append('LNF_' + str(qseqid))
- tag_report = 'LNF'
+ if new_sequence_length == "-":
+ samples_matrix_dict[sample_name].append("LNF_" + str(qseqid))
+ tag_report = "LNF"
else:
- samples_matrix_dict[sample_name].append('TPR_' + str(qseqid))
- tag_report = 'TPR'
+ samples_matrix_dict[sample_name].append("TPR_" + str(qseqid))
+ tag_report = "TPR"
matching_allele_seq = alleles_in_locus_dict[core_name][qseqid]
matching_allele_length = len(matching_allele_seq)
- #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
- #for allele in alleles_in_locus : ## parse
- #if allele.id == qseqid : ## parse
- #break ## parse
- #matching_allele_seq = str(allele.seq) ## parse
- #matching_allele_length = len(matching_allele_seq) ## parse
+ # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
+ # for allele in alleles_in_locus : ## parse
+ # if allele.id == qseqid : ## parse
+ # break ## parse
+ # matching_allele_seq = str(allele.seq) ## parse
+ # matching_allele_length = len(matching_allele_seq) ## parse
- if pident == '-':
+ if pident == "-":
        # (both BLAST runs returned no results)
- coverage_blast = '-'
- coverage_new_sequence = '-'
- add_info = 'Locus not found'
- logger.info('Locus not found at sample %s, for gene %s', sample_name, core_name)
+ coverage_blast = "-"
+ coverage_new_sequence = "-"
+ add_info = "Locus not found"
+ logger.info("Locus not found at sample %s, for gene %s", sample_name, core_name)
# Get allele quality
- allele_quality = '-'
+ allele_quality = "-"
        # (tag counts for plotting)
- count_dict[sample_name]['not_found'] += 1
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["not_found"] += 1
+ count_dict[sample_name]["total"] += 1
elif 90 > float(pident):
        # (BLAST at 90% identity with no results and BLAST at 70% with results)
- coverage_blast = '-'
- coverage_new_sequence = '-'
- add_info = 'BLAST sequence ID under threshold: {}%'.format(perc_identity_ref)
- logger.info('BLAST sequence ID %s under threshold at sample %s, for gene %s', pident, sample_name, core_name)
+ coverage_blast = "-"
+ coverage_new_sequence = "-"
+ add_info = "BLAST sequence ID under threshold: {}%".format(perc_identity_ref)
+ logger.info(
+ "BLAST sequence ID %s under threshold at sample %s, for gene %s",
+ pident,
+ sample_name,
+ core_name,
+ )
# Get allele quality
- allele_quality = '-'
+ allele_quality = "-"
        # (tag counts for plotting)
- count_dict[sample_name]['low_id'] += 1
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["low_id"] += 1
+ count_dict[sample_name]["total"] += 1
- elif 90 <= float(pident) and new_sequence_length == '-':
+ elif 90 <= float(pident) and new_sequence_length == "-":
        # (BLAST at 90% identity with results, low BLAST coverage)
locus_mean = int(schema_statistics[core_name][0])
coverage_blast = int(s_length) / locus_mean
- #coverage_blast = int(s_length) / matching_allele_length
- coverage_new_sequence = '-'
+ # coverage_blast = int(s_length) / matching_allele_length
+ coverage_new_sequence = "-"
if coverage_blast < 1:
- add_info = 'BLAST sequence coverage under threshold: {}%'.format(coverage)
+ add_info = "BLAST sequence coverage under threshold: {}%".format(coverage)
else:
- add_info = 'BLAST sequence coverage above threshold: {}%'.format(coverage)
- logger.info('BLAST sequence coverage %s under threshold at sample %s, for gene %s', coverage_blast, sample_name, core_name)
+ add_info = "BLAST sequence coverage above threshold: {}%".format(coverage)
+ logger.info(
+ "BLAST sequence coverage %s under threshold at sample %s, for gene %s",
+ coverage_blast,
+ sample_name,
+ core_name,
+ )
# Get allele quality
- allele_quality = '-'
+ allele_quality = "-"
        # (tag counts for plotting)
- count_dict[sample_name]['low_coverage'] += 1
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["low_coverage"] += 1
+ count_dict[sample_name]["total"] += 1
- elif 90 <= float(pident) and new_sequence_length != '-':
+ elif 90 <= float(pident) and new_sequence_length != "-":
        # (BLAST at 90% identity with results, good BLAST coverage, low new_sseq coverage)
locus_mean = int(schema_statistics[core_name][0])
coverage_blast = int(s_length) / locus_mean * 100
- #coverage_blast = int(s_length) / matching_allele_length
+ # coverage_blast = int(s_length) / matching_allele_length
coverage_new_sequence = new_sequence_length / matching_allele_length * 100
if coverage_new_sequence < 1:
- add_info = 'New sequence coverage under threshold: {}%'.format(coverage)
+ add_info = "New sequence coverage under threshold: {}%".format(coverage)
else:
- add_info = 'New sequence coverage above threshold: {}%'.format(coverage)
- logger.info('New sequence coverage %s under threshold at sample %s, for gene %s', coverage_new_sequence, sample_name, core_name)
+ add_info = "New sequence coverage above threshold: {}%".format(coverage)
+ logger.info(
+ "New sequence coverage %s under threshold at sample %s, for gene %s",
+ coverage_new_sequence,
+ sample_name,
+ core_name,
+ )
# Get allele quality
allele_quality = schema_quality[core_name][qseqid]
        # (tag counts for plotting)
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["total"] += 1
for count_class in count_dict[sample_name]:
if count_class in allele_quality:
count_dict[sample_name][count_class] += 1
- #if "bad_quality" in allele_quality:
- # count_dict[sample_name]['bad_quality'] += 1
+ # if "bad_quality" in allele_quality:
+ # count_dict[sample_name]['bad_quality'] += 1
## Keeping LNF and TPR report info
if not core_name in lnf_tpr_dict:
@@ -593,7 +823,22 @@ def lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_di
if not sample_name in lnf_tpr_dict[core_name]:
lnf_tpr_dict[core_name][sample_name] = []
- lnf_tpr_dict[core_name][sample_name].append([gene_annot, product_annot, tag_report, qseqid, allele_quality, pident, str(coverage_blast), str(coverage_new_sequence), str(matching_allele_length), str(s_length), str(new_sequence_length), add_info]) ### Meter secuencias alelo, blast y new_sseq (si las hay)?
+ lnf_tpr_dict[core_name][sample_name].append(
+ [
+ gene_annot,
+ product_annot,
+ tag_report,
+ qseqid,
+ allele_quality,
+ pident,
+ str(coverage_blast),
+ str(coverage_new_sequence),
+ str(matching_allele_length),
+ str(s_length),
+ str(new_sequence_length),
+ add_info,
+ ]
+    ) ### Include allele, BLAST and new_sseq sequences (if any)?
return True
@@ -602,20 +847,37 @@ def lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_di
# Tag paralog and exact match cases and keep info #
# · * · * · * · * · * · * · * · * · * · * · * · * #
-def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, tag_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_dict, logger):
- logger.info('Found %s at sample %s for core gene %s ', tag, sample_name, core_name)
-
- paralog_quality_count = [] # (lista para contabilizar parálogos debido a bad o good quality)
+def paralog_exact_tag(
+ sample_name,
+ core_name,
+ tag,
+ schema_quality,
+ matching_genes_dict,
+ samples_matrix_dict,
+ allele_found,
+ tag_dict,
+ prodigal_report,
+ prodigal_directory,
+ blast_parameters,
+ annotation_core_dict,
+ count_dict,
+ logger,
+):
+ logger.info("Found %s at sample %s for core gene %s ", tag, sample_name, core_name)
+
+ paralog_quality_count = (
+ []
+    )  # (list to count paralogs due to bad or good quality)
gene_annot, product_annot = annotation_core_dict[core_name]
- if not sample_name in tag_dict :
+    if sample_name not in tag_dict:
tag_dict[sample_name] = {}
- if not core_name in tag_dict[sample_name] :
- tag_dict[sample_name][core_name]= []
+    if core_name not in tag_dict[sample_name]:
+ tag_dict[sample_name][core_name] = []
- if tag == 'EXACT':
+ if tag == "EXACT":
allele = list(allele_found.keys())[0]
qseqid = allele_found[allele][0]
tag = qseqid
@@ -623,8 +885,24 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene
samples_matrix_dict[sample_name].append(tag)
for sequence in allele_found:
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = allele_found[sequence]
- sseq = sseq.replace('-', '')
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = allele_found[sequence]
+ sseq = sseq.replace("-", "")
# Get allele quality
allele_quality = schema_quality[core_name][qseqid]
@@ -633,35 +911,87 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene
paralog_quality_count.append(allele_quality)
# Get prodigal gene prediction if allele quality is 'bad_quality'
- if 'bad_quality' in allele_quality:
- complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq, sseqid, prodigal_directory, sample_name, blast_parameters, logger)
+ if "bad_quality" in allele_quality:
+ (
+ complete_predicted_seq,
+ start_prodigal,
+ end_prodigal,
+ ) = get_prodigal_sequence(
+ sseq, sseqid, prodigal_directory, sample_name, blast_parameters, logger
+ )
##### informe prodigal #####
- prodigal_report.append([core_name, sample_name, qseqid, tag, sstart, send, start_prodigal, end_prodigal, sseq, complete_predicted_seq])
+ prodigal_report.append(
+ [
+ core_name,
+ sample_name,
+ qseqid,
+ tag,
+ sstart,
+ send,
+ start_prodigal,
+ end_prodigal,
+ sseq,
+ complete_predicted_seq,
+ ]
+ )
else:
- complete_predicted_seq = '-'
+ complete_predicted_seq = "-"
- if not sseqid in matching_genes_dict[sample_name] :
+        if sseqid not in matching_genes_dict[sample_name]:
matching_genes_dict[sample_name][sseqid] = []
- if sstart > send :
- #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', tag])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'-', tag])
+ if sstart > send:
+ # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', tag])
+ matching_genes_dict[sample_name][sseqid].append(
+ [core_name, qseqid, sstart, send, "-", tag]
+ )
else:
- #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'+', tag])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'+', tag])
+ # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'+', tag])
+ matching_genes_dict[sample_name][sseqid].append(
+ [core_name, qseqid, sstart, send, "+", tag]
+ )
## Keeping paralog NIPH/NIPHEM report info
- if tag == 'NIPH' or tag == 'NIPHEM':
- tag_dict[sample_name][core_name].append([gene_annot, product_annot, tag, pident, qseqid, allele_quality, sseqid, bitscore, sstart, send, sseq, complete_predicted_seq])
+ if tag == "NIPH" or tag == "NIPHEM":
+ tag_dict[sample_name][core_name].append(
+ [
+ gene_annot,
+ product_annot,
+ tag,
+ pident,
+ qseqid,
+ allele_quality,
+ sseqid,
+ bitscore,
+ sstart,
+ send,
+ sseq,
+ complete_predicted_seq,
+ ]
+ )
else:
- tag_dict[sample_name][core_name] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, s_length, sstart, send, sseq, complete_predicted_seq]
+ tag_dict[sample_name][core_name] = [
+ gene_annot,
+ product_annot,
+ qseqid,
+ allele_quality,
+ sseqid,
+ s_length,
+ sstart,
+ send,
+ sseq,
+ complete_predicted_seq,
+ ]
    # (tag counts for plotting)
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["total"] += 1
for count_class in count_dict[sample_name]:
if count_class in allele_quality:
- if "no_start_stop" not in count_class and "no_start_stop" in allele_quality:
+ if (
+ "no_start_stop" not in count_class
+ and "no_start_stop" in allele_quality
+ ):
if count_class == "bad_quality":
count_dict[sample_name][count_class] += 1
else:
@@ -673,10 +1003,13 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene
for paralog_quality in paralog_quality_count:
count += 1
if "bad_quality" in paralog_quality:
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["total"] += 1
for count_class in count_dict[sample_name]:
if count_class in paralog_quality:
- if "no_start_stop" not in count_class and "no_start_stop" in paralog_quality:
+ if (
+ "no_start_stop" not in count_class
+ and "no_start_stop" in paralog_quality
+ ):
if count_class == "bad_quality":
count_dict[sample_name][count_class] += 1
else:
@@ -687,8 +1020,8 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene
else:
if count == len(paralog_quality_count):
- count_dict[sample_name]['total'] += 1
- count_dict[sample_name]['good_quality'] += 1
+ count_dict[sample_name]["total"] += 1
+ count_dict[sample_name]["good_quality"] += 1
return True
@@ -697,97 +1030,225 @@ def paralog_exact_tag(sample_name, core_name, tag, schema_quality, matching_gene
# Tag INF/ASM/ALM/PLOT cases and keep info #
# · * · * · * · * · * · * · * · * · * · * #
-def inf_asm_alm_tag(core_name, sample_name, tag, blast_values, allele_quality, new_sseq, matching_allele_length, tag_dict, list_tag, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_dict, logger):
+def inf_asm_alm_tag(
+ core_name,
+ sample_name,
+ tag,
+ blast_values,
+ allele_quality,
+ new_sseq,
+ matching_allele_length,
+ tag_dict,
+ list_tag,
+ samples_matrix_dict,
+ matching_genes_dict,
+ prodigal_report,
+ start_prodigal,
+ end_prodigal,
+ complete_predicted_seq,
+ annotation_core_dict,
+ count_dict,
+ logger,
+):
gene_annot, product_annot = annotation_core_dict[core_name]
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = blast_values
-
- sseq = sseq.replace('-', '')
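+    # unpack the fifteen tabular BLAST columns reported for this hit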
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = blast_values
+
+ sseq = sseq.replace("-", "")
s_length = len(sseq)
new_sequence_length = len(new_sseq)
- logger.info('Found %s at sample %s for core gene %s ', tag, sample_name, core_name)
+ logger.info("Found %s at sample %s for core gene %s ", tag, sample_name, core_name)
- if tag == 'PLOT':
- tag_allele = tag + '_' + str(qseqid)
+ if tag == "PLOT":
+ tag_allele = tag + "_" + str(qseqid)
else:
        # Adding ASM/ALM/INF allele to the allele_matrix if it is not already included
if not core_name in tag_dict:
tag_dict[core_name] = []
- if not new_sseq in tag_dict[core_name] :
+        if new_sseq not in tag_dict[core_name]:
tag_dict[core_name].append(new_sseq)
# Find the index of ASM/ALM/INF to include it in the sample matrix dict
index_tag = tag_dict[core_name].index(new_sseq)
- tag_allele = tag + '_' + core_name + '_' + str(qseqid) + '_' + str(index_tag)
+ tag_allele = tag + "_" + core_name + "_" + str(qseqid) + "_" + str(index_tag)
samples_matrix_dict[sample_name].append(tag_allele)
# Keeping INF/ASM/ALM/PLOT report info
- if not core_name in list_tag :
+ if not core_name in list_tag:
list_tag[core_name] = {}
- if not sample_name in list_tag[core_name] :
+ if not sample_name in list_tag[core_name]:
list_tag[core_name][sample_name] = {}
- if tag == 'INF':
- list_tag[core_name][sample_name][tag_allele] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, str(matching_allele_length), str(s_length), str(new_sequence_length), mismatch , r_gapopen, sstart, send, new_sseq, complete_predicted_seq]
+ if tag == "INF":
+ list_tag[core_name][sample_name][tag_allele] = [
+ gene_annot,
+ product_annot,
+ qseqid,
+ allele_quality,
+ sseqid,
+ bitscore,
+ str(matching_allele_length),
+ str(s_length),
+ str(new_sequence_length),
+ mismatch,
+ r_gapopen,
+ sstart,
+ send,
+ new_sseq,
+ complete_predicted_seq,
+ ]
# (tag count for plots)
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["total"] += 1
for count_class in count_dict[sample_name]:
if count_class in allele_quality:
count_dict[sample_name][count_class] += 1
- #if "bad_quality" in allele_quality:
- # count_dict[sample_name]['bad_quality'] += 1
-
- elif tag == 'PLOT':
- list_tag[core_name][sample_name] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, sstart, send, sseq, new_sseq]
+ # if "bad_quality" in allele_quality:
+ # count_dict[sample_name]['bad_quality'] += 1
+
+ elif tag == "PLOT":
+ list_tag[core_name][sample_name] = [
+ gene_annot,
+ product_annot,
+ qseqid,
+ allele_quality,
+ sseqid,
+ bitscore,
+ sstart,
+ send,
+ sseq,
+ new_sseq,
+ ]
# (tag count for plots)
- count_dict[sample_name]['total'] += 1
+ count_dict[sample_name]["total"] += 1
- else :
- if tag == 'ASM':
- newsseq_vs_blastseq = 'shorter'
- elif tag == 'ALM':
- newsseq_vs_blastseq = 'longer'
+ else:
+ if tag == "ASM":
+ newsseq_vs_blastseq = "shorter"
+ elif tag == "ALM":
+ newsseq_vs_blastseq = "longer"
if len(sseq) < matching_allele_length:
- add_info = 'Global effect: DELETION. BLAST sequence length shorter than matching allele sequence length / Net result: ' + tag + '. Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length'
+ add_info = (
+ "Global effect: DELETION. BLAST sequence length shorter than matching allele sequence length / Net result: "
+ + tag
+ + ". Final gene sequence length "
+ + newsseq_vs_blastseq
+ + " than matching allele sequence length"
+ )
elif len(sseq) == matching_allele_length:
- add_info = 'Global effect: BASE SUBSTITUTION. BLAST sequence length equal to matching allele sequence length / Net result: ' + tag + '. Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length'
+ add_info = (
+ "Global effect: BASE SUBSTITUTION. BLAST sequence length equal to matching allele sequence length / Net result: "
+ + tag
+ + ". Final gene sequence length "
+ + newsseq_vs_blastseq
+ + " than matching allele sequence length"
+ )
elif len(sseq) > matching_allele_length:
- add_info = 'Global effect: INSERTION. BLAST sequence length longer than matching allele sequence length / Net result: ' + tag + '. Final gene sequence length ' + newsseq_vs_blastseq + ' than matching allele sequence length'
-
- list_tag[core_name][sample_name][tag_allele] = [gene_annot, product_annot, qseqid, allele_quality, sseqid, bitscore, str(matching_allele_length), str(s_length), str(new_sequence_length), mismatch , r_gapopen, sstart, send, new_sseq, add_info, complete_predicted_seq]
+ add_info = (
+ "Global effect: INSERTION. BLAST sequence length longer than matching allele sequence length / Net result: "
+ + tag
+ + ". Final gene sequence length "
+ + newsseq_vs_blastseq
+ + " than matching allele sequence length"
+ )
+
+ list_tag[core_name][sample_name][tag_allele] = [
+ gene_annot,
+ product_annot,
+ qseqid,
+ allele_quality,
+ sseqid,
+ bitscore,
+ str(matching_allele_length),
+ str(s_length),
+ str(new_sequence_length),
+ mismatch,
+ r_gapopen,
+ sstart,
+ send,
+ new_sseq,
+ add_info,
+ complete_predicted_seq,
+ ]
# (tag count for plots)
- if tag == 'ASM':
- count_dict[sample_name]['total'] += 1
+ if tag == "ASM":
+ count_dict[sample_name]["total"] += 1
for mut_type in count_dict[sample_name]:
if mut_type in add_info.lower():
count_dict[sample_name][mut_type] += 1
- elif tag == 'ALM':
- count_dict[sample_name]['total'] += 1
+ elif tag == "ALM":
+ count_dict[sample_name]["total"] += 1
for mut_type in count_dict[sample_name]:
if mut_type in add_info.lower():
count_dict[sample_name][mut_type] += 1
- if not sseqid in matching_genes_dict[sample_name] :
+ if not sseqid in matching_genes_dict[sample_name]:
matching_genes_dict[sample_name][sseqid] = []
- if sstart > send :
- #matching_genes_dict[sample_name][sseqid].append([core_name, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele])
+ if sstart > send:
+ # matching_genes_dict[sample_name][sseqid].append([core_name, str(int(sstart)-new_sequence_length -1), sstart,'-', tag_allele])
+ matching_genes_dict[sample_name][sseqid].append(
+ [
+ core_name,
+ qseqid,
+ str(int(sstart) - new_sequence_length - 1),
+ sstart,
+ "-",
+ tag_allele,
+ ]
+ )
else:
- #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, str(int(sstart)+ new_sequence_length),'+', tag_allele])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, str(int(sstart)+ new_sequence_length),'+', tag_allele])
+ # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, str(int(sstart)+ new_sequence_length),'+', tag_allele])
+ matching_genes_dict[sample_name][sseqid].append(
+ [
+ core_name,
+ qseqid,
+ sstart,
+ str(int(sstart) + new_sequence_length),
+ "+",
+ tag_allele,
+ ]
+ )
##### prodigal report #####
- prodigal_report.append([core_name, sample_name, qseqid, tag_allele, sstart, send, start_prodigal, end_prodigal, sseq, complete_predicted_seq])
+ prodigal_report.append(
+ [
+ core_name,
+ sample_name,
+ qseqid,
+ tag_allele,
+ sstart,
+ send,
+ start_prodigal,
+ end_prodigal,
+ sseq,
+ complete_predicted_seq,
+ ]
+ )
return True
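
In the ASM/ALM branch above, the "global effect" is decided by the raw BLAST hit length against the matching allele, while the tag itself (ASM = shorter, ALM = longer) describes the final predicted sequence. A condensed, self-contained sketch of that decision table (classify_indel is a hypothetical name, and the message is abbreviated from the strings above):

    def classify_indel(blast_len, allele_len, tag):
        direction = {"ASM": "shorter", "ALM": "longer"}[tag]
        if blast_len < allele_len:
            effect = "DELETION"
        elif blast_len == allele_len:
            effect = "BASE SUBSTITUTION"
        else:
            effect = "INSERTION"
        return (
            "Global effect: " + effect + " / Net result: " + tag
            + ". Final gene sequence length " + direction
            + " than matching allele sequence length"
        )

    print(classify_indel(480, 500, "ASM"))
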
@@ -796,24 +1257,41 @@ def inf_asm_alm_tag(core_name, sample_name, tag, blast_values, allele_quality, n
# Keep best results info after BLAST using results from previous reference allele BLAST as database VS ALL alleles in locus as query in allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
-def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger) :
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values
+def get_blast_results(
+ sample_name, values, contigs_in_sample_dict, allele_found, logger
+):
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
## Get contig ID and BLAST sequence
- sseqid_blast = "_".join(sseqid.split('_')[1:])
- sseq_no_gaps = sseq.replace('-', '')
-
+ sseqid_blast = "_".join(sseqid.split("_")[1:])
+ sseq_no_gaps = sseq.replace("-", "")
## Get start and end positions in contig searching for BLAST sequence index in contig sequence
# Get contig sequence
accession_sequence = contigs_in_sample_dict[sample_name][sseqid_blast]
- #for record in sample_contigs: ## parse
- #if record.id == sseqid_blast : ## parse
- #break ## parse
- #accession_sequence = str(record.seq) ## parse
+ # for record in sample_contigs: ## parse
+ # if record.id == sseqid_blast : ## parse
+ # break ## parse
+ # accession_sequence = str(record.seq) ## parse
# Try to get the BLAST sequence index in the contig. If the lookup fails because the contig
# sequence and the BLAST sequence run in opposite directions, take the reverse complement of
# the BLAST sequence and try again.
@@ -834,26 +1312,55 @@ def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found
sstart_new = str(max(sseq_index_1, sseq_index_2))
send_new = str(min(sseq_index_1, sseq_index_2))
-
## Keep BLAST results info discarding subsets
allele_is_subset = False
- if len(allele_found) > 0 :
- for allele_id in allele_found :
- min_index = min(int(allele_found[allele_id][9]), int(allele_found[allele_id][10]))
- max_index = max(int(allele_found[allele_id][9]), int(allele_found[allele_id][10]))
- if int(sstart_new) in range(min_index, max_index + 1) or int(send_new) in range(min_index, max_index + 1): # if both genome locations overlap
- if sseqid_blast == allele_found[allele_id][1]: # if both sequences are in the same contig
- logger.info('Found allele %s that starts or ends at the same position as %s ' , qseqid, allele_id)
+ if len(allele_found) > 0:
+ for allele_id in allele_found:
+ min_index = min(
+ int(allele_found[allele_id][9]), int(allele_found[allele_id][10])
+ )
+ max_index = max(
+ int(allele_found[allele_id][9]), int(allele_found[allele_id][10])
+ )
+ if int(sstart_new) in range(min_index, max_index + 1) or int(
+ send_new
+ ) in range(
+ min_index, max_index + 1
+ ): # if both genome locations overlap
+ if (
+ sseqid_blast == allele_found[allele_id][1]
+ ): # if both sequences are in the same contig
+ logger.info(
+ "Found allele %s that starts or ends at the same position as %s ",
+ qseqid,
+ allele_id,
+ )
allele_is_subset = True
break
- if len(allele_found) == 0 or not allele_is_subset :
- contig_id_start = str(sseqid_blast + '_'+ sstart_new)
+ if len(allele_found) == 0 or not allele_is_subset:
+ contig_id_start = str(sseqid_blast + "_" + sstart_new)
# Skip the allele already found with 100% identity and 100% alignment
if not contig_id_start in allele_found:
- allele_found[contig_id_start] = [qseqid, sseqid_blast, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart_new, send_new, '-', '-', sseq, qseq]
+ allele_found[contig_id_start] = [
+ qseqid,
+ sseqid_blast,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart_new,
+ send_new,
+ "-",
+ "-",
+ sseq,
+ qseq,
+ ]
return True
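
The subset filter above discards a new hit when it starts or ends inside an already recorded hit on the same contig; the min/max step is needed because BLAST start/end coordinates may be inverted on the reverse strand. A self-contained sketch (is_subset is a hypothetical helper):

    def is_subset(new_start, new_end, kept_start, kept_end, same_contig):
        lo, hi = min(kept_start, kept_end), max(kept_start, kept_end)
        overlaps = lo <= new_start <= hi or lo <= new_end <= hi
        return overlaps and same_contig

    assert is_subset(120, 300, 400, 100, same_contig=True)       # kept coordinates inverted
    assert not is_subset(120, 300, 400, 100, same_contig=False)  # other contig: not a subset
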
@@ -862,35 +1369,55 @@ def get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found
# Get SNPs and DNA and protein alignment #
# · * · * · * · * · * · * · * · * · * · #
-def keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_direction, core_name, sample_name, reward, penalty, gapopen, gapextend, snp_dict, match_alignment_dict, protein_dict, logger):
+def keep_snp_alignment_info(
+ sseq,
+ new_sseq,
+ matching_allele_seq,
+ qseqid,
+ query_direction,
+ core_name,
+ sample_name,
+ reward,
+ penalty,
+ gapopen,
+ gapextend,
+ snp_dict,
+ match_alignment_dict,
+ protein_dict,
+ logger,
+):
## Check allele sequence direction
- if query_direction == 'reverse':
+ if query_direction == "reverse":
matching_allele_seq = str(Seq.Seq(matching_allele_seq).reverse_complement())
else:
matching_allele_seq = str(matching_allele_seq)
## Get the SNP information
snp_information = get_snp(sseq, matching_allele_seq)
- if len(snp_information) > 0 :
- if not core_name in snp_dict :
+ if len(snp_information) > 0:
+ if not core_name in snp_dict:
snp_dict[core_name] = {}
- if not sample_name in snp_dict[core_name] :
+ if not sample_name in snp_dict[core_name]:
snp_dict[core_name][sample_name] = {}
- snp_dict[core_name][sample_name][qseqid]= snp_information
+ snp_dict[core_name][sample_name][qseqid] = snp_information
## Get new sequence-allele sequence dna alignment
- if not core_name in match_alignment_dict :
+ if not core_name in match_alignment_dict:
match_alignment_dict[core_name] = {}
- if not sample_name in match_alignment_dict[core_name] :
- match_alignment_dict[core_name][sample_name] = get_alignment (new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend)
+ if not sample_name in match_alignment_dict[core_name]:
+ match_alignment_dict[core_name][sample_name] = get_alignment(
+ new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend
+ )
## Get new sequence-allele sequence protein alignment
- if not core_name in protein_dict :
+ if not core_name in protein_dict:
protein_dict[core_name] = {}
- if not sample_name in protein_dict[core_name] :
+ if not sample_name in protein_dict[core_name]:
protein_dict[core_name][sample_name] = []
- protein_dict[core_name][sample_name] = get_alignment (new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend, "protein")
+ protein_dict[core_name][sample_name] = get_alignment(
+ new_sseq, matching_allele_seq, reward, penalty, gapopen, gapextend, "protein"
+ )
return True
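
The direction check above leans on Biopython's Seq.reverse_complement; a standalone illustration with a toy sequence (input values are hypothetical):

    from Bio import Seq

    matching_allele_seq = "ATGGCAT"  # hypothetical allele sequence
    query_direction = "reverse"      # as recorded by the earlier BLAST bookkeeping
    if query_direction == "reverse":
        matching_allele_seq = str(Seq.Seq(matching_allele_seq).reverse_complement())
    print(matching_allele_seq)  # ATGCCAT
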
@@ -899,52 +1426,81 @@ def keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_d
# Create allele tag summary for each sample #
# · * · * · * · * · * · * · * · * · * · * · #
-def create_summary (samples_matrix_dict, logger) :
+def create_summary(samples_matrix_dict, logger):
summary_dict = {}
summary_result_list = []
- summary_heading_list = ['Exact match', 'INF', 'ASM', 'ALM', 'LNF', 'TPR', 'NIPH', 'NIPHEM', 'PLOT', 'ERROR']
- summary_result_list.append('File\t' + '\t'.join(summary_heading_list))
- for key in sorted (samples_matrix_dict) :
-
- summary_dict[key] = {'Exact match':0, 'INF':0, 'ASM':0, 'ALM':0, 'LNF':0, 'TPR':0,'NIPH':0, 'NIPHEM':0, 'PLOT':0, 'ERROR':0}
- for values in samples_matrix_dict[key] :
- if 'INF_' in values :
- summary_dict[key]['INF'] += 1
- elif 'ASM_' in values :
- summary_dict[key]['ASM'] += 1
- elif 'ALM_' in values :
- summary_dict[key]['ALM'] += 1
- elif 'LNF' in values :
- summary_dict[key]['LNF'] += 1
- elif 'TPR' in values :
- summary_dict[key]['TPR'] += 1
- elif 'NIPH' == values :
- summary_dict[key]['NIPH'] += 1
- elif 'NIPHEM' == values :
- summary_dict[key]['NIPHEM'] += 1
- elif 'PLOT' in values :
- summary_dict[key]['PLOT'] += 1
- elif 'ERROR' in values :
- summary_dict[key]['ERROR'] += 1
+ summary_heading_list = [
+ "Exact match",
+ "INF",
+ "ASM",
+ "ALM",
+ "LNF",
+ "TPR",
+ "NIPH",
+ "NIPHEM",
+ "PLOT",
+ "ERROR",
+ ]
+ summary_result_list.append("File\t" + "\t".join(summary_heading_list))
+ for key in sorted(samples_matrix_dict):
+ summary_dict[key] = {
+ "Exact match": 0,
+ "INF": 0,
+ "ASM": 0,
+ "ALM": 0,
+ "LNF": 0,
+ "TPR": 0,
+ "NIPH": 0,
+ "NIPHEM": 0,
+ "PLOT": 0,
+ "ERROR": 0,
+ }
+ for values in samples_matrix_dict[key]:
+ if "INF_" in values:
+ summary_dict[key]["INF"] += 1
+ elif "ASM_" in values:
+ summary_dict[key]["ASM"] += 1
+ elif "ALM_" in values:
+ summary_dict[key]["ALM"] += 1
+ elif "LNF" in values:
+ summary_dict[key]["LNF"] += 1
+ elif "TPR" in values:
+ summary_dict[key]["TPR"] += 1
+ elif "NIPH" == values:
+ summary_dict[key]["NIPH"] += 1
+ elif "NIPHEM" == values:
+ summary_dict[key]["NIPHEM"] += 1
+ elif "PLOT" in values:
+ summary_dict[key]["PLOT"] += 1
+ elif "ERROR" in values:
+ summary_dict[key]["ERROR"] += 1
else:
try:
number = int(values)
- summary_dict[key]['Exact match'] +=1
+ summary_dict[key]["Exact match"] += 1
except:
- if '_' in values :
+ if "_" in values:
tmp_value = values
try:
number = int(tmp_value[-1])
- summary_dict[key]['Exact match'] +=1
+ summary_dict[key]["Exact match"] += 1
except:
- logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] )
+ logger.debug(
+ "The value %s, was found when collecting summary information for the %s",
+ values,
+ summary_dict[key],
+ )
else:
- logger.debug('The value %s, was found when collecting summary information for the %s', values, summary_dict[key] )
+ logger.debug(
+ "The value %s, was found when collecting summary information for the %s",
+ values,
+ summary_dict[key],
+ )
summary_sample_list = []
- for item in summary_heading_list :
+ for item in summary_heading_list:
summary_sample_list.append(str(summary_dict[key][item]))
- summary_result_list.append(key + '\t' +'\t'.join(summary_sample_list))
+ summary_result_list.append(key + "\t" + "\t".join(summary_sample_list))
return summary_result_list
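
An illustrative call, assuming the create_summary defined above is in scope (the sample data is hypothetical): each row tallies the tag prefixes found in the per-sample allele matrix, with tab-separated columns.

    import logging

    logger = logging.getLogger(__name__)
    samples_matrix_dict = {"sampleA": ["25", "INF_lmo0002_1_0", "LNF", "NIPH"]}
    for line in create_summary(samples_matrix_dict, logger):
        print(line)
    # File     Exact match  INF  ASM  ALM  LNF  TPR  NIPH  NIPHEM  PLOT  ERROR
    # sampleA  1            1    0    0    1    0    1     0       0     0
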
@@ -952,24 +1508,55 @@ def create_summary (samples_matrix_dict, logger) :
# Get gene and product annotation for core gene using Prokka #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · #
-### (tsv for some loci? Utils for analyze schema?)
-def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegenus, logger) :
- name_file = os.path.basename(annotation_file).split('.')
- annotation_dir = os.path.join (annotation_dir, 'annotation', name_file[0])
-
- if usegenus == 'true':
- annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir,
- '--genus', genus, '--species', species, '--usegenus',
- '--gcode', '11', '--prefix', name_file[0], '--quiet'])
-
- elif usegenus == 'false':
- annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir,
- '--genus', genus, '--species', species,
- '--gcode', '11', '--prefix', name_file[0], '--quiet'])
+### (tsv for some loci? Utils for analyze schema?)
+def get_gene_annotation(
+ annotation_file, annotation_dir, genus, species, usegenus, logger
+):
+ name_file = os.path.basename(annotation_file).split(".")
+ annotation_dir = os.path.join(annotation_dir, "annotation", name_file[0])
+
+ if usegenus == "true":
+ annotation_result = subprocess.run(
+ [
+ "prokka",
+ annotation_file,
+ "--outdir",
+ annotation_dir,
+ "--genus",
+ genus,
+ "--species",
+ species,
+ "--usegenus",
+ "--gcode",
+ "11",
+ "--prefix",
+ name_file[0],
+ "--quiet",
+ ]
+ )
+
+ elif usegenus == "false":
+ annotation_result = subprocess.run(
+ [
+ "prokka",
+ annotation_file,
+ "--outdir",
+ annotation_dir,
+ "--genus",
+ genus,
+ "--species",
+ species,
+ "--gcode",
+ "11",
+ "--prefix",
+ name_file[0],
+ "--quiet",
+ ]
+ )
annot_tsv = []
- tsv_path = os.path.join (annotation_dir, name_file[0] + '.tsv')
+ tsv_path = os.path.join(annotation_dir, name_file[0] + ".tsv")
try:
with open(tsv_path) as tsvfile:
@@ -978,31 +1565,31 @@ def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegen
annot_tsv.append(line)
if len(annot_tsv) > 1:
-
gene_index = annot_tsv[0].index("gene")
product_index = annot_tsv[0].index("product")
try:
- if '_' in annot_tsv[1][2]:
- gene_annot = annot_tsv[1][gene_index].split('_')[0]
+ if "_" in annot_tsv[1][2]:
+ gene_annot = annot_tsv[1][gene_index].split("_")[0]
else:
gene_annot = annot_tsv[1][gene_index]
except:
- gene_annot = 'Not found by Prokka'
+ gene_annot = "Not found by Prokka"
try:
product_annot = annot_tsv[1][product_index]
except:
- product_annot = 'Not found by Prokka'
+ product_annot = "Not found by Prokka"
else:
- gene_annot = 'Not found by Prokka'
- product_annot = 'Not found by Prokka'
+ gene_annot = "Not found by Prokka"
+ product_annot = "Not found by Prokka"
except:
- gene_annot = 'Not found by Prokka'
- product_annot = 'Not found by Prokka'
+ gene_annot = "Not found by Prokka"
+ product_annot = "Not found by Prokka"
return gene_annot, product_annot
+
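
The two subprocess branches above differ only in whether --usegenus is passed; under that assumption, a hedged refactor could build the command once (run_prokka is a hypothetical helper using the same Prokka options as the function):

    import subprocess

    def run_prokka(annotation_file, annotation_dir, genus, species, usegenus, prefix):
        cmd = ["prokka", annotation_file, "--outdir", annotation_dir,
               "--genus", genus, "--species", species]
        if usegenus == "true":
            cmd.append("--usegenus")
        cmd += ["--gcode", "11", "--prefix", prefix, "--quiet"]
        return subprocess.run(cmd)
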
"""
def get_gene_annotation (annotation_file, annotation_dir, logger) :
name_file = os.path.basename(annotation_file).split('.')
@@ -1079,7 +1666,7 @@ def get_gene_annotation (annotation_file, annotation_dir, logger) :
"""
-def analize_annotation_files (in_file, logger) : ## N
+def analize_annotation_files(in_file, logger): ## N
examiner = GFF.GFFExaminer()
file_fh = open(in_file)
datos = examiner.available_limits(in_file)
@@ -1087,24 +1674,33 @@ def analize_annotation_files (in_file, logger) : ## N
return True
-def get_inferred_allele_number(core_dict, logger): ## N
- #This function will look for the highest locus number and it will return a safe high value
+def get_inferred_allele_number(core_dict, logger): ## N
+ # This function will look for the highest locus number and it will return a safe high value
# that will be added to the schema database
- logger.debug('running get_inferred_allele_number function')
+ logger.debug("running get_inferred_allele_number function")
int_keys = []
for key in core_dict.keys():
int_keys.append(key)
max_value = max(int_keys)
digit_length = len(str(max_value))
- return True #str 1 ( #'1'+ '0'*digit_length + 2)
+ return True # str 1 ( #'1'+ '0'*digit_length + 2)
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
# Get ST profile for each sample based on allele calling results #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_list_files, sample_list_files, logger):
- ## logger
+
+def get_ST_profile(
+ outputdir,
+ profile_csv_path,
+ exact_dict,
+ inf_dict,
+ core_gene_list_files,
+ sample_list_files,
+ logger,
+):
+ ## logger
csv_read = []
ST_profiles_dict = {}
@@ -1118,15 +1714,17 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
for line in csvreader:
csv_read.append(line)
- profile_header = csv_read[0][1:len(core_gene_list_files) + 1]
+ profile_header = csv_read[0][1 : len(core_gene_list_files) + 1]
for ST_index in range(1, len(csv_read)):
ST_profiles_dict[csv_read[ST_index][0]] = {}
for core_index in range(len(profile_header)):
- ST_profiles_dict[csv_read[ST_index][0]][profile_header[core_index]] = csv_read[ST_index][core_index + 1]
+ ST_profiles_dict[csv_read[ST_index][0]][
+ profile_header[core_index]
+ ] = csv_read[ST_index][core_index + 1]
for sample_file in sample_list_files:
- sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1])
+ sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1])
st_counter = 0
for ST in ST_profiles_dict:
@@ -1140,14 +1738,16 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
if core_name in exact_dict[sample_name]:
allele_in_sample = exact_dict[sample_name][core_name][2]
- if not '_' in allele_in_ST:
- if '_' in allele_in_sample:
- allele_in_sample = allele_in_sample.split('_')[1]
+ if not "_" in allele_in_ST:
+ if "_" in allele_in_sample:
+ allele_in_sample = allele_in_sample.split("_")[1]
if st_counter == 0:
if sample_name not in analysis_profiles_dict:
analysis_profiles_dict[sample_name] = {}
- analysis_profiles_dict[sample_name][core_name] = allele_in_sample
+ analysis_profiles_dict[sample_name][
+ core_name
+ ] = allele_in_sample
if allele_in_sample == allele_in_ST:
core_counter += 1
@@ -1165,14 +1765,16 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
allele_in_sample = inf_dict[sample_name][core_name][2]
if sample_name not in analysis_profiles_dict:
analysis_profiles_dict[sample_name] = {}
- analysis_profiles_dict[sample_name][core_name] = allele_in_sample
+ analysis_profiles_dict[sample_name][
+ core_name
+ ] = allele_in_sample
else:
if st_counter == 0:
if sample_name not in analysis_profiles_dict:
analysis_profiles_dict[sample_name] = {}
- if allele_in_ST == 'N' and "allele_in_sample" not in locals():
+ if allele_in_ST == "N" and "allele_in_sample" not in locals():
core_counter += 1
st_counter += 1
@@ -1199,10 +1801,12 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
if sample_name in analysis_profiles_dict:
if len(analysis_profiles_dict[sample_name]) == len(profile_header):
new_st_id = str(len(ST_profiles_dict) + 1)
- ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name]
+ ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[
+ sample_name
+ ]
inf_ST[new_st_id] = analysis_profile_dict[sample_name]
- samples_profiles_dict[sample_name]=new_st_id + "_INF"
+ samples_profiles_dict[sample_name] = new_st_id + "_INF"
if "New" not in count_st:
count_st["New"] = {}
@@ -1211,24 +1815,24 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
count_st["New"][new_st_id] += 1
else:
- samples_profiles_dict[sample_name] = '-'
+ samples_profiles_dict[sample_name] = "-"
if "Unknown" not in count_st:
count_st["Unknown"] = 0
count_st["Unknown"] += 1
else:
- samples_profiles_dict[sample_name] = '-'
+ samples_profiles_dict[sample_name] = "-"
if "Unknown" not in count_st:
count_st["Unknown"] = 0
count_st["Unknown"] += 1
## Create ST profile results report
- save_st_profile_results (outputdir, samples_profiles_dict, logger)
+ save_st_profile_results(outputdir, samples_profiles_dict, logger)
## Obtain interactive piechart
- logger.info('Creating interactive ST results piechart')
- create_sunburst_plot_st (outputdir, count_st, logger)
+ logger.info("Creating interactive ST results piechart")
+ create_sunburst_plot_st(outputdir, count_st, logger)
return True, inf_ST
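
Reduced to its core, the ST lookup above declares a match when every locus in the profile header agrees with the called allele, and otherwise falls back to "-" (the Unknown case). A self-contained sketch with hypothetical loci and profile rows:

    ST_profiles_dict = {"4": {"abcZ": "1", "bglA": "3"}}  # hypothetical profile rows
    sample_alleles = {"abcZ": "1", "bglA": "3"}           # calls for one sample

    def match_st(profiles, alleles):
        for st, profile in profiles.items():
            if all(alleles.get(locus) == allele for locus, allele in profile.items()):
                return st
        return "-"  # the "Unknown" case

    print(match_st(ST_profiles_dict, sample_alleles))  # 4
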
@@ -1237,24 +1841,24 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
# Create ST results report #
# · * · * · * · * · * · * #
-def save_st_profile_results (outputdir, samples_profiles_dict, logger):
- header_stprofile = ['Sample Name', 'ST']
+def save_st_profile_results(outputdir, samples_profiles_dict, logger):
+ header_stprofile = ["Sample Name", "ST"]
- if samples_profiles_dict != '':
+ if samples_profiles_dict != "":
## Saving ST profile to file
- logger.info('Saving ST profile information to file..')
- stprofile_file = os.path.join(outputdir, 'stprofile.tsv')
- with open (stprofile_file , 'w') as st_fh :
- st_fh.write('\t'.join(header_stprofile)+ '\n')
+ logger.info("Saving ST profile information to file..")
+ stprofile_file = os.path.join(outputdir, "stprofile.tsv")
+ with open(stprofile_file, "w") as st_fh:
+ st_fh.write("\t".join(header_stprofile) + "\n")
for sample in sorted(samples_profiles_dict):
- st_fh.write(sample + '\t' + samples_profiles_dict[sample] + '\n')
+ st_fh.write(sample + "\t" + samples_profiles_dict[sample] + "\n")
return True
-def create_sunburst_plot_st (outputdir, count_st, logger):
- ### logger?
+def create_sunburst_plot_st(outputdir, count_st, logger):
+ ### logger?
counts = []
st_ids = ["ST"]
st_labels = ["ST"]
@@ -1263,7 +1867,6 @@ def create_sunburst_plot_st (outputdir, count_st, logger):
total_samples = 0
for st_type in count_st:
-
if type(count_st[st_type]) == dict:
total_st_type_count = sum(count_st[st_type].values())
else:
@@ -1285,17 +1888,19 @@ def create_sunburst_plot_st (outputdir, count_st, logger):
counts.insert(0, total_samples)
- fig = go.Figure(go.Sunburst(
- ids = st_ids,
- labels = st_labels,
- parents = st_parents,
- values = counts,
- branchvalues = "total",
- ))
+ fig = go.Figure(
+ go.Sunburst(
+ ids=st_ids,
+ labels=st_labels,
+ parents=st_parents,
+ values=counts,
+ branchvalues="total",
+ )
+ )
- fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))
+ fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
- plotsdir = os.path.join(outputdir, 'plots', 'samples_st.html')
+ plotsdir = os.path.join(outputdir, "plots", "samples_st.html")
fig.write_html(plotsdir)
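
For reference, a self-contained version of the sunburst call above with toy counts; with branchvalues="total", each parent's value must cover the sum of its children, which is why the total sample count is inserted at index 0:

    import plotly.graph_objects as go

    fig = go.Figure(
        go.Sunburst(
            ids=["ST", "Known", "New", "Unknown"],
            labels=["ST", "Known", "New", "Unknown"],
            parents=["", "ST", "ST", "ST"],
            values=[10, 6, 3, 1],  # 10 = 6 + 3 + 1
            branchvalues="total",
        )
    )
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    fig.write_html("samples_st.html")
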
@@ -1306,32 +1911,38 @@ def create_sunburst_plot_st (outputdir, count_st, logger):
# Update ST profile file adding new ST found #
# · * · * · * · * · * · * · * · * · * · * · #
-def update_st_profile (updateprofile, profile_csv_path, outputdir, inf_ST, core_gene_list_files, logger):
+def update_st_profile(
+ updateprofile, profile_csv_path, outputdir, inf_ST, core_gene_list_files, logger
+):
## Create a copy of ST profile file if updateprofile = 'new'
- if updateprofile == 'new':
+ if updateprofile == "new":
no_updated_profile_csv_path = profile_csv_path
- profile_csv_path_name = os.path.basename(no_updated_profile_csv_path).split('.')[0]
- profile_csv_path = os.path.join(outputdir, profile_csv_path_name + '_updated' + '.csv')
+ profile_csv_path_name = os.path.basename(no_updated_profile_csv_path).split(
+ "."
+ )[0]
+ profile_csv_path = os.path.join(
+ outputdir, profile_csv_path_name + "_updated" + ".csv"
+ )
shutil.copyfile(no_updated_profile_csv_path, profile_csv_path)
- logger.info('Copying ST profile file to update profiles')
+ logger.info("Copying ST profile file to update profiles")
## Update ST profile file
- logger.info('Updating ST profile file adding new INF ST')
+ logger.info("Updating ST profile file adding new INF ST")
- with open (profile_csv_path, 'r') as csvfile:
+ with open(profile_csv_path, "r") as csvfile:
csvreader = csv.reader(csvfile, delimiter="\t")
for line in csvreader:
- profile_header = line[0][1:len(core_gene_list_files) + 1]
+ profile_header = line[0][1 : len(core_gene_list_files) + 1]
break
- with open (profile_csv_path, 'a') as profile_fh:
+ with open(profile_csv_path, "a") as profile_fh:
for ST in inf_ST:
locus_ST_list = []
locus_ST_list.append(ST)
for locus in profile_header:
locus_ST_list.append(inf_ST[ST][locus])
- profile_fh.write ('\t'.join(locus_ST_list)+ '\n')
+ profile_fh.write("\t".join(locus_ST_list) + "\n")
return True
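
One detail worth flagging in the hunk above: line[0][1 : len(core_gene_list_files) + 1] slices the first field (a string), whereas get_ST_profile slices the whole header row (csv_read[0][1 : ...]). If the row slice is the intent here, the reading would look like this sketch (file and locus names are hypothetical):

    import csv

    profile_csv_path = "profile.csv"
    core_gene_list_files = ["abcZ.fasta", "bglA.fasta"]

    with open(profile_csv_path, "r") as csvfile:
        header_row = next(csv.reader(csvfile, delimiter="\t"))
    profile_header = header_row[1 : len(core_gene_list_files) + 1]
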
@@ -1340,312 +1951,760 @@ def update_st_profile (updateprofile, profile_csv_path, outputdir, inf_ST, core_
# Create allele calling results reports #
# · * · * · * · * · * · * · * · * · * · #
-def save_allele_call_results (outputdir, full_gene_list, samples_matrix_dict, exact_dict, paralog_dict, inf_dict, plot_dict, matching_genes_dict, list_asm, list_alm, lnf_tpr_dict, snp_dict, match_alignment_dict, protein_dict, prodigal_report, shorter_seq_coverage, longer_seq_coverage, equal_seq_coverage, shorter_blast_seq_coverage, longer_blast_seq_coverage, equal_blast_seq_coverage, logger):
- header_matching_alleles_contig = ['Sample Name', 'Contig', 'Core Gene', 'Allele', 'Contig Start', 'Contig Stop', 'Direction', 'Codification']
- header_exact = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Query length', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence']
- header_paralogs = ['Core Gene','Sample Name', 'Gene Annotation', 'Product Annotation', 'Paralog Tag', 'ID %', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence']
- header_inferred = ['Core Gene','Sample Name', 'INF tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Predicted Sequence']
- header_asm = ['Core Gene', 'Sample Name', 'ASM tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Additional info', 'Predicted Sequence']
- header_alm = ['Core Gene', 'Sample Name', 'ALM tag', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig', 'Bitscore', 'Query length', 'Contig length', 'New sequence length' , 'Mismatch' , 'gaps', 'Contig start', 'Contig end', 'New sequence', 'Additional info', 'Predicted Sequence']
- header_plot = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Allele', 'Allele Quality', 'Contig','Bitscore', 'Contig start', 'Contig end', 'Sequence', 'Predicted Sequence']
- header_lnf_tpr = ['Core Gene', 'Sample Name', 'Gene Annotation', 'Product Annotation', 'Tag', 'Allele', 'Allele Quality', 'ID %', 'Blast sequence coverage %', 'New sequence coverage %', 'Query length', 'Contig length', 'New sequence length', 'Additional info']
- header_snp = ['Core Gene', 'Sample Name', 'Allele', 'Position', 'Mutation Schema/Sample', 'Codon Schema/Sample','Amino acid in Schema/Sample', 'Mutation type','Annotation Schema/Sample']
- header_protein = ['Core Gene','Sample Name', 'Protein in ' , 'Protein sequence']
- header_match_alignment = ['Core Gene','Sample Name','Alignment', 'Sequence']
- header_stprofile = ['Sample Name', 'ST']
+def save_allele_call_results(
+ outputdir,
+ full_gene_list,
+ samples_matrix_dict,
+ exact_dict,
+ paralog_dict,
+ inf_dict,
+ plot_dict,
+ matching_genes_dict,
+ list_asm,
+ list_alm,
+ lnf_tpr_dict,
+ snp_dict,
+ match_alignment_dict,
+ protein_dict,
+ prodigal_report,
+ shorter_seq_coverage,
+ longer_seq_coverage,
+ equal_seq_coverage,
+ shorter_blast_seq_coverage,
+ longer_blast_seq_coverage,
+ equal_blast_seq_coverage,
+ logger,
+):
+ header_matching_alleles_contig = [
+ "Sample Name",
+ "Contig",
+ "Core Gene",
+ "Allele",
+ "Contig Start",
+ "Contig Stop",
+ "Direction",
+ "Codification",
+ ]
+ header_exact = [
+ "Core Gene",
+ "Sample Name",
+ "Gene Annotation",
+ "Product Annotation",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Query length",
+ "Contig start",
+ "Contig end",
+ "Sequence",
+ "Predicted Sequence",
+ ]
+ header_paralogs = [
+ "Core Gene",
+ "Sample Name",
+ "Gene Annotation",
+ "Product Annotation",
+ "Paralog Tag",
+ "ID %",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Bitscore",
+ "Contig start",
+ "Contig end",
+ "Sequence",
+ "Predicted Sequence",
+ ]
+ header_inferred = [
+ "Core Gene",
+ "Sample Name",
+ "INF tag",
+ "Gene Annotation",
+ "Product Annotation",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Bitscore",
+ "Query length",
+ "Contig length",
+ "New sequence length",
+ "Mismatch",
+ "gaps",
+ "Contig start",
+ "Contig end",
+ "New sequence",
+ "Predicted Sequence",
+ ]
+ header_asm = [
+ "Core Gene",
+ "Sample Name",
+ "ASM tag",
+ "Gene Annotation",
+ "Product Annotation",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Bitscore",
+ "Query length",
+ "Contig length",
+ "New sequence length",
+ "Mismatch",
+ "gaps",
+ "Contig start",
+ "Contig end",
+ "New sequence",
+ "Additional info",
+ "Predicted Sequence",
+ ]
+ header_alm = [
+ "Core Gene",
+ "Sample Name",
+ "ALM tag",
+ "Gene Annotation",
+ "Product Annotation",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Bitscore",
+ "Query length",
+ "Contig length",
+ "New sequence length",
+ "Mismatch",
+ "gaps",
+ "Contig start",
+ "Contig end",
+ "New sequence",
+ "Additional info",
+ "Predicted Sequence",
+ ]
+ header_plot = [
+ "Core Gene",
+ "Sample Name",
+ "Gene Annotation",
+ "Product Annotation",
+ "Allele",
+ "Allele Quality",
+ "Contig",
+ "Bitscore",
+ "Contig start",
+ "Contig end",
+ "Sequence",
+ "Predicted Sequence",
+ ]
+ header_lnf_tpr = [
+ "Core Gene",
+ "Sample Name",
+ "Gene Annotation",
+ "Product Annotation",
+ "Tag",
+ "Allele",
+ "Allele Quality",
+ "ID %",
+ "Blast sequence coverage %",
+ "New sequence coverage %",
+ "Query length",
+ "Contig length",
+ "New sequence length",
+ "Additional info",
+ ]
+ header_snp = [
+ "Core Gene",
+ "Sample Name",
+ "Allele",
+ "Position",
+ "Mutation Schema/Sample",
+ "Codon Schema/Sample",
+ "Amino acid in Schema/Sample",
+ "Mutation type",
+ "Annotation Schema/Sample",
+ ]
+ header_protein = ["Core Gene", "Sample Name", "Protein in ", "Protein sequence"]
+ header_match_alignment = ["Core Gene", "Sample Name", "Alignment", "Sequence"]
+ header_stprofile = ["Sample Name", "ST"]
# Added header_prodigal_report for the prodigal report
-# header_prodigal_report = ['Core gene', 'Sample Name', 'Allele', 'Sequence type', 'BLAST start', 'BLAST end', 'Prodigal start', 'Prodigal end', 'BLAST sequence', 'Prodigal sequence']
+ # header_prodigal_report = ['Core gene', 'Sample Name', 'Allele', 'Sequence type', 'BLAST start', 'BLAST end', 'Prodigal start', 'Prodigal end', 'BLAST sequence', 'Prodigal sequence']
# Added header_newsseq_coverage_report to determine the coverage threshold to impose
-# header_newsseq_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'New sequence length', 'Locus mean', 'Coverage (new sequence/allele)', 'Coverage (new sequence/locus mean)']
+ # header_newsseq_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'New sequence length', 'Locus mean', 'Coverage (new sequence/allele)', 'Coverage (new sequence/locus mean)']
# Added header_blast_coverage_report to determine the coverage threshold to impose
-# header_blast_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'Blast sequence length', 'Locus mean', 'Coverage (blast sequence/allele)', 'Coverage (blast sequence/locus mean)']
+ # header_blast_coverage_report = ['Core gene', 'Sample Name', 'Query length', 'Blast sequence length', 'Locus mean', 'Coverage (blast sequence/allele)', 'Coverage (blast sequence/locus mean)']
## Saving the result information to file
- print ('Saving results to files \n')
- result_file = os.path.join ( outputdir, 'result.tsv')
- logger.info('Saving result information to file..')
- with open (result_file, 'w') as out_fh:
- out_fh.write ('Sample Name\t'+'\t'.join( full_gene_list) + '\n')
- for key in sorted (samples_matrix_dict):
- out_fh.write (key + '\t' + '\t'.join(samples_matrix_dict[key])+ '\n')
+ print("Saving results to files \n")
+ result_file = os.path.join(outputdir, "result.tsv")
+ logger.info("Saving result information to file..")
+ with open(result_file, "w") as out_fh:
+ out_fh.write("Sample Name\t" + "\t".join(full_gene_list) + "\n")
+ for key in sorted(samples_matrix_dict):
+ out_fh.write(key + "\t" + "\t".join(samples_matrix_dict[key]) + "\n")
## Saving exact matches to file
- logger.info('Saving exact matches information to file..')
- exact_file = os.path.join(outputdir, 'exact.tsv')
- with open (exact_file , 'w') as exact_fh :
- exact_fh.write('\t'.join(header_exact)+ '\n')
+ logger.info("Saving exact matches information to file..")
+ exact_file = os.path.join(outputdir, "exact.tsv")
+ with open(exact_file, "w") as exact_fh:
+ exact_fh.write("\t".join(header_exact) + "\n")
for sample in sorted(exact_dict):
for core in sorted(exact_dict[sample]):
- exact_fh.write(core + '\t' + sample + '\t' + '\t'.join(exact_dict[sample][core]) + '\n')
+ exact_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + "\t".join(exact_dict[sample][core])
+ + "\n"
+ )
## Saving paralog alleles to file
- logger.info('Saving paralog information to file..')
- paralog_file = os.path.join(outputdir, 'paralog.tsv')
- with open (paralog_file , 'w') as paralog_fh :
- paralog_fh.write('\t'.join(header_paralogs) + '\n')
- for sample in sorted (paralog_dict) :
- for core in sorted (paralog_dict[sample]):
- for paralog in paralog_dict[sample][core] :
- paralog_fh.write(core + '\t' + sample + '\t' + '\t'.join (paralog) + '\n')
+ logger.info("Saving paralog information to file..")
+ paralog_file = os.path.join(outputdir, "paralog.tsv")
+ with open(paralog_file, "w") as paralog_fh:
+ paralog_fh.write("\t".join(header_paralogs) + "\n")
+ for sample in sorted(paralog_dict):
+ for core in sorted(paralog_dict[sample]):
+ for paralog in paralog_dict[sample][core]:
+ paralog_fh.write(
+ core + "\t" + sample + "\t" + "\t".join(paralog) + "\n"
+ )
## Saving inferred alleles to file
- logger.info('Saving inferred alleles information to file..')
- inferred_file = os.path.join(outputdir, 'inferred_alleles.tsv')
- with open (inferred_file , 'w') as infer_fh :
- infer_fh.write('\t'.join(header_inferred) + '\n')
- for core in sorted (inf_dict) :
- for sample in sorted (inf_dict[core]) :
+ logger.info("Saving inferred alleles information to file..")
+ inferred_file = os.path.join(outputdir, "inferred_alleles.tsv")
+ with open(inferred_file, "w") as infer_fh:
+ infer_fh.write("\t".join(header_inferred) + "\n")
+ for core in sorted(inf_dict):
+ for sample in sorted(inf_dict[core]):
for inferred in inf_dict[core][sample]:
# seq_in_inferred_allele = '\t'.join (inf_dict[sample])
- infer_fh.write(core + '\t' + sample + '\t' + inferred + '\t' + '\t'.join(inf_dict[core][sample][inferred]) + '\n')
+ infer_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + inferred
+ + "\t"
+ + "\t".join(inf_dict[core][sample][inferred])
+ + "\n"
+ )
## Saving PLOTs to file
- logger.info('Saving PLOT information to file..')
- plot_file = os.path.join(outputdir, 'plot.tsv')
- with open (plot_file , 'w') as plot_fh :
- plot_fh.write('\t'.join(header_plot) + '\n')
- for core in sorted (plot_dict) :
- for sample in sorted (plot_dict[core]):
- plot_fh.write(core + '\t' + sample + '\t' + '\t'.join(plot_dict[core][sample]) + '\n')
+ logger.info("Saving PLOT information to file..")
+ plot_file = os.path.join(outputdir, "plot.tsv")
+ with open(plot_file, "w") as plot_fh:
+ plot_fh.write("\t".join(header_plot) + "\n")
+ for core in sorted(plot_dict):
+ for sample in sorted(plot_dict[core]):
+ plot_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + "\t".join(plot_dict[core][sample])
+ + "\n"
+ )
## Saving matching contigs to file
- logger.info('Saving matching information to file..')
- matching_file = os.path.join(outputdir, 'matching_contigs.tsv')
- with open (matching_file , 'w') as matching_fh :
- matching_fh.write('\t'.join(header_matching_alleles_contig) + '\n')
- for samples in sorted (matching_genes_dict) :
- for contigs in matching_genes_dict[samples] :
- for contig in matching_genes_dict[samples] [contigs]:
- matching_alleles = '\t'.join (contig)
- matching_fh.write(samples + '\t' + contigs +'\t' + matching_alleles + '\n')
+ logger.info("Saving matching information to file..")
+ matching_file = os.path.join(outputdir, "matching_contigs.tsv")
+ with open(matching_file, "w") as matching_fh:
+ matching_fh.write("\t".join(header_matching_alleles_contig) + "\n")
+ for samples in sorted(matching_genes_dict):
+ for contigs in matching_genes_dict[samples]:
+ for contig in matching_genes_dict[samples][contigs]:
+ matching_alleles = "\t".join(contig)
+ matching_fh.write(
+ samples + "\t" + contigs + "\t" + matching_alleles + "\n"
+ )
## Saving ASMs to file
- logger.info('Saving asm information to file..')
- asm_file = os.path.join(outputdir, 'asm.tsv')
- with open (asm_file , 'w') as asm_fh :
- asm_fh.write('\t'.join(header_asm)+ '\n')
- for core in list_asm :
- for sample in list_asm[core] :
+ logger.info("Saving asm information to file..")
+ asm_file = os.path.join(outputdir, "asm.tsv")
+ with open(asm_file, "w") as asm_fh:
+ asm_fh.write("\t".join(header_asm) + "\n")
+ for core in list_asm:
+ for sample in list_asm[core]:
for asm in list_asm[core][sample]:
- asm_fh.write(core + '\t' + sample + '\t' + asm + '\t' + '\t'.join(list_asm[core][sample][asm]) + '\n')
+ asm_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + asm
+ + "\t"
+ + "\t".join(list_asm[core][sample][asm])
+ + "\n"
+ )
## Saving ALMs to file
- logger.info('Saving alm information to file..')
- alm_file = os.path.join(outputdir, 'alm.tsv')
- with open (alm_file , 'w') as alm_fh :
- alm_fh.write('\t'.join(header_alm)+ '\n')
- for core in list_alm :
- for sample in list_alm[core] :
+ logger.info("Saving alm information to file..")
+ alm_file = os.path.join(outputdir, "alm.tsv")
+ with open(alm_file, "w") as alm_fh:
+ alm_fh.write("\t".join(header_alm) + "\n")
+ for core in list_alm:
+ for sample in list_alm[core]:
for alm in list_alm[core][sample]:
- alm_fh.write(core + '\t' + sample + '\t' + alm + '\t' + '\t'.join(list_alm[core][sample][alm]) + '\n')
+ alm_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + alm
+ + "\t"
+ + "\t".join(list_alm[core][sample][alm])
+ + "\n"
+ )
## Saving LNFs to file
- logger.info('Saving lnf information to file..')
- lnf_file = os.path.join(outputdir, 'lnf_tpr.tsv')
- with open (lnf_file , 'w') as lnf_fh :
- lnf_fh.write('\t'.join(header_lnf_tpr)+ '\n')
- for core in lnf_tpr_dict :
- for sample in lnf_tpr_dict[core] :
- for lnf in lnf_tpr_dict[core][sample] :
- lnf_fh.write(core + '\t' + sample + '\t' + '\t'.join(lnf) + '\n')
+ logger.info("Saving lnf information to file..")
+ lnf_file = os.path.join(outputdir, "lnf_tpr.tsv")
+ with open(lnf_file, "w") as lnf_fh:
+ lnf_fh.write("\t".join(header_lnf_tpr) + "\n")
+ for core in lnf_tpr_dict:
+ for sample in lnf_tpr_dict[core]:
+ for lnf in lnf_tpr_dict[core][sample]:
+ lnf_fh.write(core + "\t" + sample + "\t" + "\t".join(lnf) + "\n")
## Saving SNPs information to file
- logger.info('Saving SNPs information to file..')
- snp_file = os.path.join(outputdir, 'snp.tsv')
- with open (snp_file , 'w') as snp_fh :
- snp_fh.write('\t'.join(header_snp) + '\n')
- for core in sorted (snp_dict) :
- for sample in sorted (snp_dict[core]):
- for allele_id_snp in snp_dict[core][sample] :
- for snp in snp_dict[core][sample][allele_id_snp] :
- snp_fh.write(core + '\t' + sample + '\t' + allele_id_snp + '\t' + '\t'.join (snp) + '\n')
+ logger.info("Saving SNPs information to file..")
+ snp_file = os.path.join(outputdir, "snp.tsv")
+ with open(snp_file, "w") as snp_fh:
+ snp_fh.write("\t".join(header_snp) + "\n")
+ for core in sorted(snp_dict):
+ for sample in sorted(snp_dict[core]):
+ for allele_id_snp in snp_dict[core][sample]:
+ for snp in snp_dict[core][sample][allele_id_snp]:
+ snp_fh.write(
+ core
+ + "\t"
+ + sample
+ + "\t"
+ + allele_id_snp
+ + "\t"
+ + "\t".join(snp)
+ + "\n"
+ )
## Saving DNA sequences alignments to file
- logger.info('Saving matching alignment information to files..')
- alignment_dir = os.path.join(outputdir,'alignments')
- if os.path.exists(alignment_dir) :
+ logger.info("Saving matching alignment information to files..")
+ alignment_dir = os.path.join(outputdir, "alignments")
+ if os.path.exists(alignment_dir):
shutil.rmtree(alignment_dir)
- logger.info('deleting the alignment files from previous execution')
+ logger.info("deleting the alignment files from previous execution")
os.makedirs(alignment_dir)
- for core in sorted(match_alignment_dict) :
- for sample in sorted (match_alignment_dict[core]) :
- match_alignment_file = os.path.join(alignment_dir, str('match_alignment_' + core + '_' + sample + '.txt'))
- with open(match_alignment_file, 'w') as match_alignment_fh :
- match_alignment_fh.write( '\t'.join(header_match_alignment) + '\n')
- for match_align in match_alignment_dict[core][sample] :
- match_alignment_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(match_align) + '\n')
+ for core in sorted(match_alignment_dict):
+ for sample in sorted(match_alignment_dict[core]):
+ match_alignment_file = os.path.join(
+ alignment_dir, str("match_alignment_" + core + "_" + sample + ".txt")
+ )
+ with open(match_alignment_file, "w") as match_alignment_fh:
+ match_alignment_fh.write("\t".join(header_match_alignment) + "\n")
+ for match_align in match_alignment_dict[core][sample]:
+ match_alignment_fh.write(
+ core + "\t" + sample + "\t" + "\t".join(match_align) + "\n"
+ )
## Saving protein sequences alignments to file
- logger.info('Saving protein information to files..')
- protein_dir = os.path.join(outputdir,'proteins')
- if os.path.exists(protein_dir) :
+ logger.info("Saving protein information to files..")
+ protein_dir = os.path.join(outputdir, "proteins")
+ if os.path.exists(protein_dir):
shutil.rmtree(protein_dir)
- logger.info('deleting the proteins files from previous execution')
+ logger.info("deleting the proteins files from previous execution")
os.makedirs(protein_dir)
- for core in sorted(protein_dict) :
- for sample in sorted (protein_dict[core]) :
- protein_file = os.path.join(protein_dir, str('protein_' + core + '_' + sample + '.txt'))
- with open(protein_file, 'w') as protein_fh :
- protein_fh.write( '\t'.join(header_protein) + '\n')
- for protein in protein_dict[core][sample] :
- protein_fh.write(core + '\t'+ sample +'\t'+ '\t'.join(protein) + '\n')
+ for core in sorted(protein_dict):
+ for sample in sorted(protein_dict[core]):
+ protein_file = os.path.join(
+ protein_dir, str("protein_" + core + "_" + sample + ".txt")
+ )
+ with open(protein_file, "w") as protein_fh:
+ protein_fh.write("\t".join(header_protein) + "\n")
+ for protein in protein_dict[core][sample]:
+ protein_fh.write(
+ core + "\t" + sample + "\t" + "\t".join(protein) + "\n"
+ )
## Saving summary information to file
- logger.info('Saving summary information to file..')
- summary_result = create_summary (samples_matrix_dict, logger)
- summary_file = os.path.join( outputdir, 'summary_result.tsv')
- with open (summary_file , 'w') as summ_fh:
- for line in summary_result :
- summ_fh.write(line + '\n')
+ logger.info("Saving summary information to file..")
+ summary_result = create_summary(samples_matrix_dict, logger)
+ summary_file = os.path.join(outputdir, "summary_result.tsv")
+ with open(summary_file, "w") as summ_fh:
+ for line in summary_result:
+ summ_fh.write(line + "\n")
## Modify the result file to remove the PLOT_ string for creating the file to use in the tree diagram
-# logger.info('Saving result information for tree diagram')
-# tree_diagram_file = os.path.join ( outputdir, 'result_for_tree_diagram.tsv')
-# with open (result_file, 'r') as result_fh:
-# with open(tree_diagram_file, 'w') as td_fh:
-# for line in result_fh:
-# tree_line = line.replace('PLOT_','')
-# td_fh.write(tree_line)
+ # logger.info('Saving result information for tree diagram')
+ # tree_diagram_file = os.path.join ( outputdir, 'result_for_tree_diagram.tsv')
+ # with open (result_file, 'r') as result_fh:
+ # with open(tree_diagram_file, 'w') as td_fh:
+ # for line in result_fh:
+ # tree_line = line.replace('PLOT_','')
+ # td_fh.write(tree_line)
###########################################################################################
# Saving the prodigal report. Temporary
-# prodigal_report_file = os.path.join (outputdir, 'prodigal_report.tsv')
+ # prodigal_report_file = os.path.join (outputdir, 'prodigal_report.tsv')
# saving prodigal predictions to file
-# with open (prodigal_report_file, 'w') as out_fh:
-# out_fh.write ('\t'.join(header_prodigal_report)+ '\n')
-# for prodigal_result in prodigal_report:
-# out_fh.write ('\t'.join(prodigal_result)+ '\n')
+ # with open (prodigal_report_file, 'w') as out_fh:
+ # out_fh.write ('\t'.join(header_prodigal_report)+ '\n')
+ # for prodigal_result in prodigal_report:
+ # out_fh.write ('\t'.join(prodigal_result)+ '\n')
# Saving new_sseq coverage to estimate the threshold to set. Temporary
-# newsseq_coverage_file = os.path.join (outputdir, 'newsseq_coverage_report.tsv')
+ # newsseq_coverage_file = os.path.join (outputdir, 'newsseq_coverage_report.tsv')
# saving the coverage information to file
-# with open (newsseq_coverage_file, 'w') as out_fh:
-# out_fh.write ('\t' + '\t'.join(header_newsseq_coverage_report)+ '\n')
-# for coverage in shorter_seq_coverage:
-# out_fh.write ('Shorter new sequence' + '\t' + '\t'.join(coverage)+ '\n')
-# for coverage in longer_seq_coverage:
-# out_fh.write ('Longer new sequence' + '\t' + '\t'.join(coverage)+ '\n')
-# for coverage in equal_seq_coverage:
-# out_fh.write ('Same length new sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # with open (newsseq_coverage_file, 'w') as out_fh:
+ # out_fh.write ('\t' + '\t'.join(header_newsseq_coverage_report)+ '\n')
+ # for coverage in shorter_seq_coverage:
+ # out_fh.write ('Shorter new sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # for coverage in longer_seq_coverage:
+ # out_fh.write ('Longer new sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # for coverage in equal_seq_coverage:
+ # out_fh.write ('Same length new sequence' + '\t' + '\t'.join(coverage)+ '\n')
# Saving the coverage of the sseq obtained after BLAST to estimate the threshold to set. Temporary
-# blast_coverage_file = os.path.join (outputdir, 'blast_coverage_report.tsv')
+ # blast_coverage_file = os.path.join (outputdir, 'blast_coverage_report.tsv')
# saving the result information to file
-# with open (blast_coverage_file, 'w') as out_fh:
-# out_fh.write ('\t' + '\t'.join(header_blast_coverage_report)+ '\n')
-# for coverage in shorter_blast_seq_coverage:
-# out_fh.write ('Shorter blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
-# for coverage in longer_blast_seq_coverage:
-# out_fh.write ('Longer blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
-# for coverage in equal_blast_seq_coverage:
-# out_fh.write ('Same length blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # with open (blast_coverage_file, 'w') as out_fh:
+ # out_fh.write ('\t' + '\t'.join(header_blast_coverage_report)+ '\n')
+ # for coverage in shorter_blast_seq_coverage:
+ # out_fh.write ('Shorter blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # for coverage in longer_blast_seq_coverage:
+ # out_fh.write ('Longer blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
+ # for coverage in equal_blast_seq_coverage:
+ # out_fh.write ('Same length blast sequence' + '\t' + '\t'.join(coverage)+ '\n')
###########################################################################################
return True
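
All of the report writers above repeat the same open/write-header/join-with-tabs pattern; an equivalent, hedged alternative using the csv module would factor it out (write_tsv is a hypothetical helper, not part of the codebase):

    import csv
    import os

    def write_tsv(outputdir, filename, header, rows):
        path = os.path.join(outputdir, filename)
        with open(path, "w", newline="") as fh:
            writer = csv.writer(fh, delimiter="\t")
            writer.writerow(header)
            writer.writerows(rows)

    # e.g. write_tsv(outputdir, "exact.tsv", header_exact, rows)
    # with rows built from exact_dict as in the loop above
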
-
-def save_allele_calling_plots (outputdir, sample_list_files, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error, logger):
-
+def save_allele_calling_plots(
+ outputdir,
+ sample_list_files,
+ count_exact,
+ count_inf,
+ count_asm,
+ count_alm,
+ count_lnf,
+ count_tpr,
+ count_plot,
+ count_niph,
+ count_niphem,
+ count_error,
+ logger,
+):
## Create result plots directory
- plots_dir = os.path.join(outputdir,'plots')
+ plots_dir = os.path.join(outputdir, "plots")
try:
os.makedirs(plots_dir)
except:
- logger.info('Deleting the results plots directory for a previous execution without cleaning up')
- shutil.rmtree(os.path.join(outputdir, 'plots'))
+ logger.info(
+ "Deleting the results plots directory for a previous execution without cleaning up"
+ )
+ shutil.rmtree(os.path.join(outputdir, "plots"))
try:
os.makedirs(plots_dir)
- logger.info ('Results plots folder %s has been created again', plots_dir)
+ logger.info("Results plots folder %s has been created again", plots_dir)
except:
- logger.info('Unable to create again the results plots directory %s', plots_dir)
- print('Cannot create Results plots directory on ', plots_dir)
+ logger.info(
+ "Unable to create again the results plots directory %s", plots_dir
+ )
+ print("Cannot create Results plots directory on ", plots_dir)
exit(0)
for sample_file in sample_list_files:
- sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1])
+ sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1])
## Obtain interactive piechart
- logger.info('Creating interactive results piecharts')
- create_sunburst_allele_call (outputdir, sample_name, count_exact[sample_name], count_inf[sample_name], count_asm[sample_name], count_alm[sample_name], count_lnf[sample_name], count_tpr[sample_name], count_plot[sample_name], count_niph[sample_name], count_niphem[sample_name], count_error[sample_name])
+ logger.info("Creating interactive results piecharts")
+ create_sunburst_allele_call(
+ outputdir,
+ sample_name,
+ count_exact[sample_name],
+ count_inf[sample_name],
+ count_asm[sample_name],
+ count_alm[sample_name],
+ count_lnf[sample_name],
+ count_tpr[sample_name],
+ count_plot[sample_name],
+ count_niph[sample_name],
+ count_niphem[sample_name],
+ count_error[sample_name],
+ )
return True
-
-def create_sunburst_allele_call (outputdir, sample_name, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error):
- ### logger
-
- total_locus = count_exact['total'] + count_inf['total'] + count_asm['total'] + count_alm['total'] + count_lnf['total'] + count_tpr['total'] + count_plot['total'] \
- + count_niph['total'] + count_niphem['total'] + count_error['total']
-
- tag_counts = [total_locus, count_exact['total'], count_exact['good_quality'], count_exact['bad_quality'], count_exact['no_start'], count_exact['no_start_stop'],
- count_exact['no_stop'], count_exact['multiple_stop'], count_inf['total'], count_inf['good_quality'], count_inf['bad_quality'], count_inf['no_start'],
- count_inf['no_start_stop'], count_inf['no_stop'], count_inf['multiple_stop'], count_asm['total'], count_asm['insertion'], count_asm['deletion'],
- count_asm['substitution'], count_alm['total'], count_alm['insertion'], count_alm['deletion'], count_alm['substitution'], count_plot['total'],
- count_niph['total'], count_niph['good_quality'], count_niph['bad_quality'], count_niph['no_start'], count_niph['no_start_stop'], count_niph['no_stop'],
- count_niph['multiple_stop'], count_niphem['total'], count_niphem['good_quality'], count_niphem['bad_quality'], count_niphem['no_start'],
- count_niphem['no_start_stop'], count_niphem['no_stop'], count_niphem['multiple_stop'], count_lnf['total'], count_lnf['not_found'], count_lnf['low_id'],
- count_lnf['low_coverage'], count_tpr['total'], count_tpr['good_quality'], count_tpr['bad_quality'], count_tpr['no_start'], count_tpr['no_start_stop'],
- count_tpr['no_stop'], count_tpr['multiple_stop'], count_error['total'], count_error['good_quality'], count_error['bad_quality'], count_error['no_start'],
- count_error['no_start_stop'], count_error['no_stop'], count_error['multiple_stop']]
-
- fig=go.Figure(go.Sunburst(
- ids=[
- sample_name, "Exact Match", "Good Quality - Exact Match", "Bad Quality - Exact Match",
- "No start - Bad Quality - Exact Match", "No start-stop - Bad Quality - Exact Match",
- "No stop - Bad Quality - Exact Match", "Multiple stop - Bad Quality - Exact Match",
- "INF", "Good Quality - INF", "Bad Quality - INF", "No start - Bad Quality - INF",
- "No start-stop - Bad Quality - INF", "No stop - Bad Quality - INF", "Multiple stop - Bad Quality - INF",
- "ASM", "Insertion - ASM", "Deletion - ASM", "Substitution - ASM", "ALM", "Insertion - ALM",
- "Deletion - ALM", "Substitution - ALM", "PLOT", "NIPH", "Good Quality - NIPH",
- "Bad Quality - NIPH", "No start - Bad Quality - NIPH", "No start-stop - Bad Quality - NIPH",
- "No stop - Bad Quality - NIPH", "Multiple stop - Bad Quality - NIPH", "NIPHEM",
- "Good Quality - NIPHEM", "Bad Quality - NIPHEM", "No start - Bad Quality - NIPHEM",
- "No start-stop - Bad Quality - NIPHEM", "No stop - Bad Quality - NIPHEM",
- "Multiple stop - Bad Quality - NIPHEM", "LNF", "Not found",
- "Low ID", "Low coverage", "TPR", "Good Quality - TPR", "Bad Quality - TPR",
- "No start - Bad Quality - TPR", "No start-stop - Bad Quality - TPR", "No stop - Bad Quality - TPR",
- "Multiple stop - Bad Quality - TPR", "Error", "Good Quality - Error", "Bad Quality - Error",
- "No start - Bad Quality - Error", "No start-stop - Bad Quality - Error",
- "No stop - Bad Quality - Error", "Multiple stop - Bad Quality - Error"
- ],
- labels= [
- sample_name, "Exact
Match", "Good
Quality", "Bad
Quality",
- "No
start", "No
start-stop", "No
stop", "Multiple
stop",
- "INF", "Good
Quality", "Bad
Quality", "No
start",
- "No
start-stop", "No
stop", "Multiple
stop", "ASM", "Insertion",
- "Deletion", "Substitution", "ALM", "Insertion", "Deletion",
- "Substitution", "PLOT", "NIPH", "Good
Quality", "Bad
Quality",
- "No
start", "No
start-stop", "No
stop", "Multiple
stop",
- "NIPHEM", "Good
Quality", "Bad
Quality", "No
start",
- "No
start-stop", "No
stop", "Multiple
stop", "LNF", "Not
found",
- "Low
ID", "Low
coverage", "TPR", "Good
Quality", "Bad
Quality",
- "No
start", "No
start-stop", "No
stop", "Multiple
stop",
- "Error", "Good
Quality", "Bad
Quality","No
start",
- "No
start-stop", "No
stop", "Multiple
stop"
- ],
- parents=[
- "", sample_name, "Exact Match", "Exact Match", "Bad Quality - Exact Match",
- "Bad Quality - Exact Match", "Bad Quality - Exact Match", "Bad Quality - Exact Match",
- sample_name, "INF", "INF", "Bad Quality - INF", "Bad Quality - INF", "Bad Quality - INF",
- "Bad Quality - INF", sample_name, "ASM", "ASM", "ASM", sample_name, "ALM", "ALM", "ALM", sample_name,
- sample_name, "NIPH", "NIPH", "Bad Quality - NIPH", "Bad Quality - NIPH", "Bad Quality - NIPH",
- "Bad Quality - NIPH", sample_name, "NIPHEM", "NIPHEM", "Bad Quality - NIPHEM",
- "Bad Quality - NIPHEM", "Bad Quality - NIPHEM", "Bad Quality - NIPHEM", sample_name, "LNF",
- "LNF", "LNF", sample_name, "TPR", "TPR", "Bad Quality - TPR", "Bad Quality - TPR",
- "Bad Quality - TPR", "Bad Quality - TPR", sample_name, "Error", "Error", "Bad Quality - Error",
- "Bad Quality - Error", "Bad Quality - Error", "Bad Quality - Error"
- ],
- values=tag_counts,
- branchvalues="total",
- ))
-
- fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))
-
- plotsdir = os.path.join(outputdir, 'plots', sample_name + '.html')
+def create_sunburst_allele_call(
+ outputdir,
+ sample_name,
+ count_exact,
+ count_inf,
+ count_asm,
+ count_alm,
+ count_lnf,
+ count_tpr,
+ count_plot,
+ count_niph,
+ count_niphem,
+ count_error,
+):
+ ### logger
+
+ total_locus = (
+ count_exact["total"]
+ + count_inf["total"]
+ + count_asm["total"]
+ + count_alm["total"]
+ + count_lnf["total"]
+ + count_tpr["total"]
+ + count_plot["total"]
+ + count_niph["total"]
+ + count_niphem["total"]
+ + count_error["total"]
+ )
+
+ tag_counts = [
+ total_locus,
+ count_exact["total"],
+ count_exact["good_quality"],
+ count_exact["bad_quality"],
+ count_exact["no_start"],
+ count_exact["no_start_stop"],
+ count_exact["no_stop"],
+ count_exact["multiple_stop"],
+ count_inf["total"],
+ count_inf["good_quality"],
+ count_inf["bad_quality"],
+ count_inf["no_start"],
+ count_inf["no_start_stop"],
+ count_inf["no_stop"],
+ count_inf["multiple_stop"],
+ count_asm["total"],
+ count_asm["insertion"],
+ count_asm["deletion"],
+ count_asm["substitution"],
+ count_alm["total"],
+ count_alm["insertion"],
+ count_alm["deletion"],
+ count_alm["substitution"],
+ count_plot["total"],
+ count_niph["total"],
+ count_niph["good_quality"],
+ count_niph["bad_quality"],
+ count_niph["no_start"],
+ count_niph["no_start_stop"],
+ count_niph["no_stop"],
+ count_niph["multiple_stop"],
+ count_niphem["total"],
+ count_niphem["good_quality"],
+ count_niphem["bad_quality"],
+ count_niphem["no_start"],
+ count_niphem["no_start_stop"],
+ count_niphem["no_stop"],
+ count_niphem["multiple_stop"],
+ count_lnf["total"],
+ count_lnf["not_found"],
+ count_lnf["low_id"],
+ count_lnf["low_coverage"],
+ count_tpr["total"],
+ count_tpr["good_quality"],
+ count_tpr["bad_quality"],
+ count_tpr["no_start"],
+ count_tpr["no_start_stop"],
+ count_tpr["no_stop"],
+ count_tpr["multiple_stop"],
+ count_error["total"],
+ count_error["good_quality"],
+ count_error["bad_quality"],
+ count_error["no_start"],
+ count_error["no_start_stop"],
+ count_error["no_stop"],
+ count_error["multiple_stop"],
+ ]
+
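+    # Sunburst layout: the sample name sits at the center, each classification
+    # tag (Exact Match, INF, ASM, ALM, PLOT, NIPH, NIPHEM, LNF, TPR, Error)
+    # forms the inner ring and its quality/variant breakdown the outer rings.
+    # The "ids" entries disambiguate labels that repeat across branches, and
+    # branchvalues="total" tells Plotly that every parent value equals the sum
+    # of its children, which tag_counts is expected to satisfy.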
+ fig = go.Figure(
+ go.Sunburst(
+ ids=[
+ sample_name,
+ "Exact Match",
+ "Good Quality - Exact Match",
+ "Bad Quality - Exact Match",
+ "No start - Bad Quality - Exact Match",
+ "No start-stop - Bad Quality - Exact Match",
+ "No stop - Bad Quality - Exact Match",
+ "Multiple stop - Bad Quality - Exact Match",
+ "INF",
+ "Good Quality - INF",
+ "Bad Quality - INF",
+ "No start - Bad Quality - INF",
+ "No start-stop - Bad Quality - INF",
+ "No stop - Bad Quality - INF",
+ "Multiple stop - Bad Quality - INF",
+ "ASM",
+ "Insertion - ASM",
+ "Deletion - ASM",
+ "Substitution - ASM",
+ "ALM",
+ "Insertion - ALM",
+ "Deletion - ALM",
+ "Substitution - ALM",
+ "PLOT",
+ "NIPH",
+ "Good Quality - NIPH",
+ "Bad Quality - NIPH",
+ "No start - Bad Quality - NIPH",
+ "No start-stop - Bad Quality - NIPH",
+ "No stop - Bad Quality - NIPH",
+ "Multiple stop - Bad Quality - NIPH",
+ "NIPHEM",
+ "Good Quality - NIPHEM",
+ "Bad Quality - NIPHEM",
+ "No start - Bad Quality - NIPHEM",
+ "No start-stop - Bad Quality - NIPHEM",
+ "No stop - Bad Quality - NIPHEM",
+ "Multiple stop - Bad Quality - NIPHEM",
+ "LNF",
+ "Not found",
+ "Low ID",
+ "Low coverage",
+ "TPR",
+ "Good Quality - TPR",
+ "Bad Quality - TPR",
+ "No start - Bad Quality - TPR",
+ "No start-stop - Bad Quality - TPR",
+ "No stop - Bad Quality - TPR",
+ "Multiple stop - Bad Quality - TPR",
+ "Error",
+ "Good Quality - Error",
+ "Bad Quality - Error",
+ "No start - Bad Quality - Error",
+ "No start-stop - Bad Quality - Error",
+ "No stop - Bad Quality - Error",
+ "Multiple stop - Bad Quality - Error",
+ ],
+ labels=[
+ sample_name,
+ "Exact
Match",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ "INF",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ "ASM",
+ "Insertion",
+ "Deletion",
+ "Substitution",
+ "ALM",
+ "Insertion",
+ "Deletion",
+ "Substitution",
+ "PLOT",
+ "NIPH",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ "NIPHEM",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ "LNF",
+ "Not
found",
+ "Low
ID",
+ "Low
coverage",
+ "TPR",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ "Error",
+ "Good
Quality",
+ "Bad
Quality",
+ "No
start",
+ "No
start-stop",
+ "No
stop",
+ "Multiple
stop",
+ ],
+ parents=[
+ "",
+ sample_name,
+ "Exact Match",
+ "Exact Match",
+ "Bad Quality - Exact Match",
+ "Bad Quality - Exact Match",
+ "Bad Quality - Exact Match",
+ "Bad Quality - Exact Match",
+ sample_name,
+ "INF",
+ "INF",
+ "Bad Quality - INF",
+ "Bad Quality - INF",
+ "Bad Quality - INF",
+ "Bad Quality - INF",
+ sample_name,
+ "ASM",
+ "ASM",
+ "ASM",
+ sample_name,
+ "ALM",
+ "ALM",
+ "ALM",
+ sample_name,
+ sample_name,
+ "NIPH",
+ "NIPH",
+ "Bad Quality - NIPH",
+ "Bad Quality - NIPH",
+ "Bad Quality - NIPH",
+ "Bad Quality - NIPH",
+ sample_name,
+ "NIPHEM",
+ "NIPHEM",
+ "Bad Quality - NIPHEM",
+ "Bad Quality - NIPHEM",
+ "Bad Quality - NIPHEM",
+ "Bad Quality - NIPHEM",
+ sample_name,
+ "LNF",
+ "LNF",
+ "LNF",
+ sample_name,
+ "TPR",
+ "TPR",
+ "Bad Quality - TPR",
+ "Bad Quality - TPR",
+ "Bad Quality - TPR",
+ "Bad Quality - TPR",
+ sample_name,
+ "Error",
+ "Error",
+ "Bad Quality - Error",
+ "Bad Quality - Error",
+ "Bad Quality - Error",
+ "Bad Quality - Error",
+ ],
+ values=tag_counts,
+ branchvalues="total",
+ )
+ )
+
+ fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
+
+ plotsdir = os.path.join(outputdir, "plots", sample_name + ".html")
fig.write_html(plotsdir)
@@ -1656,11 +2715,19 @@ def create_sunburst_allele_call (outputdir, sample_name, count_exact, count_inf,
# Update core genes schema adding new inferred alleles found for each locus in allele calling analysis #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def update_schema (updateschema, schemadir, outputdir, core_gene_list_files, inferred_alleles_dict, alleles_in_locus_dict, logger):
+def update_schema(
+ updateschema,
+ schemadir,
+ outputdir,
+ core_gene_list_files,
+ inferred_alleles_dict,
+ alleles_in_locus_dict,
+ logger,
+):
if len(inferred_alleles_dict) > 0:
## Create a copy of core genes schema if updateschema = 'new' / 'New'
- if updateschema == 'new':
+ if updateschema == "new":
no_updated_schemadir = schemadir
            ##schemadir_name = os.path.dirname(no_updated_schemadir) ---> could be used if the new schema is ultimately saved in the same directory as the old schema, but for that the path must be checked for a trailing / so it can be stripped before calling dirname
schemadir_name = no_updated_schemadir.split("/")
@@ -1669,52 +2736,82 @@ def update_schema (updateschema, schemadir, outputdir, core_gene_list_files, inf
else:
schemadir_name = schemadir_name[-1]
- schemadir = os.path.join(outputdir, schemadir_name + '_updated')
+ schemadir = os.path.join(outputdir, schemadir_name + "_updated")
try:
shutil.copytree(no_updated_schemadir, schemadir)
- logger.info ('Schema copy %s has been created to update schema', schemadir)
+ logger.info(
+ "Schema copy %s has been created to update schema", schemadir
+ )
except:
- logger.info('Deleting preexisting directory')
+ logger.info("Deleting preexisting directory")
shutil.rmtree(schemadir)
try:
shutil.copytree(no_updated_schemadir, schemadir)
- logger.info ('Schema copy %s has been created to update schema', schemadir)
+ logger.info(
+ "Schema copy %s has been created to update schema", schemadir
+ )
except:
- logger.info('Unable to create schema copy %s', schemadir)
- print('Cannot create schema copy on ', schemadir)
+ logger.info("Unable to create schema copy %s", schemadir)
+ print("Cannot create schema copy on ", schemadir)
exit(0)
## Get INF alleles for each core gene and update each locus fasta file
for core_file in core_gene_list_files:
- core_name = os.path.basename(core_file).split('.')[0]
+ core_name = os.path.basename(core_file).split(".")[0]
if core_name in inferred_alleles_dict:
- logger.info('Updating core gene file %s adding new INF alleles', core_file)
+ logger.info(
+ "Updating core gene file %s adding new INF alleles", core_file
+ )
inf_list = inferred_alleles_dict[core_name]
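+            # Allele IDs in the schema are either plain integers or prefixed
+            # ("<locus>_<number>"); the numeric form is tried first, falling
+            # back to the prefixed form when int() parsing fails.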
try:
- alleles_ids = [int(allele) for allele in alleles_in_locus_dict[core_name]]
+ alleles_ids = [
+ int(allele) for allele in alleles_in_locus_dict[core_name]
+ ]
allele_number = max(alleles_ids)
- locus_schema_file = os.path.join(schemadir, core_name + '.fasta')
- with open (locus_schema_file, 'a') as core_fh:
+ locus_schema_file = os.path.join(schemadir, core_name + ".fasta")
+ with open(locus_schema_file, "a") as core_fh:
for inf in inf_list:
allele_number += 1
- core_fh.write('\n' + '>' + str(allele_number) + ' # ' + 'INF by Taranis' + '\n' + inf + '\n')
+ core_fh.write(
+ "\n"
+ + ">"
+ + str(allele_number)
+ + " # "
+ + "INF by Taranis"
+ + "\n"
+ + inf
+ + "\n"
+ )
except:
- alleles_ids = [int(allele.split('_')[-1]) for allele in alleles_in_locus_dict[core_name]]
+ alleles_ids = [
+ int(allele.split("_")[-1])
+ for allele in alleles_in_locus_dict[core_name]
+ ]
allele_number = max(alleles_ids)
- locus_schema_file = os.path.join(schemadir, core_name + '.fasta')
- with open (locus_schema_file, 'a') as core_fh:
+ locus_schema_file = os.path.join(schemadir, core_name + ".fasta")
+ with open(locus_schema_file, "a") as core_fh:
for inf in inf_list:
allele_number += 1
- complete_inf_id = core_name + '_' + str(allele_number)
- core_fh.write('\n' + '>' + complete_inf_id + ' # ' + 'INF by Taranis' + '\n' + inf + '\n')
+ complete_inf_id = core_name + "_" + str(allele_number)
+ core_fh.write(
+ "\n"
+ + ">"
+ + complete_inf_id
+ + " # "
+ + "INF by Taranis"
+ + "\n"
+ + inf
+ + "\n"
+ )
return True
+
"""
def update_schema (updateschema, schemadir, storedir, core_gene_list_files, inferred_alleles_dict, alleles_in_locus_dict, logger):
@@ -1751,32 +2848,67 @@ def update_schema (updateschema, schemadir, storedir, core_gene_list_files, infe
# Allele calling analysis to find each core gene in schema and its variants in samples #
# · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * · * #
-def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in_locus_dict, contigs_in_sample_dict, query_directory, reference_alleles_directory, blast_db_directory, prodigal_directory, blast_results_seq_directory, blast_results_db_directory, inputdir, outputdir, cpus, percentlength, coverage, evalue, perc_identity_ref, perc_identity_loc, reward, penalty, gapopen, gapextend, max_target_seqs, max_hsps, num_threads, flankingnts, schema_variability, schema_statistics, schema_quality, annotation_core_dict, profile_csv_path, logger ):
- prodigal_report = [] # TEMPORAL. prodigal_report para checkear las secuencias obtenidas con prodigal vs blast y las posiciones sstart y send
+def allele_call_nucleotides(
+ core_gene_list_files,
+ sample_list_files,
+ alleles_in_locus_dict,
+ contigs_in_sample_dict,
+ query_directory,
+ reference_alleles_directory,
+ blast_db_directory,
+ prodigal_directory,
+ blast_results_seq_directory,
+ blast_results_db_directory,
+ inputdir,
+ outputdir,
+ cpus,
+ percentlength,
+ coverage,
+ evalue,
+ perc_identity_ref,
+ perc_identity_loc,
+ reward,
+ penalty,
+ gapopen,
+ gapextend,
+ max_target_seqs,
+ max_hsps,
+ num_threads,
+ flankingnts,
+ schema_variability,
+ schema_statistics,
+ schema_quality,
+ annotation_core_dict,
+ profile_csv_path,
+ logger,
+):
+ prodigal_report = (
+ []
+    ) # TEMPORARY. prodigal_report to check the sequences obtained with prodigal vs blast and the sstart and send positions
    # lists added to compute the mean coverage of new_sseq relative to the allele, to set the minimum coverage below which to call LNF
- shorter_seq_coverage = [] # TEMPORAL
- longer_seq_coverage = [] # TEMPORAL
- equal_seq_coverage = [] # TEMPORAL
+ shorter_seq_coverage = [] # TEMPORAL
+ longer_seq_coverage = [] # TEMPORAL
+ equal_seq_coverage = [] # TEMPORAL
    # lists added to compute the mean coverage of sseq relative to the allele after blast, to set the minimum coverage below which to call LNF
- shorter_blast_seq_coverage = [] # TEMPORAL
- longer_blast_seq_coverage = [] # TEMPORAL
- equal_blast_seq_coverage = [] # TEMPORAL
+ shorter_blast_seq_coverage = [] # TEMPORAL
+ longer_blast_seq_coverage = [] # TEMPORAL
+ equal_blast_seq_coverage = [] # TEMPORAL
full_gene_list = []
- samples_matrix_dict = {} # to keep allele number
- matching_genes_dict = {} # to keep start and stop positions
- exact_dict = {} # c/m: to keep exact matches found for each sample
- inferred_alleles_dict = {} # to keep track of the new inferred alleles
- inf_dict = {} # to keep inferred alleles found for each sample
- paralog_dict = {} # to keep paralogs found for each sample
- asm_dict = {} # c/m: to keep track of asm
- alm_dict = {} # c/m: to keep track of alm
- list_asm = {} # c/m: to keep asm found for each sample
- list_alm = {} # c/m: to keep alm found for each sample
- lnf_tpr_dict = {} # c/m: to keep locus not found for each sample
- plot_dict = {} # c/m: to keep plots for each sample
- snp_dict = {} # c/m: to keep snp information for each sample
+ samples_matrix_dict = {} # to keep allele number
+ matching_genes_dict = {} # to keep start and stop positions
+ exact_dict = {} # c/m: to keep exact matches found for each sample
+ inferred_alleles_dict = {} # to keep track of the new inferred alleles
+ inf_dict = {} # to keep inferred alleles found for each sample
+ paralog_dict = {} # to keep paralogs found for each sample
+ asm_dict = {} # c/m: to keep track of asm
+ alm_dict = {} # c/m: to keep track of alm
+ list_asm = {} # c/m: to keep asm found for each sample
+ list_alm = {} # c/m: to keep alm found for each sample
+ lnf_tpr_dict = {} # c/m: to keep locus not found for each sample
+ plot_dict = {} # c/m: to keep plots for each sample
+ snp_dict = {} # c/m: to keep snp information for each sample
protein_dict = {}
match_alignment_dict = {}
@@ -1794,71 +2926,138 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
- print('Allele calling starts')
- pbar = ProgressBar ()
-
+ print("Allele calling starts")
+ pbar = ProgressBar()
## # # # # # # # # # # # # # # # # # # # # # # # # ##
## Processing the search for each schema core gene ##
## # # # # # # # # # # # # # # # # # # # # # # # # ##
- for core_file in pbar(core_gene_list_files) :
- core_name = os.path.basename(core_file).split('.')[0]
+ for core_file in pbar(core_gene_list_files):
+ core_name = os.path.basename(core_file).split(".")[0]
full_gene_list.append(core_name)
- logger.info('Processing core gene file %s ', core_file)
+ logger.info("Processing core gene file %s ", core_file)
# Get path to this locus fasta file
- locus_alleles_path = os.path.join(query_directory, str(core_name + '.fasta'))
+ locus_alleles_path = os.path.join(query_directory, str(core_name + ".fasta"))
# Get path to reference allele fasta file for this locus
- core_reference_allele_path = os.path.join(reference_alleles_directory, core_name + '.fasta')
+ core_reference_allele_path = os.path.join(
+ reference_alleles_directory, core_name + ".fasta"
+ )
# Get length thresholds for INF, ASM and ALM classification
- max_length_threshold, min_length_threshold = length_thresholds(core_name, schema_statistics, percentlength)
+ max_length_threshold, min_length_threshold = length_thresholds(
+ core_name, schema_statistics, percentlength
+ )
        # Get coverage thresholds for LNF, ASM and ALM classification
- max_coverage_threshold, min_coverage_threshold = length_thresholds(core_name, schema_statistics, coverage)
+ max_coverage_threshold, min_coverage_threshold = length_thresholds(
+ core_name, schema_statistics, coverage
+ )
## # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ##
## Processing the search for each schema core gene in each sample ##
## # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ##
for sample_file in sample_list_files:
- logger.info('Processing sample file %s ', sample_file)
+ logger.info("Processing sample file %s ", sample_file)
- sample_name = '.'.join(os.path.basename(sample_file).split('.')[:-1])
+ sample_name = ".".join(os.path.basename(sample_file).split(".")[:-1])
            # (tag counts for plots)
if sample_name not in count_exact:
- count_exact[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+ count_exact[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
if sample_name not in count_inf:
- count_inf[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+ count_inf[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
if sample_name not in count_asm:
- count_asm[sample_name] = {"insertion" : 0, "deletion" : 0, "substitution" : 0, "total" : 0}
+ count_asm[sample_name] = {
+ "insertion": 0,
+ "deletion": 0,
+ "substitution": 0,
+ "total": 0,
+ }
if sample_name not in count_alm:
- count_alm[sample_name] = {"insertion" : 0, "deletion" : 0, "substitution" : 0, "total" : 0}
+ count_alm[sample_name] = {
+ "insertion": 0,
+ "deletion": 0,
+ "substitution": 0,
+ "total": 0,
+ }
if sample_name not in count_lnf:
- count_lnf[sample_name] = {"not_found" : 0, "low_id" : 0, "low_coverage" : 0, "total" : 0}
+ count_lnf[sample_name] = {
+ "not_found": 0,
+ "low_id": 0,
+ "low_coverage": 0,
+ "total": 0,
+ }
if sample_name not in count_tpr:
- count_tpr[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+ count_tpr[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
if sample_name not in count_plot:
- count_plot[sample_name] = {"total" : 0}
+ count_plot[sample_name] = {"total": 0}
if sample_name not in count_niph:
- count_niph[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+ count_niph[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
if sample_name not in count_niphem:
- count_niphem[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
+ count_niphem[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
if sample_name not in count_error:
- count_error[sample_name] = {"good_quality" : 0, "bad_quality" : 0, "no_start" : 0, "no_start_stop" : 0, "no_stop" : 0, "multiple_stop" : 0, "total" : 0}
-
+ count_error[sample_name] = {
+ "good_quality": 0,
+ "bad_quality": 0,
+ "no_start": 0,
+ "no_start_stop": 0,
+ "no_stop": 0,
+ "multiple_stop": 0,
+ "total": 0,
+ }
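+            # Per-sample tag counters; these feed the interactive sunburst
+            # piecharts generated at the end of the analysis.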
# Initialize the sample list to add the number of alleles and the start, stop positions
if not sample_name in samples_matrix_dict:
@@ -1872,7 +3071,20 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# Sample contigs VS reference allele(s) BLAST for locus detection in sample #
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * #
- cline = NcbiblastnCommandline(db=blast_db_name, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path)
+ cline = NcbiblastnCommandline(
+ db=blast_db_name,
+ evalue=evalue,
+ perc_identity=perc_identity_ref,
+ reward=reward,
+ penalty=penalty,
+ gapopen=gapopen,
+ gapextend=gapextend,
+ outfmt=blast_parameters,
+ max_target_seqs=max_target_seqs,
+ max_hsps=max_hsps,
+ num_threads=num_threads,
+ query=core_reference_allele_path,
+ )
out, err = cline()
out_lines = out.splitlines()
@@ -1881,49 +3093,113 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# ······························································ #
# LNF if there are no BLAST results for this gene in this sample #
# ······························································ #
- if len (out_lines) == 0:
-
+ if len(out_lines) == 0:
                # Try to get the allele number so that a bad-quality assembly does not impact the tree diagram
- cline = NcbiblastnCommandline(db=blast_db_name, evalue=evalue, perc_identity = 70, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=1, max_hsps=1, num_threads=1, query=core_reference_allele_path)
+ cline = NcbiblastnCommandline(
+ db=blast_db_name,
+ evalue=evalue,
+ perc_identity=70,
+ reward=reward,
+ penalty=penalty,
+ gapopen=gapopen,
+ gapextend=gapextend,
+ outfmt=blast_parameters,
+ max_target_seqs=1,
+ max_hsps=1,
+ num_threads=1,
+ query=core_reference_allele_path,
+ )
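+                # Identity is relaxed to 70% and only the single best hit is
+                # kept so the LNF record can still report the closest allele ID.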
out, err = cline()
out_lines = out.splitlines()
- if len (out_lines) > 0 :
-
- for line in out_lines :
- values = line.split('\t')
- if float(values[8]) > bigger_bitscore:
- qseqid , sseqid , pident , qlen , s_length , mismatch , r_gapopen , r_evalue , bitscore , sstart , send , qstart , qend ,sseq , qseq = values
+ if len(out_lines) > 0:
+ for line in out_lines:
+ values = line.split("\t")
+ if float(values[8]) > bigger_bitscore:
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
bigger_bitscore = float(bitscore)
# Keep LNF info
- lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, '-', '-', perc_identity_ref, '-', schema_quality, annotation_core_dict, count_lnf, logger)
+ lnf_tpr_tag(
+ core_name,
+ sample_name,
+ alleles_in_locus_dict,
+ samples_matrix_dict,
+ lnf_tpr_dict,
+ schema_statistics,
+ locus_alleles_path,
+ qseqid,
+ pident,
+ "-",
+ "-",
+ perc_identity_ref,
+ "-",
+ schema_quality,
+ annotation_core_dict,
+ count_lnf,
+ logger,
+ )
else:
# Keep LNF info
- lnf_tpr_tag(core_name, sample_name, '-', samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, '-', '-', '-', '-', '-', '-', schema_quality, annotation_core_dict, count_lnf, logger)
+ lnf_tpr_tag(
+ core_name,
+ sample_name,
+ "-",
+ samples_matrix_dict,
+ lnf_tpr_dict,
+ schema_statistics,
+ locus_alleles_path,
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ schema_quality,
+ annotation_core_dict,
+ count_lnf,
+ logger,
+ )
continue
## Continue classification process if the core gene has been detected in sample after BLAST search
- if len (out_lines) > 0:
-
+ if len(out_lines) > 0:
# Parse contigs for this sample
- #contig_file = os.path.join(inputdir, sample_name + ".fasta") ## parse
- #records = list(SeqIO.parse(contig_file, "fasta")) ## parse
+ # contig_file = os.path.join(inputdir, sample_name + ".fasta") ## parse
+ # records = list(SeqIO.parse(contig_file, "fasta")) ## parse
## Keep BLAST results after locus detection in sample using reference allele
# Path to BLAST results fasta file
- path_to_blast_seq = os.path.join(blast_results_seq_directory, sample_name, core_name + "_blast.fasta")
+ path_to_blast_seq = os.path.join(
+ blast_results_seq_directory, sample_name, core_name + "_blast.fasta"
+ )
- with open (path_to_blast_seq, 'w') as outblast_fh:
+ with open(path_to_blast_seq, "w") as outblast_fh:
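+                    # Each FASTA header packs the 13 BLAST columns behind " # "
+                    # separators so later steps can recover hit coordinates
+                    # without re-running BLAST; the sequences themselves are
+                    # written with the flanking bases added below.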
seq_number = 1
- for line in out_lines :
- values = line.split('\t')
+ for line in out_lines:
+ values = line.split("\t")
qseqid = values[0]
if values[1] not in contigs_in_sample_dict[sample_name]:
- sseqid = '|'.join(values[1].split('|')[1:-1])
+ sseqid = "|".join(values[1].split("|")[1:-1])
else:
sseqid = values[1]
sstart = values[9]
@@ -1932,10 +3208,10 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# Get flanked BLAST sequences from contig for correct allele tagging
accession_sequence = contigs_in_sample_dict[sample_name][sseqid]
- #for record in records: ## parse
- #if record.id == sseqid : ## parse
- #break ## parse
- #accession_sequence = str(record.seq) ## parse
+ # for record in records: ## parse
+ # if record.id == sseqid : ## parse
+ # break ## parse
+ # accession_sequence = str(record.seq) ## parse
if int(send) > int(sstart):
max_index = int(send)
@@ -1946,94 +3222,188 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
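+                # Extend the hit by flankingnts bases on each side, clamping to
+                # the contig boundaries when the hit sits too close to either end.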
if (flankingnts + 1) <= min_index:
if flankingnts <= (len(accession_sequence) - max_index):
- flanked_sseq = accession_sequence[ min_index -1 -flankingnts : max_index + flankingnts ]
+ flanked_sseq = accession_sequence[
+ min_index
+ - 1
+ - flankingnts : max_index
+ + flankingnts
+ ]
else:
- flanked_sseq = accession_sequence[ min_index -1 -flankingnts : ]
+ flanked_sseq = accession_sequence[
+ min_index - 1 - flankingnts :
+ ]
else:
- flanked_sseq = accession_sequence[ : max_index + flankingnts ]
-
- seq_id = str(seq_number) + '_' + sseqid
- outblast_fh.write('>' + seq_id + ' # ' + ' # '.join(values[0:13]) + '\n' + flanked_sseq + '\n' + '\n' )
+ flanked_sseq = accession_sequence[: max_index + flankingnts]
+
+ seq_id = str(seq_number) + "_" + sseqid
+ outblast_fh.write(
+ ">"
+ + seq_id
+ + " # "
+ + " # ".join(values[0:13])
+ + "\n"
+ + flanked_sseq
+ + "\n"
+ + "\n"
+ )
seq_number += 1
## Create local BLAST database for BLAST results after locus detection in sample using reference allele
db_name = os.path.join(blast_results_db_directory, sample_name)
- if not create_blastdb(path_to_blast_seq, db_name, 'nucl', logger):
- print('Error when creating the blastdb for blast results file for locus %s at sample %s. Check log file for more information. \n ', core_name, sample_name)
+ if not create_blastdb(path_to_blast_seq, db_name, "nucl", logger):
+                print(
+                    "Error when creating the blastdb for blast results file for locus %s at sample %s. Check log file for more information. \n "
+                    % (core_name, sample_name)
+                )
return False
# Path to local BLAST database for BLAST results after locus detection in sample using reference allele
- locus_blast_db_name = os.path.join(blast_results_db_directory, sample_name, os.path.basename(core_name) + '_blast', os.path.basename(core_name) + '_blast')
-
+ locus_blast_db_name = os.path.join(
+ blast_results_db_directory,
+ sample_name,
+ os.path.basename(core_name) + "_blast",
+ os.path.basename(core_name) + "_blast",
+ )
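+            # Second BLAST stage: all known alleles of this locus are searched
+            # against the flanked-hit database built above, refining the
+            # first-stage locus detection into an allele-level call.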
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * #
# BLAST result sequences VS ALL alleles in locus BLAST for allele identification detection in sample #
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * #
- cline = NcbiblastnCommandline(db=locus_blast_db_name, evalue=evalue, perc_identity=perc_identity_loc, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt = blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=locus_alleles_path)
+ cline = NcbiblastnCommandline(
+ db=locus_blast_db_name,
+ evalue=evalue,
+ perc_identity=perc_identity_loc,
+ reward=reward,
+ penalty=penalty,
+ gapopen=gapopen,
+ gapextend=gapextend,
+ outfmt=blast_parameters,
+ max_target_seqs=max_target_seqs,
+ max_hsps=max_hsps,
+ num_threads=num_threads,
+ query=locus_alleles_path,
+ )
out, err = cline()
out_lines = out.splitlines()
- allele_found = {} # To keep filtered BLAST results
+ allele_found = {} # To keep filtered BLAST results
## Check if there is any BLAST result with ID = 100 ##
for line in out_lines:
-
- values = line.split('\t')
+ values = line.split("\t")
pident = values[2]
if float(pident) == 100:
-
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
# Parse core gene fasta file to get matching allele sequence and length
- #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
- #for allele in alleles_in_locus : ## parse
- #if allele.id == qseqid : ## parse
- #break ## comentar parse
- #matching_allele_seq = str(allele.seq) ## parse
- #matching_allele_length = len(matching_allele_seq) ## parse
+ # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
+ # for allele in alleles_in_locus : ## parse
+ # if allele.id == qseqid : ## parse
+                        # break  ## comment out, parse
+ # matching_allele_seq = str(allele.seq) ## parse
+ # matching_allele_length = len(matching_allele_seq) ## parse
matching_allele_seq = alleles_in_locus_dict[core_name][qseqid]
matching_allele_length = len(matching_allele_seq)
# Keep BLAST results with ID = 100 and same length as matching allele
if int(s_length) == matching_allele_length:
- #get_blast_results (values, records, allele_found, logger)
- get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger)
+ # get_blast_results (values, records, allele_found, logger)
+ get_blast_results(
+ sample_name,
+ values,
+ contigs_in_sample_dict,
+ allele_found,
+ logger,
+ )
# ·································································································································· #
# NIPHEM (paralog) if there are multiple BLAST results with ID = 100 and same length as matching allele for this gene in this sample #
# ·································································································································· #
if len(allele_found) > 1:
-
# Keep NIPHEM info
- paralog_exact_tag(sample_name, core_name, 'NIPHEM', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niphem, logger)
+ paralog_exact_tag(
+ sample_name,
+ core_name,
+ "NIPHEM",
+ schema_quality,
+ matching_genes_dict,
+ samples_matrix_dict,
+ allele_found,
+ paralog_dict,
+ prodigal_report,
+ prodigal_directory,
+ blast_parameters,
+ annotation_core_dict,
+ count_niphem,
+ logger,
+ )
continue
## Check for possible paralogs with ID < 100 if there is only one BLAST result with ID = 100 and same length as matching allele
- elif len(allele_found) == 1 :
+ elif len(allele_found) == 1:
+ for line in out_lines:
+ values = line.split("\t")
- for line in out_lines :
- values = line.split('\t')
-
- sseq_no_gaps = values[13].replace('-', '')
+ sseq_no_gaps = values[13].replace("-", "")
s_length_no_gaps = len(sseq_no_gaps)
# Keep BLAST result if its coverage is within min and max thresholds
- if min_length_threshold <= s_length_no_gaps <= max_length_threshold:
- #get_blast_results (values, records, allele_found, logger)
- get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger)
+ if (
+ min_length_threshold
+ <= s_length_no_gaps
+ <= max_length_threshold
+ ):
+ # get_blast_results (values, records, allele_found, logger)
+ get_blast_results(
+ sample_name,
+ values,
+ contigs_in_sample_dict,
+ allele_found,
+ logger,
+ )
# ································································ #
                        # EXACT MATCH if there is no paralog for this gene in this sample #
# ································································ #
- if len(allele_found) == 1 :
-
- paralog_exact_tag(sample_name, core_name, 'EXACT', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, exact_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_exact, logger)
+ if len(allele_found) == 1:
+ paralog_exact_tag(
+ sample_name,
+ core_name,
+ "EXACT",
+ schema_quality,
+ matching_genes_dict,
+ samples_matrix_dict,
+ allele_found,
+ exact_dict,
+ prodigal_report,
+ prodigal_directory,
+ blast_parameters,
+ annotation_core_dict,
+ count_exact,
+ logger,
+ )
continue
@@ -2041,111 +3411,267 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
            # NIPH if there are paralogs with ID < 100 for this gene in this sample #
# ··········································································· #
else:
-
- paralog_exact_tag(sample_name, core_name, 'NIPH', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niph, logger)
+ paralog_exact_tag(
+ sample_name,
+ core_name,
+ "NIPH",
+ schema_quality,
+ matching_genes_dict,
+ samples_matrix_dict,
+ allele_found,
+ paralog_dict,
+ prodigal_report,
+ prodigal_directory,
+ blast_parameters,
+ annotation_core_dict,
+ count_niph,
+ logger,
+ )
continue
## Look for the best BLAST result if there are no results with ID = 100 ##
elif len(allele_found) == 0:
-
bigger_bitscore_seq_values = []
- for line in out_lines :
- values = line.split('\t')
+ for line in out_lines:
+ values = line.split("\t")
- if float(values[8]) > bigger_bitscore:
- s_length_no_gaps = len(values[13].replace('-', ''))
+ if float(values[8]) > bigger_bitscore:
+ s_length_no_gaps = len(values[13].replace("-", ""))
# Keep BLAST result if its coverage is within min and max thresholds and its bitscore is bigger than the one previously kept
- if min_coverage_threshold <= s_length_no_gaps <= max_coverage_threshold:
+ if (
+ min_coverage_threshold
+ <= s_length_no_gaps
+ <= max_coverage_threshold
+ ):
bigger_bitscore_seq_values = values
bigger_bitscore = float(bigger_bitscore_seq_values[8])
                        ## Check if the best BLAST result outside the coverage thresholds is a possible PLOT or an LNF due to low coverage ##
- #if len(allele_found) == 0:
+ # if len(allele_found) == 0:
if len(bigger_bitscore_seq_values) == 0:
-
                            # Look for the best-bitscore BLAST result outside the coverage thresholds to check for a possible PLOT or report an LNF due to low coverage
- for line in out_lines :
- values = line.split('\t')
-
- if float(values[8]) > bigger_bitscore:
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values
- bigger_bitscore_seq_values_out_cov = values ###
+ for line in out_lines:
+ values = line.split("\t")
+
+ if float(values[8]) > bigger_bitscore:
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
+ bigger_bitscore_seq_values_out_cov = values ###
bigger_bitscore = float(bitscore)
# Get BLAST values relatives to contig for bigger bitscore result
- lnf_plot_found = {} ###
-
- get_blast_results (sample_name, bigger_bitscore_seq_values_out_cov, contigs_in_sample_dict, lnf_plot_found, logger) ###
-
- allele_id = str(list(lnf_plot_found.keys())[0]) ###
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = lnf_plot_found[allele_id]
+ lnf_plot_found = {} ###
+
+ get_blast_results(
+ sample_name,
+ bigger_bitscore_seq_values_out_cov,
+ contigs_in_sample_dict,
+ lnf_plot_found,
+ logger,
+ ) ###
+
+ allele_id = str(list(lnf_plot_found.keys())[0]) ###
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = lnf_plot_found[allele_id]
# Get contig sequence and length for best bitscore BLAST result ID
- #for record in records: ## parse
- #if record.id == sseqid : ## parse
- #break ## parse
- #accession_sequence = record.seq ## parse
- #length_sseqid = len(accession_sequence) ## parse
+ # for record in records: ## parse
+ # if record.id == sseqid : ## parse
+ # break ## parse
+ # accession_sequence = record.seq ## parse
+ # length_sseqid = len(accession_sequence) ## parse
accession_sequence = contigs_in_sample_dict[sample_name][sseqid]
length_sseqid = len(accession_sequence)
                        # Check if the best BLAST result outside the coverage thresholds is a possible PLOT. If so, keep the result info for later PLOT classification
- if int(sstart) == length_sseqid or int(send) == length_sseqid or int(sstart) == 1 or int(send) == 1:
- bigger_bitscore_seq_values = bigger_bitscore_seq_values_out_cov ###
+ if (
+ int(sstart) == length_sseqid
+ or int(send) == length_sseqid
+ or int(sstart) == 1
+ or int(send) == 1
+ ):
+ bigger_bitscore_seq_values = (
+ bigger_bitscore_seq_values_out_cov ###
+ )
# ·············································································································································· #
                        # LNF if there are no BLAST results within the coverage thresholds for this gene in this sample and the best out-of-threshold result is not a possible PLOT #
# ·············································································································································· #
else:
# Get sequence length
- s_length_no_gaps = len(bigger_bitscore_seq_values_out_cov[13].replace('-', ''))
+ s_length_no_gaps = len(
+ bigger_bitscore_seq_values_out_cov[13].replace("-", "")
+ )
# Keep LNF info
- lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length_no_gaps, '-', '-', coverage, schema_quality, annotation_core_dict, count_lnf, logger)
+ lnf_tpr_tag(
+ core_name,
+ sample_name,
+ alleles_in_locus_dict,
+ samples_matrix_dict,
+ lnf_tpr_dict,
+ schema_statistics,
+ locus_alleles_path,
+ qseqid,
+ pident,
+ s_length_no_gaps,
+ "-",
+ "-",
+ coverage,
+ schema_quality,
+ annotation_core_dict,
+ count_lnf,
+ logger,
+ )
## Keep result with bigger bitscore in allele_found dict and look for possible paralogs ##
if len(bigger_bitscore_seq_values) > 0:
-
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = bigger_bitscore_seq_values
-
- #get_blast_results (bigger_bitscore_seq_values, records, allele_found, logger)
- get_blast_results (sample_name, bigger_bitscore_seq_values, contigs_in_sample_dict, allele_found, logger)
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = bigger_bitscore_seq_values
+
+ # get_blast_results (bigger_bitscore_seq_values, records, allele_found, logger)
+ get_blast_results(
+ sample_name,
+ bigger_bitscore_seq_values,
+ contigs_in_sample_dict,
+ allele_found,
+ logger,
+ )
# Possible paralogs search
- for line in out_lines :
- values = line.split('\t')
-
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = values
- sseq_no_gaps = sseq.replace('-', '')
+ for line in out_lines:
+ values = line.split("\t")
+
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = values
+ sseq_no_gaps = sseq.replace("-", "")
s_length_no_gaps = len(sseq_no_gaps)
- if min_length_threshold <= s_length_no_gaps <= max_length_threshold:
-
- #get_blast_results (values, records, allele_found, logger)
- get_blast_results (sample_name, values, contigs_in_sample_dict, allele_found, logger)
+ if (
+ min_length_threshold
+ <= s_length_no_gaps
+ <= max_length_threshold
+ ):
+ # get_blast_results (values, records, allele_found, logger)
+ get_blast_results(
+ sample_name,
+ values,
+ contigs_in_sample_dict,
+ allele_found,
+ logger,
+ )
# ····························································· #
                        # NIPH if there are paralogs for this gene in this sample #
# ····························································· #
- if len(allele_found) > 1 :
-
- paralog_exact_tag(sample_name, core_name, 'NIPH', schema_quality, matching_genes_dict, samples_matrix_dict, allele_found, paralog_dict, prodigal_report, prodigal_directory, blast_parameters, annotation_core_dict, count_niph, logger)
+ if len(allele_found) > 1:
+ paralog_exact_tag(
+ sample_name,
+ core_name,
+ "NIPH",
+ schema_quality,
+ matching_genes_dict,
+ samples_matrix_dict,
+ allele_found,
+ paralog_dict,
+ prodigal_report,
+ prodigal_directory,
+ blast_parameters,
+ annotation_core_dict,
+ count_niph,
+ logger,
+ )
continue
## Continue classification if there are no paralogs ##
- elif len(allele_found) == 1 :
-
+ elif len(allele_found) == 1:
allele_id = str(list(allele_found.keys())[0])
- qseqid, sseqid, pident, qlen, s_length, mismatch, r_gapopen, r_evalue, bitscore, sstart, send, qstart, qend, sseq, qseq = allele_found[allele_id]
-
- sseq_no_gaps = sseq.replace('-', '')
+ (
+ qseqid,
+ sseqid,
+ pident,
+ qlen,
+ s_length,
+ mismatch,
+ r_gapopen,
+ r_evalue,
+ bitscore,
+ sstart,
+ send,
+ qstart,
+ qend,
+ sseq,
+ qseq,
+ ) = allele_found[allele_id]
+
+ sseq_no_gaps = sseq.replace("-", "")
s_length_no_gaps = len(sseq_no_gaps)
# Get matching allele quality
@@ -2153,48 +3679,99 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# Get matching allele sequence and length
- #alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
- #for allele in alleles_in_locus : ## parse
- #if allele.id == qseqid : ## parse
- #break ## parse
- #matching_allele_seq = allele.seq ## parse
- #matching_allele_length = len(matching_allele_seq) ## parse
+ # alleles_in_locus = list (SeqIO.parse(locus_alleles_path, "fasta")) ## parse
+ # for allele in alleles_in_locus : ## parse
+ # if allele.id == qseqid : ## parse
+ # break ## parse
+ # matching_allele_seq = allele.seq ## parse
+ # matching_allele_length = len(matching_allele_seq) ## parse
- matching_allele_seq = alleles_in_locus_dict [core_name][qseqid]
+ matching_allele_seq = alleles_in_locus_dict[core_name][
+ qseqid
+ ]
matching_allele_length = len(matching_allele_seq)
# Get contig sequence and length for ID found in BLAST
- #for record in records: ## parse
- #if record.id == sseqid : ## parse
- #break ## parse
- #accession_sequence = record.seq ## parse
- #length_sseqid = len(accession_sequence) ## parse
+ # for record in records: ## parse
+ # if record.id == sseqid : ## parse
+ # break ## parse
+ # accession_sequence = record.seq ## parse
+ # length_sseqid = len(accession_sequence) ## parse
- accession_sequence = contigs_in_sample_dict[sample_name][sseqid]
+ accession_sequence = contigs_in_sample_dict[sample_name][
+ sseqid
+ ]
length_sseqid = len(accession_sequence)
# ········································································································· #
# PLOT if found sequence is shorter than matching allele and it is located on the edge of the sample contig #
# ········································································································· #
- if int(sstart) == length_sseqid or int(send) == length_sseqid or int(sstart) == 1 or int(send) == 1:
+ if (
+ int(sstart) == length_sseqid
+ or int(send) == length_sseqid
+ or int(sstart) == 1
+ or int(send) == 1
+ ):
if int(s_length) < matching_allele_length:
-
                            ### extract the prodigal seq for PLOT?
# Get prodigal predicted sequence if matching allele quality is "bad quality"
- if 'bad_quality' in allele_quality:
- complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq_no_gaps, sseqid, prodigal_directory, sample_name, blast_parameters, logger)
+ if "bad_quality" in allele_quality:
+ (
+ complete_predicted_seq,
+ start_prodigal,
+ end_prodigal,
+ ) = get_prodigal_sequence(
+ sseq_no_gaps,
+ sseqid,
+ prodigal_directory,
+ sample_name,
+ blast_parameters,
+ logger,
+ )
# Keep info for prodigal report
- prodigal_report.append([core_name, sample_name, qseqid, 'PLOT', sstart, send, start_prodigal, end_prodigal, sseq_no_gaps, complete_predicted_seq])
+ prodigal_report.append(
+ [
+ core_name,
+ sample_name,
+ qseqid,
+ "PLOT",
+ sstart,
+ send,
+ start_prodigal,
+ end_prodigal,
+ sseq_no_gaps,
+ complete_predicted_seq,
+ ]
+ )
else:
- complete_predicted_seq = '-'
- start_prodigal = '-'
- end_prodigal = '-'
+ complete_predicted_seq = "-"
+ start_prodigal = "-"
+ end_prodigal = "-"
# Keep PLOT info
- inf_asm_alm_tag(core_name, sample_name, 'PLOT', allele_found[allele_id], allele_quality, '-', matching_allele_length, '-', plot_dict, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_plot, logger)
+ inf_asm_alm_tag(
+ core_name,
+ sample_name,
+ "PLOT",
+ allele_found[allele_id],
+ allele_quality,
+ "-",
+ matching_allele_length,
+ "-",
+ plot_dict,
+ samples_matrix_dict,
+ matching_genes_dict,
+ prodigal_report,
+ start_prodigal,
+ end_prodigal,
+ complete_predicted_seq,
+ annotation_core_dict,
+ count_plot,
+ logger,
+ )
continue
@@ -2203,121 +3780,345 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# * * * * * * * * * * * * * * * * * * * * #
## Get Prodigal predicted sequence ##
- complete_predicted_seq, start_prodigal, end_prodigal = get_prodigal_sequence(sseq_no_gaps, sseqid, prodigal_directory, sample_name, blast_parameters, logger)
+ (
+ complete_predicted_seq,
+ start_prodigal,
+ end_prodigal,
+ ) = get_prodigal_sequence(
+ sseq_no_gaps,
+ sseqid,
+ prodigal_directory,
+ sample_name,
+ blast_parameters,
+ logger,
+ )
                        ## Search for new stop codon using contig sequence info ##
# Check matching allele sequence direction
- query_direction = check_sequence_order(matching_allele_seq, logger)
+ query_direction = check_sequence_order(
+ matching_allele_seq, logger
+ )
# Get extended BLAST sequence for stop codon search
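+                        # The contig is sliced from the hit boundary to the
+                        # relevant end and, for reverse-oriented matches,
+                        # reverse-complemented so the stop codon scan always
+                        # reads 5' -> 3'.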
- if query_direction == 'reverse':
- if int(send) > int (sstart):
- sample_gene_sequence = accession_sequence[ : int(send) ]
- sample_gene_sequence = str(Seq.Seq(sample_gene_sequence).reverse_complement())
+ if query_direction == "reverse":
+ if int(send) > int(sstart):
+ sample_gene_sequence = accession_sequence[
+ : int(send)
+ ]
+ sample_gene_sequence = str(
+ Seq.Seq(
+ sample_gene_sequence
+ ).reverse_complement()
+ )
else:
- sample_gene_sequence = accession_sequence[ int(send) -1 : ]
+ sample_gene_sequence = accession_sequence[
+ int(send) - 1 :
+ ]
else:
- if int(sstart) > int (send):
- sample_gene_sequence = accession_sequence[ : int(sstart) ]
- sample_gene_sequence = str(Seq.Seq(sample_gene_sequence).reverse_complement())
+ if int(sstart) > int(send):
+ sample_gene_sequence = accession_sequence[
+ : int(sstart)
+ ]
+ sample_gene_sequence = str(
+ Seq.Seq(
+ sample_gene_sequence
+ ).reverse_complement()
+ )
else:
- sample_gene_sequence = accession_sequence[ int(sstart) -1 : ]
+ sample_gene_sequence = accession_sequence[
+ int(sstart) - 1 :
+ ]
# Get new stop codon index
stop_index = get_stop_codon_index(sample_gene_sequence)
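+                        # stop_index is False when no in-frame stop codon is
+                        # found; otherwise it marks the stop codon position, and
+                        # the +3 below keeps that codon in the new sequence.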
## Classification of final new sequence if it is found ##
if stop_index != False:
- new_sequence_length = stop_index +3
- new_sseq = str(sample_gene_sequence[0:new_sequence_length])
+ new_sequence_length = stop_index + 3
+ new_sseq = str(
+ sample_gene_sequence[0:new_sequence_length]
+ )
#########################################################################################################################
                            ### c/m: added to determine which coverage threshold to use. TEMPORARY
- new_sseq_coverage = new_sequence_length/matching_allele_length ### introduciendo coverage new_sseq /// debería ser con respecto a la media?
+ new_sseq_coverage = (
+ new_sequence_length / matching_allele_length
+                            ) ### introducing new_sseq coverage /// should it be relative to the mean?
if new_sseq_coverage < 1:
- shorter_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])])
+ shorter_seq_coverage.append(
+ [
+ core_name,
+ sample_name,
+ str(matching_allele_length),
+ str(new_sequence_length),
+ str(schema_statistics[core_name][0]),
+ str(new_sseq_coverage),
+ str(
+ new_sequence_length
+ / schema_statistics[core_name][0]
+ ),
+ ]
+ )
elif new_sseq_coverage > 1:
- longer_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])])
+ longer_seq_coverage.append(
+ [
+ core_name,
+ sample_name,
+ str(matching_allele_length),
+ str(new_sequence_length),
+ str(schema_statistics[core_name][0]),
+ str(new_sseq_coverage),
+ str(
+ new_sequence_length
+ / schema_statistics[core_name][0]
+ ),
+ ]
+ )
elif new_sseq_coverage == 1:
- equal_seq_coverage.append([core_name, sample_name, str(matching_allele_length), str(new_sequence_length), str(schema_statistics[core_name][0]), str(new_sseq_coverage), str(new_sequence_length/schema_statistics[core_name][0])])
+ equal_seq_coverage.append(
+ [
+ core_name,
+ sample_name,
+ str(matching_allele_length),
+ str(new_sequence_length),
+ str(schema_statistics[core_name][0]),
+ str(new_sseq_coverage),
+ str(
+ new_sequence_length
+ / schema_statistics[core_name][0]
+ ),
+ ]
+ )
#########################################################################################################################
# Get and keep SNP and DNA and protein alignment
- keep_snp_alignment_info(sseq, new_sseq, matching_allele_seq, qseqid, query_direction, core_name, sample_name, reward, penalty, gapopen, gapextend, snp_dict, match_alignment_dict, protein_dict, logger)
+ keep_snp_alignment_info(
+ sseq,
+ new_sseq,
+ matching_allele_seq,
+ qseqid,
+ query_direction,
+ core_name,
+ sample_name,
+ reward,
+ penalty,
+ gapopen,
+ gapextend,
+ snp_dict,
+ match_alignment_dict,
+ protein_dict,
+ logger,
+ )
# ····································································································· #
# INF if final new sequence length is within min and max length thresholds for this gene in this sample #
# ····································································································· #
- if min_length_threshold <= new_sequence_length <= max_length_threshold:
-
+ if (
+ min_length_threshold
+ <= new_sequence_length
+ <= max_length_threshold
+ ):
# Keep INF info
- inf_asm_alm_tag(core_name, sample_name, 'INF', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, inferred_alleles_dict, inf_dict, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_inf, logger) ### introducido start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal
+ inf_asm_alm_tag(
+ core_name,
+ sample_name,
+ "INF",
+ allele_found[allele_id],
+ allele_quality,
+ new_sseq,
+ matching_allele_length,
+ inferred_alleles_dict,
+ inf_dict,
+ samples_matrix_dict,
+ matching_genes_dict,
+ prodigal_report,
+ start_prodigal,
+ end_prodigal,
+ complete_predicted_seq,
+ annotation_core_dict,
+ count_inf,
+ logger,
+                            ) ### start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report passed as arguments to inf_asm_alm_tag for the prodigal report, temporary
# ············································································································································ #
# ASM if final new sequence length is under min length threshold but its coverage is above min coverage threshold for this gene in this sample #
# ············································································································································ #
- elif min_coverage_threshold <= new_sequence_length < min_length_threshold:
-
+ elif (
+ min_coverage_threshold
+ <= new_sequence_length
+ < min_length_threshold
+ ):
# Keep ASM info
- inf_asm_alm_tag(core_name, sample_name, 'ASM', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, asm_dict, list_asm, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_asm, logger)
+ inf_asm_alm_tag(
+ core_name,
+ sample_name,
+ "ASM",
+ allele_found[allele_id],
+ allele_quality,
+ new_sseq,
+ matching_allele_length,
+ asm_dict,
+ list_asm,
+ samples_matrix_dict,
+ matching_genes_dict,
+ prodigal_report,
+ start_prodigal,
+ end_prodigal,
+ complete_predicted_seq,
+ annotation_core_dict,
+ count_asm,
+ logger,
+ )
# ············································································································································ #
# ALM if final new sequence length is above max length threshold but its coverage is under max coverage threshold for this gene in this sample #
# ············································································································································ #
- elif max_length_threshold < new_sequence_length <= max_coverage_threshold:
-
+ elif (
+ max_length_threshold
+ < new_sequence_length
+ <= max_coverage_threshold
+ ):
# Keep ALM info
- inf_asm_alm_tag(core_name, sample_name, 'ALM', allele_found[allele_id], allele_quality, new_sseq, matching_allele_length, alm_dict, list_alm, samples_matrix_dict, matching_genes_dict, prodigal_report, start_prodigal, end_prodigal, complete_predicted_seq, annotation_core_dict, count_alm, logger) ### introducido start_prodigal, end_prodigal, complete_predicted_seq, prodigal_report como argumento a inf_asm_alm_tag para report prodigal, temporal
+ inf_asm_alm_tag(
+ core_name,
+ sample_name,
+ "ALM",
+ allele_found[allele_id],
+ allele_quality,
+ new_sseq,
+ matching_allele_length,
+ alm_dict,
+ list_alm,
+ samples_matrix_dict,
+ matching_genes_dict,
+ prodigal_report,
+ start_prodigal,
+ end_prodigal,
+ complete_predicted_seq,
+ annotation_core_dict,
+ count_alm,
+ logger,
+ ) ### start_prodigal, end_prodigal, complete_predicted_seq and prodigal_report passed as arguments to inf_asm_alm_tag for the prodigal report; temporary
# ························································································· #
# TPR if final new sequence coverage is not within thresholds for this gene in this sample #
# ························································································· #
else:
-
# Keep TPR info
- lnf_tpr_tag(core_name, sample_name, alleles_in_locus_dict, samples_matrix_dict, lnf_tpr_dict, schema_statistics, locus_alleles_path, qseqid, pident, s_length_no_gaps, new_sequence_length, '-', coverage, schema_quality, annotation_core_dict, count_tpr, logger)
+ lnf_tpr_tag(
+ core_name,
+ sample_name,
+ alleles_in_locus_dict,
+ samples_matrix_dict,
+ lnf_tpr_dict,
+ schema_statistics,
+ locus_alleles_path,
+ qseqid,
+ pident,
+ s_length_no_gaps,
+ new_sequence_length,
+ "-",
+ coverage,
+ schema_quality,
+ annotation_core_dict,
+ count_tpr,
+ logger,
+ )
# ········································ #
# ERROR if final new sequence is not found #
# ········································ #
else:
- logger.error('ERROR : Stop codon was not found for the core %s and the sample %s', core_name, sample_name)
- samples_matrix_dict[sample_name].append('ERROR not stop codon')
- if not sseqid in matching_genes_dict[sample_name] :
+ logger.error(
+ "ERROR : Stop codon was not found for the core %s and the sample %s",
+ core_name,
+ sample_name,
+ )
+ samples_matrix_dict[sample_name].append(
+ "ERROR not stop codon"
+ )
+ if sseqid not in matching_genes_dict[sample_name]:
matching_genes_dict[sample_name][sseqid] = []
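+ # each hit is recorded as [core gene, query id, start, end, strand, tag]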
- if sstart > send :
- #matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', 'ERROR'])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'-', 'ERROR'])
+ if sstart > send:
+ # matching_genes_dict[sample_name][sseqid].append([core_name, sstart, send,'-', 'ERROR'])
+ matching_genes_dict[sample_name][sseqid].append(
+ [core_name, qseqid, sstart, send, "-", "ERROR"]
+ )
else:
- #matching_genes_dict[sample_name][sseqid].append([core_name, sstart,send,'+', 'ERROR'])
- matching_genes_dict[sample_name][sseqid].append([core_name, qseqid, sstart, send,'+', 'ERROR'])
+ # matching_genes_dict[sample_name][sseqid].append([core_name, sstart,send,'+', 'ERROR'])
+ matching_genes_dict[sample_name][sseqid].append(
+ [core_name, qseqid, sstart, send, "+", "ERROR"]
+ )
# (tag counts for the plot)
- count_error[sample_name]['total'] += 1
+ count_error[sample_name]["total"] += 1
for count_class in count_error[sample_name]:
if count_class in allele_quality:
- if "no_start_stop" not in count_class and "no_start_stop" in allele_quality:
+ if (
+ "no_start_stop" not in count_class
+ and "no_start_stop" in allele_quality
+ ):
if count_class == "bad_quality":
- count_error[sample_name][count_class] += 1
+ count_error[sample_name][
+ count_class
+ ] += 1
else:
count_error[sample_name][count_class] += 1
-
## Save results and create reports
- if not save_allele_call_results (outputdir, full_gene_list, samples_matrix_dict, exact_dict, paralog_dict, inf_dict, plot_dict, matching_genes_dict, list_asm, list_alm, lnf_tpr_dict, snp_dict, match_alignment_dict, protein_dict, prodigal_report, shorter_seq_coverage, longer_seq_coverage, equal_seq_coverage, shorter_blast_seq_coverage, longer_blast_seq_coverage, equal_blast_seq_coverage, logger):
- print('There is an error while saving the allele calling results. Check the log file to get more information \n')
- # exit(0)
-
+ if not save_allele_call_results(
+ outputdir,
+ full_gene_list,
+ samples_matrix_dict,
+ exact_dict,
+ paralog_dict,
+ inf_dict,
+ plot_dict,
+ matching_genes_dict,
+ list_asm,
+ list_alm,
+ lnf_tpr_dict,
+ snp_dict,
+ match_alignment_dict,
+ protein_dict,
+ prodigal_report,
+ shorter_seq_coverage,
+ longer_seq_coverage,
+ equal_seq_coverage,
+ shorter_blast_seq_coverage,
+ longer_blast_seq_coverage,
+ equal_blast_seq_coverage,
+ logger,
+ ):
+ print(
+ "There is an error while saving the allele calling results. Check the log file to get more information \n"
+ )
+ # exit(0)
## Saving sample results plots
- if not save_allele_calling_plots (outputdir, sample_list_files, count_exact, count_inf, count_asm, count_alm, count_lnf, count_tpr, count_plot, count_niph, count_niphem, count_error, logger):
- print('There is an error while saving the allele calling results plots. Check the log file to get more information \n')
-
+ if not save_allele_calling_plots(
+ outputdir,
+ sample_list_files,
+ count_exact,
+ count_inf,
+ count_asm,
+ count_alm,
+ count_lnf,
+ count_tpr,
+ count_plot,
+ count_niph,
+ count_niphem,
+ count_error,
+ logger,
+ ):
+ print(
+ "There is an error while saving the allele calling results plots. Check the log file to get more information \n"
+ )
return True, inferred_alleles_dict, inf_dict, exact_dict
@@ -2326,8 +4127,9 @@ def allele_call_nucleotides (core_gene_list_files, sample_list_files, alleles_in
# Processing gene by gene allele calling #
# * * * * * * * * * * * * * * * * * * * #
-def processing_allele_calling (arguments) :
- '''
+
+def processing_allele_calling(arguments):
+ """
Description:
This is the main function for allele calling.
With the support of additional functions it will create the output files
@@ -2340,93 +4142,145 @@ def processing_allele_calling (arguments) :
arguments: parsed command line options for the allele calling analysis
Return:
True when the allele calling analysis completes successfully
- '''
+ """
start_time = datetime.now()
- print('Start the execution at :', start_time )
+ print("Start the execution at :", start_time)
# Open log file
- logger = open_log ('taranis_wgMLST.log')
- #print('Checking the pre-requisites.')
+ logger = open_log("taranis_wgMLST.log")
+ # print('Checking the pre-requisites.')
############################################################
## Check additional programs are installed in your system ##
############################################################
- #pre_requisites_list = [['blastp', '2.9'], ['makeblastdb', '2.9']]
- #if not check_prerequisites (pre_requisites_list, logger):
+ # pre_requisites_list = [['blastp', '2.9'], ['makeblastdb', '2.9']]
+ # if not check_prerequisites (pre_requisites_list, logger):
# print ('your system does not fulfill the pre-requistes to run the script ')
# exit(0)
######################################################
## Check that given directories contain fasta files ##
######################################################
- print('Validating schema fasta files in ' , arguments.coregenedir , '\n')
+ print("Validating schema fasta files in ", arguments.coregenedir, "\n")
valid_core_gene_files = get_fasta_file_list(arguments.coregenedir, logger)
- if not valid_core_gene_files :
- print ('There are not valid fasta files in ', arguments.coregenedir , ' directory. Check log file for more information ')
+ if not valid_core_gene_files:
+ print(
+ "There are not valid fasta files in ",
+ arguments.coregenedir,
+ " directory. Check log file for more information ",
+ )
exit(0)
- print('Validating reference alleles fasta files in ' , arguments.refalleles , '\n')
+ print("Validating reference alleles fasta files in ", arguments.refalleles, "\n")
valid_reference_alleles_files = get_fasta_file_list(arguments.refalleles, logger)
- if not valid_reference_alleles_files :
- print ('There are not valid reference alleles fasta files in ', arguments.refalleles, ' directory. Check log file for more information ')
+ if not valid_reference_alleles_files:
+ print(
+ "There are not valid reference alleles fasta files in ",
+ arguments.refalleles,
+ " directory. Check log file for more information ",
+ )
exit(0)
- print('Validating sample fasta files in ' , arguments.inputdir , '\n')
+ print("Validating sample fasta files in ", arguments.inputdir, "\n")
valid_sample_files = get_fasta_file_list(arguments.inputdir, logger)
- if not valid_sample_files :
- print ('There are not valid fasta files in ', arguments.inputdir , ' directory. Check log file for more information ')
+ if not valid_sample_files:
+ print(
+ "There are not valid fasta files in ",
+ arguments.inputdir,
+ " directory. Check log file for more information ",
+ )
exit(0)
#################################
## Prepare the coreMLST schema ##
#################################
- tmp_core_gene_dir = os.path.join(arguments.outputdir,'tmp','cgMLST')
+ tmp_core_gene_dir = os.path.join(arguments.outputdir, "tmp", "cgMLST")
try:
os.makedirs(tmp_core_gene_dir)
except:
- logger.info('Deleting the temporary directory for a previous execution without cleaning up')
- shutil.rmtree(os.path.join(arguments.outputdir, 'tmp'))
+ logger.info(
+ "Deleting the temporary directory for a previous execution without cleaning up"
+ )
+ shutil.rmtree(os.path.join(arguments.outputdir, "tmp"))
try:
os.makedirs(tmp_core_gene_dir)
- logger.info ('Temporary folder %s has been created again', tmp_core_gene_dir)
+ logger.info(
+ "Temporary folder %s has been created again", tmp_core_gene_dir
+ )
except:
- logger.info('Unable to create again the temporary directory %s', tmp_core_gene_dir)
- print('Cannot create temporary directory on ', tmp_core_gene_dir)
+ logger.info(
+ "Unable to create again the temporary directory %s", tmp_core_gene_dir
+ )
+ print("Cannot create temporary directory on ", tmp_core_gene_dir)
exit(0)
- alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.genus, arguments.species, str(arguments.usegenus).lower(), logger)
- #alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.outputdir, logger)
+ (
+ alleles_in_locus_dict,
+ annotation_core_dict,
+ schema_variability,
+ schema_statistics,
+ schema_quality,
+ ) = prepare_core_gene(
+ valid_core_gene_files,
+ tmp_core_gene_dir,
+ arguments.refalleles,
+ arguments.genus,
+ arguments.species,
+ str(arguments.usegenus).lower(),
+ logger,
+ )
+ # alleles_in_locus_dict, annotation_core_dict, schema_variability, schema_statistics, schema_quality = prepare_core_gene (valid_core_gene_files, tmp_core_gene_dir, arguments.refalleles, arguments.outputdir, logger)
if not alleles_in_locus_dict:
- print('There is an error while processing the schema preparation phase. Check the log file to get more information \n')
- logger.info('Deleting the temporary directory to clean up the temporary files created')
- shutil.rmtree(os.path.join(arguments.outputdir, 'tmp'))
+ print(
+ "There is an error while processing the schema preparation phase. Check the log file to get more information \n"
+ )
+ logger.info(
+ "Deleting the temporary directory to clean up the temporary files created"
+ )
+ shutil.rmtree(os.path.join(arguments.outputdir, "tmp"))
exit(0)
###############################
## Prepare the samples files ##
###############################
- tmp_samples_dir = os.path.join(arguments.outputdir,'tmp','samples')
+ tmp_samples_dir = os.path.join(arguments.outputdir, "tmp", "samples")
try:
os.makedirs(tmp_samples_dir)
except:
- logger.info('Deleting the temporary directory for a previous execution without properly cleaning up')
+ logger.info(
+ "Deleting the temporary directory for a previous execution without properly cleaning up"
+ )
shutil.rmtree(tmp_samples_dir)
try:
os.makedirs(tmp_samples_dir)
- logger.info('Temporary folder %s has been created again', tmp_samples_dir)
+ logger.info("Temporary folder %s has been created again", tmp_samples_dir)
except:
- logger.info('Unable to create again the temporary directory %s', tmp_samples_dir)
- shutil.rmtree(os.path.join(arguments.outputdir, 'tmp'))
- logger.info('Cleaned up temporary directory ', )
- print('Cannot create temporary directory on ', tmp_samples_dir, 'Check the log file to get more information \n')
+ logger.info(
+ "Unable to create again the temporary directory %s", tmp_samples_dir
+ )
+ shutil.rmtree(os.path.join(arguments.outputdir, "tmp"))
+ logger.info("Cleaned up temporary directory")
+ print(
+ "Cannot create temporary directory on ",
+ tmp_samples_dir,
+ "Check the log file to get more information \n",
+ )
exit(0)
- contigs_in_sample_dict = prepare_samples(valid_sample_files, tmp_samples_dir, arguments.refgenome, logger)
- if not contigs_in_sample_dict :
- print('There is an error while processing the saving temporary files. Check the log file to get more information \n')
- logger.info('Deleting the temporary directory to clean up the temporary files created')
- shutil.rmtree(os.path.join(arguments.outputdir, 'tmp'))
+ contigs_in_sample_dict = prepare_samples(
+ valid_sample_files, tmp_samples_dir, arguments.refgenome, logger
+ )
+ if not contigs_in_sample_dict:
+ print(
+ "There is an error while processing the saving temporary files. Check the log file to get more information \n"
+ )
+ logger.info(
+ "Deleting the temporary directory to clean up the temporary files created"
+ )
+ shutil.rmtree(os.path.join(arguments.outputdir, "tmp"))
exit(0)
##################################
@@ -2434,51 +4288,126 @@ def processing_allele_calling (arguments) :
##################################
query_directory = arguments.coregenedir
reference_alleles_directory = arguments.refalleles
- blast_db_directory = os.path.join(tmp_samples_dir,'blastdb')
- prodigal_directory = os.path.join(tmp_samples_dir,'prodigal')
- blast_results_seq_directory = os.path.join(tmp_samples_dir,'blast_results', 'blast_results_seq') ### path a directorio donde guardar secuencias encontradas tras blast con alelo de referencia
- blast_results_db_directory = os.path.join(tmp_samples_dir,'blast_results', 'blast_results_db') ### path a directorio donde guardar db de secuencias encontradas tras blast con alelo de referencia
-
- complete_allele_call, inferred_alleles_dict, inf_dict, exact_dict = allele_call_nucleotides(valid_core_gene_files, valid_sample_files, alleles_in_locus_dict, contigs_in_sample_dict, query_directory, reference_alleles_directory, blast_db_directory, prodigal_directory, blast_results_seq_directory, blast_results_db_directory, arguments.inputdir, arguments.outputdir, int(arguments.cpus), arguments.percentlength, arguments.coverage, float(arguments.evalue), int(arguments.perc_identity_ref), int(arguments.perc_identity_loc), int(arguments.reward), int(arguments.penalty), int(arguments.gapopen), int(arguments.gapextend), int(arguments.max_target_seqs), int(arguments.max_hsps), int(arguments.num_threads), int(arguments.flankingnts), schema_variability, schema_statistics, schema_quality, annotation_core_dict, arguments.profile, logger)
+ blast_db_directory = os.path.join(tmp_samples_dir, "blastdb")
+ prodigal_directory = os.path.join(tmp_samples_dir, "prodigal")
+ blast_results_seq_directory = os.path.join(
+ tmp_samples_dir, "blast_results", "blast_results_seq"
+ ) ### path to the directory where the sequences found by BLAST against the reference allele are stored
+ blast_results_db_directory = os.path.join(
+ tmp_samples_dir, "blast_results", "blast_results_db"
+ ) ### path to the directory where the DB of the sequences found by BLAST against the reference allele is stored
+
+ (
+ complete_allele_call,
+ inferred_alleles_dict,
+ inf_dict,
+ exact_dict,
+ ) = allele_call_nucleotides(
+ valid_core_gene_files,
+ valid_sample_files,
+ alleles_in_locus_dict,
+ contigs_in_sample_dict,
+ query_directory,
+ reference_alleles_directory,
+ blast_db_directory,
+ prodigal_directory,
+ blast_results_seq_directory,
+ blast_results_db_directory,
+ arguments.inputdir,
+ arguments.outputdir,
+ int(arguments.cpus),
+ arguments.percentlength,
+ arguments.coverage,
+ float(arguments.evalue),
+ int(arguments.perc_identity_ref),
+ int(arguments.perc_identity_loc),
+ int(arguments.reward),
+ int(arguments.penalty),
+ int(arguments.gapopen),
+ int(arguments.gapextend),
+ int(arguments.max_target_seqs),
+ int(arguments.max_hsps),
+ int(arguments.num_threads),
+ int(arguments.flankingnts),
+ schema_variability,
+ schema_statistics,
+ schema_quality,
+ annotation_core_dict,
+ arguments.profile,
+ logger,
+ )
if not complete_allele_call:
- print('There is an error while processing the allele calling. Check the log file to get more information \n')
+ print(
+ "There is an error while processing the allele calling. Check the log file to get more information \n"
+ )
exit(0)
#########################################################
## Update core gene schema adding new inferred alleles ##
#########################################################
if inferred_alleles_dict:
- if str(arguments.updateschema).lower() == 'true' or str(arguments.updateschema).lower() == 'new':
- if not update_schema (str(arguments.updateschema).lower(), arguments.coregenedir, arguments.outputdir, valid_core_gene_files, inferred_alleles_dict, alleles_in_locus_dict, logger):
- print('There is an error adding new inferred alleles found to the core genes schema. Check the log file to get more information \n')
+ if (
+ str(arguments.updateschema).lower() == "true"
+ or str(arguments.updateschema).lower() == "new"
+ ):
+ if not update_schema(
+ str(arguments.updateschema).lower(),
+ arguments.coregenedir,
+ arguments.outputdir,
+ valid_core_gene_files,
+ inferred_alleles_dict,
+ alleles_in_locus_dict,
+ logger,
+ ):
+ print(
+ "There is an error adding new inferred alleles found to the core genes schema. Check the log file to get more information \n"
+ )
exit(0)
- if str(arguments.profile).lower() != 'false':
-
+ if str(arguments.profile).lower() != "false":
############################
## Get ST for each sample ##
############################
- complete_ST, inf_ST = get_ST_profile(arguments.outputdir, arguments.profile, exact_dict, inf_dict, valid_core_gene_files, valid_sample_files, logger)
+ complete_ST, inf_ST = get_ST_profile(
+ arguments.outputdir,
+ arguments.profile,
+ exact_dict,
+ inf_dict,
+ valid_core_gene_files,
+ valid_sample_files,
+ logger,
+ )
if not complete_ST:
- print('There is an error while processing ST analysis. Check the log file to get more information \n')
+ print(
+ "There is an error while processing ST analysis. Check the log file to get more information \n"
+ )
exit(0)
###########################################
## Update ST profile file adding new STs ##
###########################################
- if str(arguments.updateprofile).lower() == 'true' or str(arguments.updateprofile).lower() == 'new':
+ if (
+ str(arguments.updateprofile).lower() == "true"
+ or str(arguments.updateprofile).lower() == "new"
+ ):
if len(inf_ST) > 0:
- if not update_st_profile (str(arguments.updateprofile).lower(), arguments.profile, arguments.outputdir, inf_ST, valid_core_gene_files, logger):
- print('There is an error adding new STs found to the ST profile file. Check the log file to get more information \n')
+ if not update_st_profile(
+ str(arguments.updateprofile).lower(),
+ arguments.profile,
+ arguments.outputdir,
+ inf_ST,
+ valid_core_gene_files,
+ logger,
+ ):
+ print(
+ "There is an error adding new STs found to the ST profile file. Check the log file to get more information \n"
+ )
exit(0)
- shutil.rmtree(os.path.join(arguments.outputdir, 'tmp'))
+ shutil.rmtree(os.path.join(arguments.outputdir, "tmp"))
end_time = datetime.now()
- print('completed execution at :', end_time )
+ print("completed execution at :", end_time)
return True
-
-
-
diff --git a/taranis/analyze_schema.py b/taranis/analyze_schema.py
index cf218bf..c17fd99 100644
--- a/taranis/analyze_schema.py
+++ b/taranis/analyze_schema.py
@@ -4,9 +4,12 @@
import rich.console
import statistics
from pathlib import Path
+import Bio.Data.CodonTable
from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
+
+# from Bio.SeqRecord import SeqRecord
+from collections import OrderedDict
import taranis.utils
@@ -43,66 +46,89 @@ def __init__(
self.species = species
self.usegenus = usegenus
- def check_allele_quality(self):
- a_quality = {}
+ def check_allele_quality(self, prokka_annotation):
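+ # prokka_annotation maps each allele id to the CDS annotation parsed
+ # from the prokka .gff output in analyze_allele_in_schema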
+ a_quality = OrderedDict()
allele_seq = {}
bad_quality_record = []
with open(self.schema_allele) as fh:
for record in SeqIO.parse(self.schema_allele, "fasta"):
- a_quality[record.id] = {"quality": "Good quality", "reason": "-"}
+ try:
+ prokka_ann = prokka_annotation[record.id]
+ except KeyError:
+ prokka_ann = "Not found in prokka"
+ a_quality[record.id] = {
+ "allele_name": self.allele_name,
+ "quality": "Good quality",
+ "reason": "-",
+ "direction": "forward",
+ "start_codon_alt": "standard",
+ "protein_seq": "",
+ "cds_coding": prokka_ann,
+ }
allele_seq[record.id] = str(record.seq)
- a_quality[record.id]["length"] = len(str(record.seq))
- if len(record.seq) % 3 != 0:
- a_quality[record.id]["quality"] = "Bad quality"
- a_quality[record.id]["reason"] = "Can not be converted to protein"
- a_quality[record.id]["order"] = "-"
- else:
- sequence_order = taranis.utils.check_sequence_order(str(record.seq))
- if sequence_order == "Error":
- a_quality[record.id]["quality"] = "Bad quality"
- a_quality[record.id]["reason"] = "Start or end codon not found"
- a_quality[record.id]["order"] = "-"
- elif sequence_order == "reverse":
- record_sequence = str(record.seq.reverse_complement())
+ a_quality[record.id]["length"] = str(len(str(record.seq)))
+ a_quality[record.id]["dna_seq"] = str(record.seq)
+ sequence_direction = taranis.utils.get_seq_direction(str(record.seq))
+
+ if sequence_direction == "reverse":
+ record.seq = record.seq.reverse_complement()
+ a_quality[record.id]["direction"] = sequence_direction
+ elif sequence_direction == "Error":
+ a_quality[record.id]["direction"] = "-"
+ try:
+ a_quality[record.id]["protein_seq"] = str(
+ record.seq.translate(table=1, cds=True)
+ )
+
+ except Bio.Data.CodonTable.TranslationError as e:
+ if "not a start codon" in str(e):
+ try:
+ # Check if sequence has an alternative start codon
+ # for protein coding
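+ # (table=2 accepts alternative initiators such as ATA and
+ # GTG that the standard table=1 rejects)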
+ a_quality[record.id]["protein_seq"] = str(
+ record.seq.translate(table=2, cds=True)
+ )
+ a_quality[record.id]["start_codon_alt"] = "alternative"
+ except Bio.Data.CodonTable.TranslationError as e_2:
+ if "stop" in str(e_2):
+ a_quality[record.id]["reason"] = str(e_2).replace(
+ "'", ""
+ )
+ else:
+ a_quality[record.id]["reason"] = str(e).replace("'", "")
+ a_quality[record.id]["quality"] = "Bad quality"
else:
- record_sequence = str(record.seq)
- a_quality[record.id]["order"] = sequence_order
- if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
- a_quality[record.id]["reason"] = "Start codon not found"
- elif record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD:
- a_quality[record.id]["quality"] = "Bad quality"
- a_quality[record.id]["reason"] = "Stop codon not found"
-
- elif taranis.utils.find_multiple_stop_codons(record_sequence):
- a_quality[record.id]["quality"] = "Bad quality"
- a_quality[record.id]["reason"] = "Multiple stop codons found"
+ a_quality[record.id]["reason"] = str(e).replace("'", "")
+
if (
self.remove_no_cds
and a_quality[record.id]["quality"] == "Bad quality"
):
bad_quality_record.append(record.id)
- if self.remove_duplicated:
- # get the unique sequences and compare the length with all sequences
- unique_seq = list(set(list(allele_seq.values())))
- if len(unique_seq) < len(allele_seq):
- tmp_dict = {}
- for rec_id, seq_value in allele_seq.items():
- if seq_value not in tmp_dict:
- tmp_dict[seq_value] = 0
- else:
- bad_quality_record.append(rec_id)
- a_quality[rec_id]["quality"] ="Bad quality"
- a_quality[rec_id]["reason"] ="Duplicate allele"
- if self.remove_subset:
- unique_seq = list(set(list(allele_seq.values())))
+ # check if there are duplicated alleles
+ # get the unique sequences and compare the length with all sequences
+ unique_seq = list(set(list(allele_seq.values())))
+ if len(unique_seq) < len(allele_seq):
+ tmp_dict = {}
for rec_id, seq_value in allele_seq.items():
- unique_seq.remove(seq_value)
- if seq_value in unique_seq:
+ if seq_value not in tmp_dict:
+ tmp_dict[seq_value] = 0
+ else:
+ a_quality[rec_id]["quality"] = "Bad quality"
+ a_quality[rec_id]["reason"] = "Duplicate allele"
+ if self.remove_duplicated:
+ bad_quality_record.append(rec_id)
+
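+ # remove this allele's own entry from unique_seq; if its sequence is
+ # still present it also occurs in another allele, so it is flagged as
+ # a subset allele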
+ for rec_id, seq_value in allele_seq.items():
+ unique_seq.remove(seq_value)
+ if seq_value in unique_seq:
+ a_quality[rec_id]["quality"] = "Bad quality"
+ a_quality[rec_id]["reason"] = "Sub set allele"
+ if self.remove_subset:
bad_quality_record.append(rec_id)
- a_quality[rec_id]["quality"] ="Bad quality"
- a_quality[rec_id]["reason"] ="Sub set allele"
+
new_schema_folder = os.path.join(self.output, "new_schema")
_ = taranis.utils.create_new_folder(new_schema_folder)
new_schema_file = os.path.join(new_schema_folder, self.allele_name + ".fasta")
@@ -116,22 +142,38 @@ def check_allele_quality(self):
SeqIO.write(record, fo, "fasta")
# update the schema allele with the new file
self.schema_allele = new_schema_file
+
+ """
+ if self.output_allele_annot:
+ # dump allele annotation to file
+ ann_heading = ["gene", "allele", "allele direction","nucleotide sequence", "protein sequence", "nucleotide sequence length", "star codon", "CDS coding", "allele quality", "bad quality reason" ]
+ ann_fields = ["direction", "dna_seq", "protein_seq", "length", "start_codon_alt","cds_coding", "quality", "reason"]
+ f_name = os.path.join(self.output, self.allele_name +"_allele_annotation.csv")
+ with open (f_name, "w") as fo:
+ fo.write(",".join(ann_heading) + "\n")
+ for allele in a_quality.keys():
+ data_field = [a_quality[allele][field] for field in ann_fields]
+ fo.write(self.allele_name + "," + allele + "," + ",".join(data_field) + "\n")
+ """
+
return a_quality
def fetch_statistics_from_alleles(self, a_quality):
- possible_bad_quality = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]
+ # POSIBLE_BAD_QUALITY = ["not a start codon", "not a stop codon", "Extra in frame stop codon", "is not a multiple of three", "Duplicate allele", "Sub set allele"]
record_data = {}
bad_quality_reason = {}
a_length = []
bad_quality_counter = 0
for record_id in a_quality.keys():
record_data["allele_name"] = self.allele_name
- a_length.append(a_quality[record_id]["length"])
+ a_length.append(int(a_quality[record_id]["length"]))
if a_quality[record_id]["quality"] == "Bad quality":
bad_quality_counter += 1
- bad_quality_reason[a_quality[record_id]["reason"]] = (
- bad_quality_reason.get(a_quality[record_id]["reason"], 0) + 1
- )
+ for reason in taranis.utils.POSIBLE_BAD_QUALITY:
+ if reason in a_quality[record_id]["reason"]:
+ bad_quality_reason[reason] = (
+ bad_quality_reason.get(reason, 0) + 1
+ )
total_alleles = len(a_length)
record_data["min_length"] = min(a_length)
record_data["max_length"] = max(a_length)
@@ -140,25 +182,26 @@ def fetch_statistics_from_alleles(self, a_quality):
record_data["good_percent"] = round(
100 * (total_alleles - bad_quality_counter) / total_alleles, 2
)
- for item in possible_bad_quality:
- record_data[item] = bad_quality_reason[item] if item in bad_quality_reason else 0
- # record_data["bad_quality_reason"] = bad_quality_reason
+ for item in taranis.utils.POSIBLE_BAD_QUALITY:
+ record_data[item] = (
+ bad_quality_reason[item] if item in bad_quality_reason else 0
+ )
+
return record_data
def analyze_allele_in_schema(self):
allele_data = {}
- # Perform quality
- a_quality = self.check_allele_quality()
# run annotations
prokka_folder = os.path.join(self.output, "prokka", self.allele_name)
anotation_files = taranis.utils.create_annotation_files(
self.schema_allele, prokka_folder, self.allele_name
)
- allele_data["annotation_gene"] = taranis.utils.read_annotation_file(
- anotation_files + ".tsv", self.allele_name
- ).get(self.allele_name)
- allele_data.update(self.fetch_statistics_from_alleles(a_quality))
- return allele_data
+ prokka_annotation = taranis.utils.read_annotation_file(anotation_files + ".gff")
+
+ # Perform quality
+ a_quality = self.check_allele_quality(prokka_annotation)
+ allele_data = self.fetch_statistics_from_alleles(a_quality)
+ return [allele_data, a_quality]
def parallel_execution(
@@ -184,17 +227,30 @@ def parallel_execution(
return schema_obj.analyze_allele_in_schema()
-
-def collect_statistics(stat_data, out_folder):
+def collect_statistics(data, out_folder, output_allele_annot):
def stats_graphics(stats_folder):
- print(out_folder)
+ allele_range = [0, 300, 600, 1000, 1500]
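+ # bin edges used with pd.cut to group loci by their number of alleles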
+
graphic_folder = os.path.join(stats_folder, "graphics")
_ = taranis.utils.create_new_folder(graphic_folder)
# create graphic for alleles/number of genes
- genes_alleles_df = stats_df["num_alleles"].value_counts().rename_axis("alleles").sort_index().reset_index(name="genes")
- _ = taranis.utils.create_graphic(graphic_folder, "num_genes_per_allele.png", "lines", genes_alleles_df["alleles"].to_list(), genes_alleles_df["genes"].to_list(), ["Allele", "number of genes"],"title")
+ # genes_alleles_df = stats_df["num_alleles"].value_counts().rename_axis("alleles").sort_index().reset_index(name="genes")
+ group_alleles_df = stats_df.groupby(
+ pd.cut(stats_df["num_alleles"], allele_range)
+ ).count()
+ _ = taranis.utils.create_graphic(
+ graphic_folder,
+ "num_genes_per_allele.png",
+ "bar",
+ allele_range[1:],
+ group_alleles_df["num_alleles"].to_list(),
+ ["Allele", "number of genes"],
+ "title",
+ )
+ # _ = taranis.utils.create_graphic(graphic_folder, "num_genes_per_allele.png", "lines", genes_alleles_df["alleles"].to_list(), genes_alleles_df["genes"].to_list(), ["Allele", "number of genes"],"title")
# create pie graph for good quality
-
+
+ """
good_percent = [round(stats_df["good_percent"].mean(),2)]
good_percent.append(100 - good_percent[0])
labels = ["Good quality", "Bad quality"]
@@ -202,25 +258,90 @@ def stats_graphics(stats_folder):
_ = taranis.utils.create_graphic(graphic_folder, "quality_of_locus.png", "pie", good_percent, "", labels, "Quality of locus")
# create pie graph for bad quality reason. This is optional if there are
# bad quality alleles
+ """
+ sum_all_alleles = stats_df["num_alleles"].sum()
+
labels = []
values = []
for item in taranis.utils.POSIBLE_BAD_QUALITY:
labels.append(item)
values.append(stats_df[item].sum())
- if sum(values) > 0:
- _ = taranis.utils.create_graphic(graphic_folder, "bad_quality_reason.png", "pie", values, "", labels, "Bad quality reason")
- # create pie graph for not found gene name
+ labels.append("Good quality")
+ values.append(sum_all_alleles - sum(values))
+ _ = taranis.utils.create_graphic(
+ graphic_folder,
+ "quality_percent.png",
+ "pie",
+ values,
+ "",
+ labels,
+ "Quality percent",
+ )
+ # create box plot for allele length variability
+ _ = taranis.utils.create_graphic(
+ graphic_folder,
+ "allele_variability.png",
+ "box",
+ "",
+ stats_df["mean_length"].to_list(),
+ "",
+ "Allele variability",
+ )
+
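+ # each element of data is the [summary_stats, allele_quality] pair
+ # returned by analyze_allele_in_schema for one locus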
+ summary_data = []
+ a_quality = []
+ for idx in range(len(data)):
# pdb.set_trace()
- times_not_found_gene = len(stats_df[stats_df["annotation_gene"] == "Not found by Prokka"])
- if times_not_found_gene > 0:
- gene_not_found = [times_not_found_gene, len(stat_data)]
- labels = ["Not found gene name", "Number of alleles"]
- _ = taranis.utils.create_graphic(graphic_folder, "gene_not_found.png", "pie", gene_not_found, "", labels, "Quality of locus")
-
- stats_df = pd.DataFrame(stat_data)
+ summary_data.append(data[idx][0])
+ a_quality.append(data[idx][1])
+
+ stats_df = pd.DataFrame(summary_data)
+ # a_quality = data[1]
stats_folder = os.path.join(out_folder, "statistics")
_ = taranis.utils.create_new_folder(stats_folder)
_ = taranis.utils.write_data_to_file(stats_folder, "statistics.csv", stats_df)
+ # pdb.set_trace()
stats_graphics(stats_folder)
- print(stats_df)
+ if output_allele_annot:
+ # dump allele annotation to file
+ ann_heading = [
+ "gene",
+ "allele",
+ "allele direction",
+ "nucleotide sequence",
+ "protein sequence",
+ "nucleotide sequence length",
+ "star codon",
+ "CDS coding",
+ "allele quality",
+ "bad quality reason",
+ ]
+ ann_fields = [
+ "direction",
+ "dna_seq",
+ "protein_seq",
+ "length",
+ "start_codon_alt",
+ "cds_coding",
+ "quality",
+ "reason",
+ ]
+ # f_name = os.path.join(self.output, self.allele_name +"_allele_annotation.csv")
+ ann_data = ",".join(ann_heading) + "\n"
+ for gene in a_quality:
+ for allele in gene.keys():
+ data_field = [gene[allele][field] for field in ann_fields]
+ ann_data += (
+ gene[allele]["allele_name"]
+ + ","
+ + allele
+ + ","
+ + ",".join(data_field)
+ + "\n"
+ )
+
+ _ = taranis.utils.write_data_to_compress_filed(
+ out_folder, "allele_annotation.csv", ann_data
+ )
+ return
diff --git a/taranis/blast.py b/taranis/blast.py
index e4b17f0..923b640 100644
--- a/taranis/blast.py
+++ b/taranis/blast.py
@@ -17,29 +17,57 @@
)
-class Blast():
+class Blast:
def __init__(self, db_type):
self.db_type = db_type
- def create_blastdb (self, file_name, blast_dir):
+ def create_blastdb(self, file_name, blast_dir):
self.f_name = Path(file_name).stem
- db_dir = os.path.join(blast_dir,self.f_name)
+ db_dir = os.path.join(blast_dir, self.f_name)
self.out_blast_dir = os.path.join(db_dir, self.f_name)
- blast_command = ["makeblastdb" , "-in" , file_name , "-parse_seqids", "-dbtype", self.db_type, "-out" , self.out_blast_dir]
+ blast_command = [
+ "makeblastdb",
+ "-in",
+ file_name,
+ "-parse_seqids",
+ "-dbtype",
+ self.db_type,
+ "-out",
+ self.out_blast_dir,
+ ]
try:
- _ = subprocess.run(blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+ _ = subprocess.run(
+ blast_command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=True,
+ )
except Exception as e:
log.error("Unable to create blast db for %s ", self.f_name)
log.error(e)
- stderr.print(f"[red] Unable to create blast database for sample %s", self.f_name)
+ stderr.print(
+ f"[red] Unable to create blast database for sample %s", self.f_name
+ )
exit(1)
return
-
- def run_blast(self, query, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, max_target_seqs=10, max_hsps=10, num_threads=1):
+
+ def run_blast(
+ self,
+ query,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ max_target_seqs=10,
+ max_hsps=10,
+ num_threads=1,
+ ):
"""_summary_
blastn -outfmt "6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq" -query /media/lchapado/Reference_data/proyectos_isciii/taranis/documentos_antiguos/pasteur_schema/lmo0002.fasta -db /media/lchapado/Reference_data/proyectos_isciii/taranis/test/blastdb/RA-L2073_R1/RA-L2073_R1 -evalue 0.001 -penalty -2 -reward 1 -gapopen 1 -gapextend 1 -perc_identity 100 > /media/lchapado/Reference_data/proyectos_isciii/taranis/test/blast_sample_locus002.txt
-
+
Args:
query (_type_): _description_
evalue (float, optional): _description_. Defaults to 0.001.
@@ -54,14 +82,28 @@ def run_blast(self, query, evalue=0.001, perc_identity=90, reward=1, penalty=-2,
"""
blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
pdb.set_trace()
- #db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path)
- cline = NcbiblastnCommandline(db=self.out_blast_dir, evalue=evalue, perc_identity=perc_identity, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=query)
+ # db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path)
+ cline = NcbiblastnCommandline(
+ db=self.out_blast_dir,
+ evalue=evalue,
+ perc_identity=perc_identity,
+ reward=reward,
+ penalty=penalty,
+ gapopen=gapopen,
+ gapextend=gapextend,
+ outfmt=blast_parameters,
+ max_target_seqs=max_target_seqs,
+ max_hsps=max_hsps,
+ num_threads=num_threads,
+ query=query,
+ )
try:
out, _ = cline()
except Exception as e:
log.error("Unable to run blast for %s ", self.out_blast_dir)
log.error(e)
- stderr.print(f"[red] Unable to run blast for database %s", self.out_blast_dir)
+ stderr.print(
+ f"[red] Unable to run blast for database %s", self.out_blast_dir
+ )
exit(1)
return out.splitlines()
-
\ No newline at end of file
diff --git a/taranis/prediction.py b/taranis/prediction.py
index 1706853..da2a395 100644
--- a/taranis/prediction.py
+++ b/taranis/prediction.py
@@ -15,7 +15,7 @@
)
-class Prediction():
+class Prediction:
def __init__(self, genome_ref, sample_file, out_dir):
self.genome_ref = genome_ref
self.sample_file = sample_file
@@ -33,33 +33,54 @@ def __init__(self, genome_ref, sample_file, out_dir):
except OSError as e:
log.error("Cannot create %s directory", self.out_dir)
log.error(e)
- stderr.print (f"[red] Unable to create {self.out_dir} folder")
+ stderr.print(f"[red] Unable to create {self.out_dir} folder")
exit(1)
def training(self):
- prodigal_command = ["prodigal" , "-i", self.genome_ref, "-t", self.train]
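+ # prodigal "-t" writes a training file from the reference genome
+ # (or reuses an existing one); prediction() reads the same file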
+ prodigal_command = ["prodigal", "-i", self.genome_ref, "-t", self.train]
try:
- _ = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+ _ = subprocess.run(
+ prodigal_command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=True,
+ )
except Exception as e:
log.error("Unable to execute prodigal command for training")
log.error(e)
- stderr.print (f"[red] Unable to run prodigal commmand. ERROR {e} ")
+ stderr.print(f"[red] Unable to run prodigal commmand. ERROR {e} ")
exit(1)
return
-
-
def prediction(self):
-
- prodigal_command = ["prodigal" , "-i", self.sample_file , "-t", self.train, "-f", "gff", "-o", self.pred_coord, "-a", self.pred_protein, "-d", self.pred_gene]
+ prodigal_command = [
+ "prodigal",
+ "-i",
+ self.sample_file,
+ "-t",
+ self.train,
+ "-f",
+ "gff",
+ "-o",
+ self.pred_coord,
+ "-a",
+ self.pred_protein,
+ "-d",
+ self.pred_gene,
+ ]
try:
- _ = subprocess.run(prodigal_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+ _ = subprocess.run(
+ prodigal_command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=True,
+ )
except Exception as e:
log.error("Unable to execute prodigal command for training")
log.error(e)
- stderr.print (f"[red] Unable to run prodigal commmand. ERROR {e} ")
+ stderr.print(f"[red] Unable to run prodigal commmand. ERROR {e} ")
exit(1)
return
def get_sequence(self):
- return
\ No newline at end of file
+ return
diff --git a/taranis/pruebas.py b/taranis/pruebas.py
index 4481b72..0303bf2 100644
--- a/taranis/pruebas.py
+++ b/taranis/pruebas.py
@@ -25,7 +25,7 @@
locus_list = []
for line in lines:
line = line.strip()
- if line == "#Cluster 5" :
+ if line == "#Cluster 5":
if alleles_found == False:
alleles_found = True
continue
@@ -37,7 +37,9 @@
# import pdb; pdb.set_trace()
rand_locus = random.choice(locus_list)
schema_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/taranis_testing_data/listeria_testing_schema/lmo0002.fasta"
-new_schema_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/cluster_lmo0002.fasta"
+new_schema_file = (
+ "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/cluster_lmo0002.fasta"
+)
q_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/q_file.fasta"
with open(schema_file) as fh:
with open(new_schema_file, "w") as fo:
@@ -52,15 +54,39 @@
if record.id == rand_locus:
SeqIO.write(record, fo, "fasta")
break
-print ("Selected locus: " , rand_locus)
-db_name ="/media/lchapado/Reference_data/proyectos_isciii/taranis/test/testing_clster/lmo0002"
-blast_command = ['makeblastdb' , '-in' , new_schema_file , '-parse_seqids', '-dbtype', "nucl", '-out' , db_name]
-blast_result = subprocess.run(blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+print("Selected locus: ", rand_locus)
+db_name = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/testing_clster/lmo0002"
+blast_command = [
+ "makeblastdb",
+ "-in",
+ new_schema_file,
+ "-parse_seqids",
+ "-dbtype",
+ "nucl",
+ "-out",
+ db_name,
+]
+blast_result = subprocess.run(
+ blast_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+)
blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
# pdb.set_trace()
-#db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path)
-cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=q_file)
+# db=self.blast_dir, evalue=evalue, perc_identity=perc_identity_ref, reward=reward, penalty=penalty, gapopen=gapopen, gapextend=gapextend, outfmt=blast_parameters, max_target_seqs=max_target_seqs, max_hsps=max_hsps, num_threads=num_threads, query=core_reference_allele_path)
+cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=q_file,
+)
try:
out, _ = cline()
@@ -69,4 +95,4 @@
b_lines = out.splitlines()
print("longitud del cluster = ", len(locus_list))
print("numero de matches = ", len(b_lines))
-# pdb.set_trace()
\ No newline at end of file
+# pdb.set_trace()
diff --git a/taranis/reference_alleles.py b/taranis/reference_alleles.py
index db20a61..13819af 100644
--- a/taranis/reference_alleles.py
+++ b/taranis/reference_alleles.py
@@ -21,6 +21,7 @@
force_terminal=taranis.utils.rich_force_colors(),
)
+
class ReferenceAlleles:
def __init__(self, fasta_file, output):
self.fasta_file = fasta_file
@@ -45,7 +46,7 @@ def check_locus_quality(self):
# Check if multiple stop codon by translating to protein and
# comparing length
locus_prot = Seq(record.seq).translate()
- if len(locus_prot) == int(len(seq)/3):
+ if len(locus_prot) == int(len(seq) / 3):
self.locus_quality[record.id] = "good quality"
self.selected_locus[record.id] = seq
else:
@@ -59,7 +60,7 @@ def check_locus_quality(self):
# Matched reverse start codon
if s_codon_f.group(1) in STOP_CODONS_REVERSE:
locus_prot = Seq(record.seq).reverse_complement().translate()
- if len(locus_prot) == int(len(record.seq)/3):
+ if len(locus_prot) == int(len(record.seq) / 3):
self.locus_quality[record.id] = "good quality"
self.selected_locus[record.id] = seq
else:
@@ -73,92 +74,119 @@ def check_locus_quality(self):
def create_matrix_distance(self):
# f_name = os.path.basename(self.fasta_file).split('.')[0]
f_name = os.path.basename(self.fasta_file)
- mash_folder = os.path.join(self.output, "mash" )
+ mash_folder = os.path.join(self.output, "mash")
# _ = taranis.utils.write_fasta_file(mash_folder, self.selected_locus, multiple_files=True, extension=False)
# save directory to return after mash
working_dir = os.getcwd()
os.chdir(mash_folder)
# run mash sketch command
- sketch_file = "reference.msh"
+ sketch_file = "reference.msh"
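+ # "mash sketch -i" sketches each sequence in the multifasta
+ # individually, giving per-allele sketches for the distance matrix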
mash_sketch_command = ["mash", "sketch", "-i", "-o", sketch_file, f_name]
# mash sketch -i -o prueba.msh lmo0003.fasta
# mash_sketch_command += list(self.selected_locus.keys())
-
- mash_sketch_result = subprocess.run(mash_sketch_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ mash_sketch_result = subprocess.run(
+ mash_sketch_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
# Get pairwise allele sequences mash distances
# mash_distance_command = ["mash", "dist", sketch_path, sketch_path]
- mash_distance_command = ["mash", "triangle", "-i", "reference.msh"]
- mash_distance_result = subprocess.Popen(mash_distance_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ mash_distance_command = ["mash", "triangle", "-i", "reference.msh"]
+ mash_distance_result = subprocess.Popen(
+ mash_distance_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
# pdb.set_trace()
out, err = mash_distance_result.communicate()
- with open ("matrix_distance.tsv", "w") as fo:
+ with open("matrix_distance.tsv", "w") as fo:
# adding alleles to create a heading
# the heading values need not be in order; any names with the right count will do
- fo.write( "alleles\t" + "\t".join(list(self.selected_locus.keys())) + "\n")
+ fo.write("alleles\t" + "\t".join(list(self.selected_locus.keys())) + "\n")
fo.write(out.decode("UTF-8"))
import pandas as pd
+
locus_num = len(self.selected_locus)
# pdb.set_trace()
matrix_df = pd.read_csv("matrix_distance.tsv", sep="\t").fillna(value=0)
# remove the first line of the matrix that contain only the number of alleles
matrix_df = matrix_df.drop(0)
- locus_list = matrix_df.iloc[0:locus_num,0]
- matrix_np = matrix_df.iloc[:,1:].to_numpy()
+ locus_list = matrix_df.iloc[0:locus_num, 0]
+ matrix_np = matrix_df.iloc[:, 1:].to_numpy()
# mirror the lower triangle so the distance matrix becomes symmetric
t_matrix_np = matrix_np.transpose()
- matrix_np = t_matrix_np + matrix_np
+ matrix_np = t_matrix_np + matrix_np
# values_np = matrix_df.iloc[:,2].to_numpy()
-
+
# matrix_np = values_np.reshape(locus_num, locus_num)
# out = out.decode('UTF-8').split('\n')
from sklearn.cluster import AgglomerativeClustering
- clusterer = AgglomerativeClustering(n_clusters=7, metric="precomputed", linkage="average", distance_threshold=None)
+
+ clusterer = AgglomerativeClustering(
+ n_clusters=7,
+ metric="precomputed",
+ linkage="average",
+ distance_threshold=None,
+ )
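+ # cluster alleles on the precomputed mash distance matrix; the fixed
+ # n_clusters=7 appears to be an exploratory choice for this test locus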
clusters = clusterer.fit_predict(matrix_np)
# clustering = AgglomerativeClustering(affinity="precomputed").fit(matrix_np)
- mean_distance =np.mean(matrix_np, 0)
+ mean_distance = np.mean(matrix_np, 0)
std = np.std(matrix_np)
min_mean = min(mean_distance)
mean_all_alleles = np.mean(mean_distance)
- max_mean= max(mean_distance)
+ max_mean = max(mean_distance)
# find the index with the minimum mean distance
- min_mean_idx= np.where(mean_distance==float(min_mean))[0][0]
+ min_mean_idx = np.where(mean_distance == float(min_mean))[0][0]
# create fasta file with the allele
min_allele = self.selected_locus[locus_list[min_mean_idx]]
record_allele_folder = os.path.join(os.getcwd(), f_name.split(".")[0])
- min_allele_file = taranis.utils.write_fasta_file(record_allele_folder,min_allele, locus_list[min_mean_idx])
+ min_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, min_allele, locus_list[min_mean_idx]
+ )
# pdb.set_trace()
# find the index whose mean distance is closest to the overall mean
- mean_all_closser_value = taranis.utils.find_nearest_numpy_value(mean_distance, mean_all_alleles)
- mean_all_alleles_idx= np.where(mean_distance==float(mean_all_closser_value))[0][0]
+ mean_all_closser_value = taranis.utils.find_nearest_numpy_value(
+ mean_distance, mean_all_alleles
+ )
+ mean_all_alleles_idx = np.where(mean_distance == float(mean_all_closser_value))[
+ 0
+ ][0]
# create fasta file with the allele
mean_allele = self.selected_locus[locus_list[mean_all_alleles_idx]]
# record_allele_folder = os.path.join(mash_folder, f_name)
- mean_allele_file = taranis.utils.write_fasta_file(record_allele_folder,mean_allele, locus_list[mean_all_alleles_idx])
-
+ mean_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, mean_allele, locus_list[mean_all_alleles_idx]
+ )
+
# find the index with the largest mean distance
- max_mean_idx= np.where(mean_distance==float(max_mean))[0][0]
+ max_mean_idx = np.where(mean_distance == float(max_mean))[0][0]
# create fasta file with the allele
max_allele = self.selected_locus[locus_list[max_mean_idx]]
- max_allele_file = taranis.utils.write_fasta_file(record_allele_folder,max_allele, locus_list[max_mean_idx])
-
+ max_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, max_allele, locus_list[max_mean_idx]
+ )
# Pick an outlier, lmo0002_185, to inspect its distance
outlier_allele = self.selected_locus[locus_list[184]]
- outlier_allele_file = taranis.utils.write_fasta_file(record_allele_folder,outlier_allele, locus_list[184])
+ outlier_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, outlier_allele, locus_list[184]
+ )
- # elijo un segundo outlier lmo0002_95 que tiene como cluster =1
+ # pick a second outlier, lmo0002_95, which falls in cluster = 1
outlier2_allele = self.selected_locus[locus_list[95]]
- outlier2_allele_file = taranis.utils.write_fasta_file(record_allele_folder,outlier2_allele, locus_list[95])
-
- # elijo un tercer outlier lmo0002_185 que tiene como cluster =4
+ outlier2_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, outlier2_allele, locus_list[95]
+ )
+
+ # pick a third outlier, lmo0002_185, which falls in cluster = 4
outlier3_allele = self.selected_locus[locus_list[185]]
- outlier3_allele_file = taranis.utils.write_fasta_file(record_allele_folder,outlier3_allele, locus_list[185])
+ outlier3_allele_file = taranis.utils.write_fasta_file(
+ record_allele_folder, outlier3_allele, locus_list[185]
+ )
# get a list of how many times each cluster value is repeated
np.bincount(clusters)
blast_parameters = '"6 , qseqid , sseqid , pident , qlen , length , mismatch , gapopen , evalue , bitscore , sstart , send , qstart , qend , sseq , qseq"'
from Bio.Blast.Applications import NcbiblastnCommandline
+
# Create local BLAST database for all alleles in the locus
db_name = "/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/blast/locus_db"
# db_name = os.path.join("blast", 'locus_blastdb')
@@ -169,16 +197,29 @@ def create_matrix_distance(self):
# taranis.utils.create_blastdb(fasta_file, db_name, 'nucl', logger):
# locus_db_name = os.path.join(db_name, f_name[0], f_name[0])
# query_data= self.selected_locus["lmo0002_1"]
- # All alleles in locus VS reference allele chosen (centroid) BLAST
-
+ # All alleles in locus VS reference allele chosen (centroid) BLAST
+
# ref_query_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/query.fasta"
# cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=100, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=0, max_hsps=0, num_threads=4, query=ref_query_file)
- # minima distancia .
+ # minimum distance.
# min_dist_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_610"
# pdb.set_trace()
min_dist_file = os.path.join(record_allele_folder, min_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=min_dist_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=min_dist_file,
+ )
out, err = cline()
min_dist_lines = out.splitlines()
min_dist_alleles = []
@@ -187,11 +228,24 @@ def create_matrix_distance(self):
min_np = np.array(min_dist_alleles)
# pdb.set_trace()
print("matches con min distancia: ", len(min_dist_lines))
- print("Not coverage using as reference" , np.setdiff1d(locus_list, min_np))
+ print("Not coverage using as reference", np.setdiff1d(locus_list, min_np))
# mean distance; yields 133 matches
# mean_dist_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_870"
- mean_dist_file = os.path.join(record_allele_folder, mean_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=mean_dist_file)
+ mean_dist_file = os.path.join(record_allele_folder, mean_allele_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=mean_dist_file,
+ )
out, err = cline()
mean_dist_lines = out.splitlines()
mean_dist_alleles = []
@@ -199,12 +253,25 @@ def create_matrix_distance(self):
mean_dist_alleles.append(mean_dist.split("\t")[1])
mean_np = np.array(mean_dist_alleles)
print("matches con distancia media: ", len(mean_dist_lines))
- print("Not coverage using as reference" , np.setdiff1d(locus_list, mean_np))
-
- # maxima distancia,
+ print("Not coverage using as reference", np.setdiff1d(locus_list, mean_np))
+
+ # maximum distance.
# ref_query_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_216"
- max_dist_file = os.path.join(record_allele_folder, max_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=max_dist_file)
+ max_dist_file = os.path.join(record_allele_folder, max_allele_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=max_dist_file,
+ )
out, err = cline()
max_dist_lines = out.splitlines()
max_dist_alleles = []
@@ -212,12 +279,25 @@ def create_matrix_distance(self):
max_dist_alleles.append(max_dist.split("\t")[1])
max_np = np.array(max_dist_alleles)
print("matches con max distancia: ", len(max_dist_lines))
- print("Not coverage using as reference" , np.setdiff1d(locus_list, max_np))
-
- # eligiendo uno de los outliers ,
+ print("Not coverage using as reference", np.setdiff1d(locus_list, max_np))
+
+ # picking one of the outliers.
# outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183"
- outlier_file = os.path.join(record_allele_folder, outlier_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier_file)
+ outlier_file = os.path.join(record_allele_folder, outlier_allele_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=outlier_file,
+ )
out, err = cline()
outlier_lines = out.splitlines()
outlier_alleles = []
@@ -226,14 +306,27 @@ def create_matrix_distance(self):
outlier_np = np.array(outlier_alleles)
print("matches con outliers distancia: ", len(outlier_lines))
- print("Alleles added using outlier as reference" , outlier_np)
+ print("Alleles added using outlier as reference", outlier_np)
new_ref_np = np.unique(np.concatenate((min_np, outlier_np), axis=0))
print("\n", "remaining alleles ", np.setdiff1d(locus_list, new_ref_np))
- # eligiendo el segundo de los outliers ,
+ # picking the second of the outliers.
# outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183"
- outlier2_file = os.path.join(record_allele_folder, outlier2_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier2_file)
+ outlier2_file = os.path.join(record_allele_folder, outlier2_allele_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=outlier2_file,
+ )
out, err = cline()
outlier2_lines = out.splitlines()
outlier2_alleles = []
@@ -243,12 +336,29 @@ def create_matrix_distance(self):
print("matches con second outliers distance: ", len(outlier2_lines))
# print("Alleles added using second outlier as reference" , outlier2_np)
upd_new_ref_np = np.unique(np.concatenate((new_ref_np, outlier2_np), axis=0))
- print("\n", "remaining alleles after second outlier", np.setdiff1d(locus_list, upd_new_ref_np))
+ print(
+ "\n",
+ "remaining alleles after second outlier",
+ np.setdiff1d(locus_list, upd_new_ref_np),
+ )
- # eligiendo el tercero de los outliers ,
+        # choosing the third outlier
# outlier_file="/media/lchapado/Reference_data/proyectos_isciii/taranis/new_taranis_result_code/mash/lmo0002/lmo0002_183"
- outlier3_file = os.path.join(record_allele_folder, outlier3_allele_file)
- cline = NcbiblastnCommandline(db=db_name, evalue=0.001, perc_identity=90, reward=1, penalty=-2, gapopen=1, gapextend=1, outfmt=blast_parameters, max_target_seqs=1100, max_hsps=1000, num_threads=4, query=outlier3_file)
+ outlier3_file = os.path.join(record_allele_folder, outlier3_allele_file)
+ cline = NcbiblastnCommandline(
+ db=db_name,
+ evalue=0.001,
+ perc_identity=90,
+ reward=1,
+ penalty=-2,
+ gapopen=1,
+ gapextend=1,
+ outfmt=blast_parameters,
+ max_target_seqs=1100,
+ max_hsps=1000,
+ num_threads=4,
+ query=outlier3_file,
+ )
out, err = cline()
outlier3_lines = out.splitlines()
outlier3_alleles = []
@@ -257,11 +367,16 @@ def create_matrix_distance(self):
outlier3_np = np.array(outlier3_alleles)
print("matches con third outliers distance: ", len(outlier3_lines))
# print("Alleles added using second outlier as reference" , outlier2_np)
- upd2_new_ref_np = np.unique(np.concatenate((upd_new_ref_np, outlier3_np), axis=0))
- print("\n", "remaining alleles after second outlier", np.setdiff1d(locus_list, upd2_new_ref_np))
-
- print("\n Still missing " ,len( np.setdiff1d(locus_list, upd2_new_ref_np)))
+ upd2_new_ref_np = np.unique(
+ np.concatenate((upd_new_ref_np, outlier3_np), axis=0)
+ )
+        print(
+            "\n",
+            "remaining alleles after third outlier",
+            np.setdiff1d(locus_list, upd2_new_ref_np),
+        )
+ print("\n Still missing ", len(np.setdiff1d(locus_list, upd2_new_ref_np)))
pdb.set_trace()
@@ -270,13 +385,11 @@ def create_matrix_distance(self):
# X = np.array([[0, 2, 3], [2, 0, 3], [3, 3, 0]])
# clustering = AgglomerativeClustering(affinity="precomputed").fit(X)
-
def create_ref_alleles(self):
self.records = taranis.utils.read_fasta_file(self.fasta_file)
_ = self.check_locus_quality()
# pdb.set_trace()
# Prepare data to use mash to create the distance matrix
_ = self.create_matrix_distance()
-
- pass
\ No newline at end of file
+ pass
diff --git a/taranis/utils.py b/taranis/utils.py
index f41f297..bd1b09a 100644
--- a/taranis/utils.py
+++ b/taranis/utils.py
@@ -4,16 +4,17 @@
"""
import glob
-
+import io
import logging
import numpy as np
import questionary
import os
import plotly.graph_objects as go
+import re
import rich.console
import shutil
import subprocess
-
+import tarfile
+from pathlib import Path
import sys
@@ -22,8 +23,10 @@
from Bio.SeqRecord import SeqRecord
import pdb
+
log = logging.getLogger(__name__)
+
def rich_force_colors():
"""
Check if any environment variables are set to force Rich to use coloured output
@@ -35,6 +38,8 @@ def rich_force_colors():
):
return True
return None
+
+
stderr = rich.console.Console(
stderr=True,
style="dim",
@@ -42,39 +47,73 @@ def rich_force_colors():
force_terminal=rich_force_colors(),
)
-START_CODON_FORWARD= ['ATG','ATA','ATT','GTG', 'TTG']
-start_codon_reverse= ['CAT', 'TAT','AAT','CAC','CAA']
+START_CODON_FORWARD = ["ATG", "ATA", "ATT", "GTG", "TTG"]
+START_CODON_REVERSE = ["CAT", "TAT", "AAT", "CAC", "CAA"]
+
+STOP_CODON_FORWARD = ["TAA", "TAG", "TGA"]
+STOP_CODON_REVERSE = ["TTA", "CTA", "TCA"]
-STOP_CODON_FORWARD = ['TAA', 'TAG','TGA']
-stop_codon_reverse = ['TTA', 'CTA','TCA']
+POSIBLE_BAD_QUALITY = [
+ "not a start codon",
+ "not a stop codon",
+ "Extra in frame stop codon",
+ "is not a multiple of three",
+ "Duplicate allele",
+ "Sub set allele",
+]
-POSIBLE_BAD_QUALITY = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]
-def check_sequence_order(allele_sequence):
+def get_seq_direction(allele_sequence):
# check direction
- if allele_sequence[0:3] in START_CODON_FORWARD or allele_sequence[-3:] in STOP_CODON_FORWARD:
- return 'forward'
- if allele_sequence[-3:] in start_codon_reverse or allele_sequence[0:3] in stop_codon_reverse:
- return 'reverse'
+ if (
+ allele_sequence[0:3] in START_CODON_FORWARD
+ or allele_sequence[-3:] in STOP_CODON_FORWARD
+ ):
+ return "forward"
+ if (
+ allele_sequence[-3:] in START_CODON_REVERSE
+ or allele_sequence[0:3] in STOP_CODON_REVERSE
+ ):
+ return "reverse"
return "Error"
-def create_annotation_files(fasta_file, annotation_dir, prefix, genus="Genus", species="species", usegenus=False):
+
+def create_annotation_files(
+ fasta_file,
+ annotation_dir,
+ prefix,
+ genus="Genus",
+ species="species",
+ usegenus=False,
+ cpus=3,
+):
try:
- _ = subprocess.run (['prokka', fasta_file, '--force', '--outdir', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet'])
+ _ = subprocess.run(
+ [
+ "prokka",
+ fasta_file,
+ "--force",
+ "--outdir",
+ annotation_dir,
+ "--genus",
+ genus,
+ "--species",
+ species,
+ "--usegenus",
+ str(usegenus),
+ "--gcode",
+ "11",
+ "--prefix",
+ prefix,
+ "--cpus",
+ str(cpus),
+ "--quiet",
+ ]
+ )
except Exception as e:
- log.error("Unable to run prokka. Error message: %s ", e )
+ log.error("Unable to run prokka. Error message: %s ", e)
stderr.print("[red] Unable to run prokka. Given error; " + e)
sys.exit(1)
- # Check that prokka store files in the requested folder
- # if prokka results are not found in the requested folder then move from the
- # running directory to the right one
- if not folder_exists(annotation_dir):
- try:
- shutil.move(prefix, annotation_dir)
- except Exception as e:
- log.error("Unable to move prokka result folder to %s ", e )
- stderr.print("[red] Unable to move result prokka folder. Error; " + e)
- sys.exit(1)
return os.path.join(annotation_dir, prefix)
@@ -88,14 +127,19 @@ def create_new_folder(folder_name):
return
-def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title ):
+def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title):
fig = go.Figure()
# pdb.set_trace()
if mode == "lines":
fig.add_trace(go.Scatter(x=x_data, y=y_data, mode=mode, name=title))
elif mode == "pie":
fig.add_trace(go.Pie(labels=labels, values=x_data))
- fig.update_layout(title_text= title)
+ elif mode == "bar":
+ fig.add_trace(go.Bar(x=x_data, y=y_data))
+ elif mode == "box":
+ fig.add_trace(go.Box(y=y_data))
+
+ fig.update_layout(title_text=title)
fig.write_image(os.path.join(out_folder, f_name))
@@ -106,7 +150,7 @@ def get_files_in_folder(folder, extension=None):
Args:
folder (string): folder path
extension (string, optional): extension for filtering. Defaults to None.
-
+
Returns:
list: list of files which match the condition
"""
@@ -116,14 +160,16 @@ def get_files_in_folder(folder, extension=None):
sys.exit(1)
if extension is None:
extension = "*"
- folder_files = os.path.join(folder , "*." + extension)
+ folder_files = os.path.join(folder, "*." + extension)
files_in_folder = glob.glob(folder_files)
if len(files_in_folder) == 0:
- log.error("Folder %s does not have any file which the extension %s", folder, extension)
+        log.error(
+            "Folder %s does not have any file with the extension %s", folder, extension
+        )
stderr.print("[red] Folder does not have any file which match your request")
sys.exit(1)
return files_in_folder
-
+
def file_exists(file_to_check):
"""Checks if input file exists
@@ -133,28 +179,18 @@ def file_exists(file_to_check):
Returns:
boolean: True if exists
- """
+ """
if os.path.isfile(file_to_check):
return True
return False
-def find_multiple_stop_codons(seq) :
- stop_codons = ['TAA', 'TAG','TGA']
- c_index = []
- for idx in range (0, len(seq) -2, 3) :
- c_seq = seq[idx : idx + 3]
- if c_seq in stop_codons :
- c_index.append(idx)
- if len(c_index) == 1:
- return False
- return True
-
def find_nearest_numpy_value(array, value):
array = np.asarray(array)
idx = (np.abs(array - value)).argmin()
return array[idx]
+
def folder_exists(folder_to_check):
"""Checks if input folder exists
@@ -163,15 +199,17 @@ def folder_exists(folder_to_check):
Returns:
boolean: True if exists
- """
+ """
if os.path.isdir(folder_to_check):
return True
return False
+
def prompt_text(msg):
source = questionary.text(msg).unsafe_ask()
return source
+
def query_user_yes_no(question, default):
"""Query the user to choose yes or no for the query question
@@ -180,8 +218,8 @@ def query_user_yes_no(question, default):
default (string): default option to be used: yes or no
Returns:
- user select: True continue with code
- """
+ user select: True continue with code
+ """
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
@@ -204,36 +242,37 @@ def query_user_yes_no(question, default):
else:
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
-def read_annotation_file(ann_file, allele_name, only_first_line=True):
- """ example of annotation file
- locus_tag ftype length_bp gene EC_number COG product
- IEKBEMEO_00001 CDS 1344 yeeO_1 COG0534 putative FMN/FAD exporter YeeO
- IEKBEMEO_00002 CDS 1344 yeeO_2 COG0534 putative FMN/FAD exporter YeeO
+def read_annotation_file(ann_file):
+ """example of annotation file
+
+ lmo0002_782 Prodigal:002006 CDS 1 1146 . + 0 ID=OJGEGONH_00782;Name=dnaN_782;db_xref=COG:COG0592;gene=dnaN_782;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P05649;locus_tag=OJGEGONH_00782;product=Beta sliding clamp
+ lmo0002_783 Prodigal:002006 CDS 1 1146 . + 0 ID=OJGEGONH_00783;Name=dnaN_783;db_xref=COG:COG0592;gene=dnaN_783;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:P05649;locus_tag=OJGEGONH_00783;product=Beta sliding clamp
+ lmo0049_3 Prodigal:002006 CDS 1 162 . + 0 ID=CODOCEEL_00001;inference=ab initio prediction:Prodigal:002006;locus_tag=CODOCEEL_00001;product=hypothetical protein
+ lmo0049_6 Prodigal:002006 CDS 1 162 . + 0 ID=CODOCEEL_00002;inference=ab initio prediction:Prodigal:002006;locus_tag=CODOCEEL_00002;product=hypothetical protein
"""
ann_data = {}
- with open (ann_file, "r") as fh:
+ with open(ann_file, "r") as fh:
lines = fh.readlines()
- heading = lines[0].split("\t")
- locus_tag_idx = heading.index("locus_tag")
- gene_idx = heading.index("gene")
- if only_first_line:
- first_line = lines[1].split("\t")
- ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka"
- else:
- # Return all annotation lines
- for line in lines[1:]:
- s_line = line.strip().split("\t")
- allele_key = allele_name + "_" + s_line[locus_tag_idx].split("_")[1]
- ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka"
- return ann_data
+ for line in lines:
+ if "Prodigal" in line:
+ gene_match = re.search(r"(.*)[\t]Prodigal.*gene=(\w+)_.*", line)
+ if gene_match:
+ ann_data[gene_match.group(1)] = gene_match.group(2)
+ else:
+ pred_match = re.search(r"(.*)[\t]Prodigal.*product=(\w+)_.*", line)
+ if pred_match:
+ ann_data[pred_match.group(1)] = pred_match.group(2).strip()
+        # stop parsing once the embedded FASTA section of the GFF begins
+        if "fasta" in line:
+ break
+ return ann_data
def read_fasta_file(fasta_file):
return SeqIO.parse(fasta_file, "fasta")
-
+
def write_fasta_file(out_folder, seq_data, allele_name=None, f_name=None):
try:
@@ -246,22 +285,55 @@ def write_fasta_file(out_folder, seq_data, allele_name=None, f_name=None):
# use the fasta name as file name
f_name = key + ".fasta"
f_path_name = os.path.join(out_folder, f_name)
- with open (f_path_name, "w") as fo:
+ with open(f_path_name, "w") as fo:
fo.write(">" + key + "\n")
fo.write(seq)
else:
if f_name is None:
f_name = allele_name
f_path_name = os.path.join(out_folder, f_name)
- with open (f_path_name, "w") as fo:
+ with open(f_path_name, "w") as fo:
fo.write(">" + allele_name + "\n")
fo.write(seq_data)
return f_name
-def write_data_to_file(out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"):
- f_path_name = os.path.join(out_folder,f_name)
+
+def write_data_to_compress_filed(out_folder, f_name, dump_data):
+ with io.BytesIO() as buffer:
+ with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
+ # Add data to the tar archive
+ tarinfo = tarfile.TarInfo(f_name)
+            # encode the string payload and record its size in the tar header
+ data_bytes = dump_data.encode("utf-8")
+ tarinfo.size = len(data_bytes)
+ tar.addfile(tarinfo, io.BytesIO(data_bytes))
+
+ # Get the content of the in-memory tar.gz file
+ buffer.seek(0)
+ tar_data = buffer.read()
+ file_path_name = os.path.join(out_folder, Path(f_name).stem + ".tar.gz")
+ with open(file_path_name, "wb") as fo:
+ fo.write(tar_data)
+
+
+def write_data_to_file(
+ out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"
+):
+ f_path_name = os.path.join(out_folder, f_name)
if data_type == "pandas":
- data.to_csv(f_path_name, sep=",",header=include_header)
+ data.to_csv(f_path_name, sep=",", header=include_header)
return
+"""
+def find_multiple_stop_codons(seq) :
+ stop_codons = ['TAA', 'TAG','TGA']
+ c_index = []
+ for idx in range (0, len(seq) -2, 3) :
+ c_seq = seq[idx : idx + 3]
+ if c_seq in stop_codons :
+ c_index.append(idx)
+ if len(c_index) == 1:
+ return False
+ return True
+"""