
Commit

Update analyze schema with comments in previous PR. Added linting workflow

luissian authored and saramonzon committed Jan 30, 2024
1 parent d212ab4 commit 0874569
Showing 13 changed files with 3,628 additions and 1,296 deletions.
25 changes: 0 additions & 25 deletions .github/workflows/dockerhub_push_release.yml

This file was deleted.

35 changes: 35 additions & 0 deletions .github/workflows/python_lint.yml
@@ -0,0 +1,35 @@
name: python_lint

on:
  push:
    paths:
      - '**.py'
  pull_request:
    paths:
      - '**.py'

jobs:
  flake8_py3:
    runs-on: ubuntu-latest
    steps:
      - name: Setup Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.9.x
          architecture: x64
      - name: Checkout PyTorch
        uses: actions/checkout@master
      - name: Install flake8
        run: pip install flake8
      - name: Run flake8
        run: flake8 --ignore E501,W503,E203,W605

  black_lint:
    runs-on: ubuntu-latest
    steps:
      - name: Setup
        uses: actions/checkout@v2
      - name: Install black in jupyter
        run: pip install black[jupyter]
      - name: Check code lints with Black
        uses: psf/black@stable
38 changes: 0 additions & 38 deletions .github/workflows/tests.yml

This file was deleted.

2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

from setuptools import setup, find_packages

version = "2.2.0"
version = "3.0.0"

with open("README.md") as f:
readme = f.read()
33 changes: 26 additions & 7 deletions taranis/__main__.py
@@ -52,7 +52,7 @@ def run_taranis():
    )

    # stderr.print("[green] `._,._,'\n", highlight=False)
    __version__ = "2.1.0"
    __version__ = "3.0.0"
    stderr.print(
        "\n" "[grey39] Taranis version {}".format(__version__), highlight=False
    )
@@ -166,6 +166,12 @@ def taranis_cli(verbose, log_file):
    default=False,
    help="Remove no CDS alleles from the schema.",
)
@click.option(
    "--output-allele-annot/--no-output-allele-annot",
    required=False,
    default=True,
    help="Get extended annotation for all alleles in the locus.",
)
@click.option(
    "--genus",
    required=False,
@@ -184,29 +190,41 @@ def taranis_cli(verbose, log_file):
default="Genus",
help="Use genus-specific BLAST databases for Prokka schema genes annotation (needs --genus). Default is False.",
)
@click.option(
"--cpus",
required=False,
multiple=False,
type=int,
default=1,
help="Number of cpus used for execution",
)
def analyze_schema(
inputdir,
output,
remove_subset,
remove_duplicated,
remove_no_cds,
output_allele_annot,
genus,
species,
usegenus,
cpus,
):
schema_files = taranis.utils.get_files_in_folder(inputdir, "fasta")

"""
schema_analyze = {}
schema_analyze = []
for schema_file in schema_files:
schema_obj = taranis.analyze_schema.AnalyzeSchema(schema_file, output, remove_subset, remove_duplicated, remove_no_cds, genus, species, usegenus)
schema_analyze.update(schema_obj.analyze_allele_in_schema())
"""
schema_analyze.append(schema_obj.analyze_allele_in_schema())
import pdb; pdb.set_trace()
_ = taranis.analyze_schema.collect_statistics(schema_analyze, output, output_allele_annot)
sys.exit(0)
# for schema_file in schema_files:
"""
results = []
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor:
futures = [
executor.submit(
taranis.analyze_schema.parallel_execution,
@@ -224,10 +242,11 @@ def analyze_schema(
        # Collect results as they complete
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    _ = taranis.analyze_schema.collect_statistics(results, output)
    _ = taranis.analyze_schema.collect_statistics(results, output, output_allele_annot)
    finish = time.perf_counter()
    print(f"Schema analyze finish in {round((finish-start)/60, 2)} minutes")


# Reference alleles
@taranis_cli.command(help_priority=2)
@click.option(
49 changes: 33 additions & 16 deletions taranis/allele_calling.py
@@ -5,12 +5,14 @@

import taranis.utils
import taranis.blast

# import numpy
import pandas as pd
from pathlib import Path


import pdb

log = logging.getLogger(__name__)
stderr = rich.console.Console(
    stderr=True,
@@ -19,6 +21,7 @@
    force_terminal=taranis.utils.rich_force_colors(),
)


class AlleleCalling:
    def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
        self.prediction = prediction
@@ -27,9 +30,25 @@ def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
        self.ref_alleles = reference_alleles
        self.out_folder = out_folder
        self.s_name = Path(sample_file).stem
        self.blast_dir = os.path.join(out_folder,"blastdb")
        self.blast_dir = os.path.join(out_folder, "blastdb")
        self.blast_sample = os.path.join(self.blast_dir, self.s_name)
        self.blast_heading = ["qseqid", "sseqid", "pident", "qlen", "length", "mismatch", "gapopen", "evalue", "bitscore", "sstart", "send", "qstart", "qend", "sseq", "qseq"]
        self.blast_heading = [
            "qseqid",
            "sseqid",
            "pident",
            "qlen",
            "length",
            "mismatch",
            "gapopen",
            "evalue",
            "bitscore",
            "sstart",
            "send",
            "qstart",
            "qend",
            "sseq",
            "qseq",
        ]
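
The fifteen names in blast_heading correspond to a custom tabular BLAST output (format 6). The actual call lives in taranis.blast and is not part of this diff; the sketch below only illustrates, under that assumption, how the column list maps onto a blastn -outfmt string:

import subprocess

# Same column order as self.blast_heading above
BLAST_COLUMNS = [
    "qseqid", "sseqid", "pident", "qlen", "length", "mismatch", "gapopen",
    "evalue", "bitscore", "sstart", "send", "qstart", "qend", "sseq", "qseq",
]


def run_blastn(query_file, db_path, perc_identity=90):
    # Build the custom tabular format string, e.g. "6 qseqid sseqid pident ..."
    outfmt = "6 " + " ".join(BLAST_COLUMNS)
    cmd = [
        "blastn",
        "-query", query_file,
        "-db", db_path,
        "-outfmt", outfmt,
        "-perc_identity", str(perc_identity),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # One tab-separated line per hit, in the column order defined above
    return result.stdout.splitlines()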

    def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene):
        """_summary_
@@ -39,38 +58,36 @@ def assign_allele_type(self, query_seq, allele_name, sample_contig, schema_gene):
            allele_name (_type_): _description_
            sample_contig (_type_): _description_
            schema_gene (_type_): _description_
        """
        """
        s_alleles_blast = taranis.blast.Blast("nucl")
        ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles")
        query_path = os.path.join(self.out_folder, "tmp", allele_name)
        # Write to file the sequence to find out the loci name that fully match
        # Write to file the sequence to find out the loci name that fully match
        f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name)
        query_file = os.path.join(query_path, f_name)
        _ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir)
        # Blast with sample sequence to find the allele in the schema
        # Blast with sample sequence to find the allele in the schema
        seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100)
        pdb.set_trace()
        if len(seq_blast_match) >= 1:
            # allele is named as NIPHEM
            # allele is named as NIPHEM

            # Run a BLAST with this query sequence against the allele database
            # Create blast db with sample file


            pass
        elif len(seq_blast_match) == 1:
            pass
        else:
            pass
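
assign_allele_type is still a skeleton at this point: it writes the query sequence to a FASTA file, builds a BLAST database for the locus and runs a 100% identity search, but every classification branch ends in pass. A hedged sketch of the branching the comments point to; NIPHEM is taken from the comments above, the other tag names are invented for illustration, and the real decision logic is not part of this commit:

def classify_exact_matches(seq_blast_match):
    # Assumption based on the comments above: more than one exact hit -> NIPHEM
    if len(seq_blast_match) > 1:
        return "NIPHEM"
    elif len(seq_blast_match) == 1:
        # Exactly one 100% identity hit: the allele already exists in the schema
        return "EXACT"
    # No exact hit: needs further comparison (new allele, LNF, etc.)
    return "UNRESOLVED"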


    def search_alleles (self, ref_allele):
    def search_alleles(self, ref_allele):
        allele_name = Path(ref_allele).stem
        schema_gene = os.path.join(self.schema, allele_name + ".fasta")
        schema_gene = os.path.join(self.schema, allele_name + ".fasta")
        allele_name = Path(ref_allele).stem
        # run blast with sample as db and reference allele as query
        sample_blast_match = self.sample_blast.run_blast(ref_allele)
        if len(sample_blast_match) > 0 :
        if len(sample_blast_match) > 0:
            pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match])
            pd_lines.columns = self.blast_heading
            pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric)
@@ -84,16 +101,17 @@ def search_alleles (self, ref_allele):
            # sel_row = np_lines[mask, :] = np_lines[mask, :]
            # query_seq = sel_row[0,14]
            sample_contig = sel_max["sseqid"]
            abbr = self.assign_allele_type(query_seq, allele_name, sample_contig, schema_gene)
            abbr = self.assign_allele_type(
                query_seq, allele_name, sample_contig, schema_gene
            )
        else:
            # Sample does not have a reference allele to be matched
            # Keep LNF info
            # see espe's code
            #lnf_tpr_tag()
            # lnf_tpr_tag()
            pass
        pdb.set_trace()
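
search_alleles loads the tab-separated hits into a DataFrame with blast_heading as the column names, converts pident to a numeric dtype and then works from the best hit (sel_max). The lines that build sel_max and query_seq are hidden behind the fold above, so the sketch below fills that gap with an idxmax-based selection as an assumption:

import pandas as pd

# Same columns as self.blast_heading
BLAST_HEADING = [
    "qseqid", "sseqid", "pident", "qlen", "length", "mismatch", "gapopen",
    "evalue", "bitscore", "sstart", "send", "qstart", "qend", "sseq", "qseq",
]


def best_blast_hit(sample_blast_match):
    # One row per BLAST hit, columns in the custom outfmt 6 order
    pd_lines = pd.DataFrame([item.split("\t") for item in sample_blast_match])
    pd_lines.columns = BLAST_HEADING
    pd_lines["pident"] = pd_lines["pident"].apply(pd.to_numeric)
    # Keep the hit with the highest percent identity (first one on ties)
    sel_max = pd_lines.loc[pd_lines["pident"].idxmax()]
    # The commented-out sel_row[0, 14] suggests the matched sequence column;
    # whether "qseq" or "sseq" is wanted depends on which side is the sample
    query_seq = sel_max["qseq"]
    sample_contig = sel_max["sseqid"]
    return query_seq, sample_contig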


    def analyze_sample(self):
        # Create blast db with sample file
        self.sample_blast = taranis.blast.Blast("nucl")
@@ -107,4 +125,3 @@ def analyze_sample(self):

        pdb.set_trace()
        return
