initial analyze amplicons pipeline

TGenNorth · Jul 12, 2024 · dad2330 · dad2330
1 parent e98c6ce
commit dad2330
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 32 deletions.
diff --git a/q2_asap/analyzeAmplicons.py b/q2_asap/analyzeAmplicons.py
@@ -48,11 +48,20 @@ def analyzeAmplicons(sequences: CasavaOneEightSingleLanePerSampleDirFmt, name: s
     output_dir_bam = BAMSortedAndIndexedDirFmt()
     output_dir_bwa = BWAIndexDirFmt()
     output_dir_xml = ASAPXMLOutputDirFmt()
+
+
+    result = subprocess.run(['ls', '-l', f'{sequences}'], stdout=subprocess.PIPE, text=True)
+    print(result)
+    print(os.listdir(f'{sequences}'))
+    subprocess.run(['mkdir', '-p', 'sequences'], check=True)
+    shutil.copytree(f'{sequences}', 'sequences', dirs_exist_ok=True)
 
     # create analyzeAmplicons command
-    command = f'analyzeAmplicons -r {sequences} -n {name} -o {temp_dir}/asap_output -d {depth} -b {breadth} -j /scratch/nsylvester/SARS2_variant_detection.json \
+    command = f'analyzeAmplicons -r sequences -n {name} -o {temp_dir}/asap_output -d {depth} -b {breadth} -j /home/nsylvester/scratch/q2-asap/q2_asap/tests/data/SARS2_variant_detection.json \
     --min_base_qual {min_base_qual} --consensus-proportion {consensus_proportion} --fill-gaps {fill_gaps} -a {aligner} --aligner-args {aligner_args}'
 
+    print(command)
+
     # combine conda environment and command TODO: fix conda environment
     shell_script= f"""
     conda run -p /home/dlemmer/.conda/envs/asap {command}
@@ -75,14 +84,14 @@ def analyzeAmplicons(sequences: CasavaOneEightSingleLanePerSampleDirFmt, name: s
     for file_name in os.listdir(asap_output_dir):
         file_path = os.path.join(asap_output_dir, file_name)
         if re.search(r'\.(amb|ann|bwt|pac|sa|fasta)$', file_name):
-            shutil.move(file_path, Path(output_dir_bwa.path) / file_name)
+            shutil.copy(file_path, Path(output_dir_bwa.path) / file_name)
         elif re.search(r'\.xml$', file_name):
-                shutil.move(file_path, Path(output_dir_xml.path) / file_name)
+                shutil.copy(file_path, Path(output_dir_xml.path) / file_name)
 
     # move bam files
     for file_name in os.listdir(os.path.join(asap_output_dir, "bwamem")):
         file_path = os.path.join(asap_output_dir, "bwamem", file_name)
-        shutil.move(file_path, Path(output_dir_bam.path) / file_name)
+        shutil.copy(file_path, Path(output_dir_bam.path) / file_name)
 
     #  remove temp directory
     # shutil.rmtree(temp_dir)

diff --git a/q2_asap/analyzeAmplicons_pipeline.py b/q2_asap/analyzeAmplicons_pipeline.py
@@ -0,0 +1,30 @@
+#TODO: config file for bbduk/fastp/trimmer params
+#TODO: bamprocessor, outputCombiner
+def analyzeAmplicons_pipeline(ctx, sequences, ref_sequence, config_file_path):
+
+    #TODO: long string of params in json file that we will use as trimmer/aligner_params 
+    config = open(config_file_path)
+    data = json.load(config)
+
+    trimmer = data['ASAP_parameters']['trimmer']['name']
+    trimmer_action = ctx.get_action('trimmers', trimmer)
+
+    aligner_index = data['ASAP_parameters']['aligner']['index']
+    aligner_index_action = ctx.get_action('aligners', aligner_index)
+
+    aligner = data['ASAP_parameters']['aligner']['name']
+    aligner_action = ctx.get_action('aligners', aligner)
+
+    # initialize results
+    results = []
+
+    trimmer_results = trimmer_action(sequences)
+    # results.append(trimmer_results)
+
+    aligner_index_result, = aligner_index_action(ref_sequence)
+    # results.append(aligner_index_result)
+
+    aligner_result, = aligner_action(sequences = sequences, reference_index = aligner_index_result)
+    results.append(aligner_result)
+
+    return tuple(results)
diff --git a/q2_asap/plugin_setup.py b/q2_asap/plugin_setup.py
@@ -18,8 +18,11 @@
 from ._formats import ASAPXMLFormat, ASAPHTMLFormat, ASAPXMLOutputDirFmt, ASAPHTMLOutputDirFmt
 from ._types import ASAPXML, ASAPHTML
 from q2_nasp2_types.index import BWAIndex
-from q2_nasp2_types.alignment import BAM
-from q2_types.feature_data import FeatureData
+from q2_nasp2_types.alignment import BAMIndexed, SAM
+from q2_types.feature_data import FeatureData, Sequence
+from q2_asap.analyzeAmplicons_pipeline import analyzeAmplicons_pipeline
+from q2_types.bowtie2 import Bowtie2Index
+
 
 
 plugin = Plugin(
@@ -49,7 +52,7 @@
                 'aligner_args': Str
                 },
     outputs=[
-             ('output_bams', FeatureData[BAM]),
+             ('output_bams', FeatureData[BAMIndexed]),
              ('bwa_index', BWAIndex),
              ('asap_xmls', ASAPXML)
              ],
@@ -59,7 +62,7 @@
                 'depth': 'minimum read depth required to consider a position covered. [default: 100]',
                 'breadth': 'minimum breadth of coverage required to consider an amplicon as present. [default: 0.8]',
                 'min_base_qual': 'what is the minimum base quality score (BQS) to use a position (Phred scale, i.e. 10=90, 20=99, 30=99.9 accuracy',
-                'consensus_proportion': 'minimum proportion required to call at base at that position, else 'N'. [default: 0.8]',
+                'consensus_proportion': 'minimum proportion required to call at base at that position, else "N". [default: 0.8]',
                 'fill_gaps': 'fill no coverage gaps in the consensus sequence [default: False], optional parameter is the character to use for filling [defaut: n]',
                 'aligner': 'aligner to use for read mapping, supports bowtie2, novoalign, and bwa. [default: bowtie2]',
                 'aligner_args': "additional arguments to pass to the aligner, enclosed in ''."},
@@ -87,3 +90,30 @@
 )
 
 
+# register analyzeAmplicon pipeline
+plugin.pipelines.register_function(
+    function=analyzeAmplicons_pipeline,
+    inputs={
+        'sequences':  (SampleData[SequencesWithQuality] | SampleData[PairedEndSequencesWithQuality]),
+        'ref_sequence': FeatureData[Sequence],
+    },
+    parameters={'config_file_path': Str},
+    outputs=[
+        # ('trimmer_results', (SampleData[SequencesWithQuality] | SampleData[PairedEndSequencesWithQuality])),
+        # ('aligner_index_result', (BWAIndex | Bowtie2Index)),
+        ('aligner_result', FeatureData[SAM])
+    ],
+    input_descriptions={
+        'sequences': 'the sequences to be analyzed',
+        'ref_sequence': 'The reference sequence used to build bwa index.',
+    },
+    parameter_descriptions={'config_file_path': 'path to config file that holds all params'},
+    output_descriptions={
+        # 'trimmer_results': 'The results after completing trimming',
+        # 'aligner_index_result': 'The resulting aligner index',
+        'aligner_result': 'The result of aligning reads with specified aligner'
+    },
+    name='Analyze Amplicons Pipeline',
+    description=("A pipeline to run analyze amplicons")
+) 
+
diff --git a/q2_asap/tests/test_methods.py b/q2_asap/tests/test_methods.py
@@ -7,38 +7,29 @@
 # ----------------------------------------------------------------------------
 
 import pandas as pd
-import pandas.testing as pdt
 
 from qiime2.plugin.testing import TestPluginBase
 from qiime2.plugin.util import transform
-from q2_types.feature_table import BIOMV100Format
+from qiime2 import Artifact
 
-from q2_asap._methods import duplicate_table
+# from q2_asap.analyzeAmplicons_pipeline import analyzeAmplicons_pipeline
 
-
-class DuplicateTableTests(TestPluginBase):
+class TestAnalyzeAmpliconPipeline(TestPluginBase):
     package = 'q2_asap.tests'
 
-    def test_simple1(self):
-        in_table = pd.DataFrame(
-            [[1, 2, 3, 4, 5], [9, 10, 11, 12, 13]],
-            columns=['abc', 'def', 'jkl', 'mno', 'pqr'],
-            index=['sample-1', 'sample-2'])
-        observed = duplicate_table(in_table)
-
-        expected = in_table
+    def test_analyzeAmplicon_pipeline(self):
+        # access the pipeline as QIIME 2 sees it,
+        # for correct assignment of `ctx` variable
+        analyzeAmplicons_pipeline = self.plugin.pipelines['analyzeAmplicons_pipeline']
 
-        pdt.assert_frame_equal(observed, expected)
+        #import artifact for reference sequence
+        ref_sequence_artifact = Artifact.import_data('FeatureData[Sequence]', '/home/nsylvester/scratch/q2-asap/q2_asap/tests/data/wuhan_sequence.fasta')
 
-    def test_simple2(self):
-        # test table duplication with table loaded from file this time
-        # (for demonstration purposes)
-        in_table = transform(
-            self.get_data_path('table-1.biom'),
-            from_type=BIOMV100Format,
-            to_type=pd.DataFrame)
-        observed = duplicate_table(in_table)
+        # get sequences (paired-end-demux.qza)
+        sequences_artifact = Artifact.load('q2_asap/tests/data/paired-end-demux.qza')
 
-        expected = in_table
+        # run the pipeline
+        results = analyzeAmplicons_pipeline(sequences = sequences_artifact, ref_sequence = ref_sequence_artifact)
 
-        pdt.assert_frame_equal(observed, expected)
+        # assert output
+        self.assertTrue(len(results) > 0)