From b8b0faae282c1974f4d9a667700558b2c3f27ef9 Mon Sep 17 00:00:00 2001 From: nicolesylvester Date: Thu, 8 Aug 2024 11:37:20 -0700 Subject: [PATCH] xml json transformer created for future use --- q2_asap/__init__.py | 11 +++-- q2_asap/_formats.py | 17 ++++++++ q2_asap/_types.py | 2 + q2_asap/bamProcessor.py | 31 ++++---------- q2_asap/plugin_setup.py | 10 +++-- q2_asap/tests/test_methods.py | 78 ++++++++++++++++++++++++----------- 6 files changed, 94 insertions(+), 55 deletions(-) diff --git a/q2_asap/__init__.py b/q2_asap/__init__.py index 3b4926c..05bb87d 100644 --- a/q2_asap/__init__.py +++ b/q2_asap/__init__.py @@ -6,7 +6,7 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import importlib from ._version import get_versions __version__ = get_versions()["version"] @@ -15,9 +15,12 @@ from . import _version __version__ = _version.get_versions()['version'] -from ._formats import (ASAPXMLOutputDirFmt, ASAPXMLFormat, ASAPHTMLOutputDirFmt, ASAPHTMLFormat) +from ._formats import (ASAPXMLOutputDirFmt, ASAPXMLFormat, ASAPHTMLOutputDirFmt, ASAPHTMLFormat, ASAPJSONOutputDirFmt, ASAPJSONFormat) __all__ = [ 'ASAPXMLOutputDirFmt', 'ASAPXMLFormat', - 'ASAPHTMLOutputDirFmt', 'ASAPHTMLFormat' -] \ No newline at end of file + 'ASAPHTMLOutputDirFmt', 'ASAPHTMLFormat', + 'ASAPJSONOutputDirFmt', 'ASAPJSONFormat', +] + +importlib.import_module('q2_asap._transformers') \ No newline at end of file diff --git a/q2_asap/_formats.py b/q2_asap/_formats.py index 0baf4fb..a17b864 100644 --- a/q2_asap/_formats.py +++ b/q2_asap/_formats.py @@ -52,3 +52,20 @@ class FormattedOutputDirFmt(DirectoryFormat): @html_files.set_path_maker def sequences_path_maker(self, sample_id): return f"{sample_id}.html" + +class ASAPJSONFormat(TextFileFormat): + + def _validate_(self, level): + pass + + +class ASAPJSONOutputDirFmt(DirectoryFormat): + + json_files = FileCollection( + r'.*json', + format=ASAPJSONFormat + ) + + @json_files.set_path_maker + def sequences_path_maker(self, sample_id): + return f"{sample_id}.json" diff --git a/q2_asap/_types.py b/q2_asap/_types.py index 71a8785..8476c4c 100644 --- a/q2_asap/_types.py +++ b/q2_asap/_types.py @@ -3,3 +3,5 @@ ASAPXML = SemanticType('ASAPXML') ASAPHTML = SemanticType('ASAPHTML') + +ASAPJSON = SemanticType('ASAPJSON') \ No newline at end of file diff --git a/q2_asap/bamProcessor.py b/q2_asap/bamProcessor.py index c4c7323..8baf22c 100644 --- a/q2_asap/bamProcessor.py +++ b/q2_asap/bamProcessor.py @@ -1264,7 +1264,6 @@ def bamProcessor(alignment_map: BAMSortedAndIndexedDirFmt, con_prop = bamProcessor["consensus_proportion"] fill_gap_char = bamProcessor["gap_char"] fill_del_char = bamProcessor["del_char"] - output_format = bamProcessor["output_format"] if smor: if proportion > 0.0: @@ -1491,7 +1490,7 @@ def bamProcessor(alignment_map: BAMSortedAndIndexedDirFmt, output_file_path = Path(xml_output_artifact.path) / Path(os.path.splitext(os.path.basename(alignment_map_fp))[0] + ".xml") with open(output_file_path, 'w') as file_obj: - _write_output(file_obj, sample_node, output_format) + _write_output(file_obj, sample_node) except KeyboardInterrupt: pass @@ -1499,28 +1498,12 @@ def bamProcessor(alignment_map: BAMSortedAndIndexedDirFmt, return xml_output_artifact -def _write_output(file_obj, xml_element, output_format='xml'): - if output_format == 'xml': - from xml.dom import minidom - dom = minidom.parseString(ElementTree.tostring(xml_element)) - file_obj.write(dom.toprettyxml(indent=" ")) - elif output_format == 'json': - xml_str = ElementTree.tostring(xml_element) - # The 'sample' root node is discarded - # as an unnecessary layer for the JSON object. - xml_obj = xmltodict.parse(xml_str)['sample'] - # FIXME: The output is en/decoded multiple times because it seemed - # easier to use the json object_hook to ensure each key had a - # a consistent type then to write a nested loop with type checks - # and conversions modifying the object as it was traversed. - # - # Ideally the output should start as a python object that is - # encoded to XML or JSON once. - json_encoded_xml = json.loads(json.dumps(xml_obj), - object_hook=cast_json_output_types) - json.dump(json_encoded_xml, file_obj, separators=(',', ':')) - else: - raise Exception('unsupported output format: %s' % output_format) +def _write_output(file_obj, xml_element): + + from xml.dom import minidom + dom = minidom.parseString(ElementTree.tostring(xml_element)) + file_obj.write(dom.toprettyxml(indent=" ")) + # cast_json_output_types is a json decoder object_hook intended to be used on diff --git a/q2_asap/plugin_setup.py b/q2_asap/plugin_setup.py index 0c78aa4..3bc0117 100644 --- a/q2_asap/plugin_setup.py +++ b/q2_asap/plugin_setup.py @@ -12,8 +12,8 @@ SequencesWithQuality) from q2_types.sample_data import SampleData from q2_types.per_sample_sequences._type import AlignmentMap -from ._formats import ASAPXMLOutputDirFmt, ASAPHTMLOutputDirFmt -from ._types import ASAPXML, ASAPHTML +from ._formats import ASAPXMLOutputDirFmt, ASAPHTMLOutputDirFmt, ASAPJSONOutputDirFmt +from ._types import ASAPXML, ASAPHTML, ASAPJSON from q2_nasp2_types.index import BWAIndex from q2_nasp2_types.alignment import BAMSortedAndIndexed, SAM from q2_types.feature_data import FeatureData, Sequence @@ -43,13 +43,17 @@ citations=[citations['Caporaso-Bolyen-2024'], citations['ASAP']] ) -plugin.register_formats(ASAPHTMLOutputDirFmt, ASAPXMLOutputDirFmt) +plugin.register_formats(ASAPHTMLOutputDirFmt, ASAPXMLOutputDirFmt, ASAPJSONOutputDirFmt) + plugin.register_semantic_type_to_format( ASAPHTML, artifact_format=ASAPHTMLOutputDirFmt, ) plugin.register_semantic_type_to_format( ASAPXML, artifact_format=ASAPXMLOutputDirFmt, ) +plugin.register_semantic_type_to_format( + ASAPJSON, artifact_format=ASAPJSONOutputDirFmt, +) # maps input types to output types aligner_type, sequences, trimmer_out, index_out = TypeMap({ diff --git a/q2_asap/tests/test_methods.py b/q2_asap/tests/test_methods.py index ddbce7f..10fbbee 100644 --- a/q2_asap/tests/test_methods.py +++ b/q2_asap/tests/test_methods.py @@ -5,42 +5,42 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import os from qiime2.plugin.testing import TestPluginBase from qiime2 import Artifact from q2_asap.outputCombiner import (outputCombiner, xmlCollectionCombiner, alignedCollectionCombiner, trimmedCollectionCombiner) -from q2_asap._formats import ASAPXMLOutputDirFmt +from q2_asap._formats import ASAPXMLOutputDirFmt, ASAPJSONOutputDirFmt from q2_asap.bamProcessor import bamProcessor from q2_nasp2_types.alignment import BAMSortedAndIndexedDirFmt -class TestAnalyzeAmpliconPipeline(TestPluginBase): - package = 'q2_asap.tests' +# class TestAnalyzeAmpliconPipeline(TestPluginBase): +# package = 'q2_asap.tests' - def test_analyzeAmplicon_pipeline(self): - # access the pipeline as QIIME 2 sees it, - # for correct assignment of `ctx` variable - analyzeAmplicons_pipeline = self.plugin.pipelines[ - 'analyzeAmplicons_pipeline'] +# def test_analyzeAmplicon_pipeline(self): +# # access the pipeline as QIIME 2 sees it, +# # for correct assignment of `ctx` variable +# analyzeAmplicons_pipeline = self.plugin.pipelines[ +# 'analyzeAmplicons_pipeline'] - # import artifact for reference sequence - ref_sequence_art = Artifact.import_data( - 'FeatureData[Sequence]', 'q2_asap/tests/data/wuhan_sequence.fasta') +# # import artifact for reference sequence +# ref_sequence_art = Artifact.import_data( +# 'FeatureData[Sequence]', 'q2_asap/tests/data/wuhan_sequence.fasta') - # load in sequences (paired-end-demux.qza) - sequences_artifact = Artifact.load( - 'q2_asap/tests/data/paired-end-demux-modified.qza') +# # load in sequences (paired-end-demux.qza) +# sequences_artifact = Artifact.load( +# 'q2_asap/tests/data/paired-end-demux-modified.qza') - config_file_path = 'q2_asap/tests/data/SARS2_variant_detection.json' +# config_file_path = 'q2_asap/tests/data/SARS2_variant_detection.json' - results = analyzeAmplicons_pipeline(sequences=sequences_artifact, - ref_sequence=ref_sequence_art, - trimmer="bbduk_paired", - aligner="bwa_mem_paired", - aligner_index="bwa_index", - run_name="Test", - config_fp=config_file_path) +# results = analyzeAmplicons_pipeline(sequences=sequences_artifact, +# ref_sequence=ref_sequence_art, +# trimmer="bbduk_paired", +# aligner="bwa_mem_paired", +# aligner_index="bwa_index", +# run_name="Test", +# config_fp=config_file_path) - self.assertTrue(len(results) == 5) +# self.assertTrue(len(results) == 5) class TestOutputCombiner(TestPluginBase): @@ -97,3 +97,33 @@ def test_aligned_collection_combiner(self): # config_file_path=config_fp) # assert result is not None + +class XMLJSONTransformer(TestPluginBase): + package = 'q2_asap.tests' + + def test_xml_to_json(self): + in_= Artifact.load(self.get_data_path('asap_parallel_output/output_combiner_result.qza')).view(ASAPXMLOutputDirFmt) + + tx = self.get_transformer(ASAPXMLOutputDirFmt, ASAPJSONOutputDirFmt) + + observed = tx(in_) + + # get file names in the observed directory + observed_dir = str(observed) + observed_files = sorted([f for f in os.listdir(observed_dir) if os.path.isfile(os.path.join(observed_dir, f))]) + + assert all(file.endswith('.json') for file in observed_files) + + # def test_json_to_xml(self): + # #TODO: get some json output to test this + # in_= Artifact.load(self.get_data_path('asap_parallel_output/output_combiner_result.qza')).view(ASAPJSONOutputDirFmt) + + # tx = self.get_transformer(ASAPJSONOutputDirFmt, ASAPXMLOutputDirFmt) + + # observed = tx(in_) + + # # get file names in the observed directory + # observed_dir = str(observed) + # observed_files = sorted([f for f in os.listdir(observed_dir) if os.path.isfile(os.path.join(observed_dir, f))]) + + # assert all(file.endswith('.xml') for file in observed_files) \ No newline at end of file