Merge pull request #24 from aradhakrishnanGFDL/16-pythoncalls
16 pythoncalls
aradhakrishnanGFDL authored Aug 1, 2024
2 parents bcb2d17 + c23e862 commit c949d6b
Showing 9 changed files with 138 additions and 47 deletions.
8 changes: 0 additions & 8 deletions .github/workflows/conda-env-create-run-pytest.yml
@@ -29,14 +29,6 @@ jobs:
        # install catalogbuilder to conda env directories
        $CONDA/envs/catalogbuilder/bin/python -m pip install --prefix $CONDA/envs/catalogbuilder .
    - name: Run pytest in catalogbuilder conda environment
      run: |
        which python
        python --version
        $CONDA/envs/catalogbuilder/bin/python --version
        # which pytest
        $CONDA/envs/catalogbuilder/bin/pytest catalogbuilder
    - name: Make sample data
      run: |
        which python
35 changes: 35 additions & 0 deletions .github/workflows/conda-pkg-extra-tests.yml
@@ -0,0 +1,35 @@
name: conda-pkg-extra-tests
on:
  pull_request:
    branches:
      # for testing conda build w no upload during PRs
      - main
jobs:
  build:
    runs-on: ubuntu-latest
    container:
      image: continuumio/miniconda3:latest
    steps:
    - name: Checkout Files
      uses: actions/checkout@v4
    - name: Run Docker to Build
      run: |
        conda config --append channels conda-forge
        conda config --append channels noaa-gfdl
        conda install conda-build conda-verify
        conda build .
    - name: Run additional utilities as tests
      run: |
        conda create --name catalogbuildertest
        conda install -n catalogbuildertest catalogbuilder --use-local
        /opt/conda/envs/catalogbuildertest/bin/pytest catalogbuilder/tests/test_create_catalog.py
        #we will save the output from following alone as manifest
    - name: upload-artifacts
      uses: actions/upload-artifact@v4
      with:
        name: workflow-artifacts1
        path: |
          sample-mdtf-catalog.csv
          sample-mdtf-catalog.json
    - name: Download all workflow run artifacts
      uses: actions/download-artifact@v4
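Note: this new workflow builds the conda package locally and then runs the packaged test module with the environment's own pytest binary. A rough local equivalent, sketched below under the assumption that catalogbuilder and pytest are already installed in the active Python environment and that you run it from the repository root, is to invoke the same test file through pytest's Python API.

```python
# Sketch only: run the same test module the CI step targets. Assumes catalogbuilder
# and pytest are installed in the current environment and the repo is checked out.
import pytest

exit_code = pytest.main(["catalogbuilder/tests/test_create_catalog.py", "-v"])
raise SystemExit(int(exit_code))
```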
1 change: 0 additions & 1 deletion README.md
@@ -4,5 +4,4 @@ Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)]

See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/).


This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
43 changes: 41 additions & 2 deletions catalogbuilder/scripts/configs/config-example.yml
@@ -1,2 +1,41 @@
input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply put NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply put NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path: "sample-mdtf-catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
38 changes: 20 additions & 18 deletions catalogbuilder/scripts/gen_intake_gfdl.py
@@ -29,23 +29,8 @@
package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')

#Setting up argument parsing/flags
@click.command()
#TODO arguments don't have help messages, so consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False, slow = False):

    configyaml = None
    # TODO error catching
    #print("input path: ",input_path, " output path: ", output_path)
@@ -86,7 +71,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictFilter["chunk_freq"] = "5yr"
dictFilterIgnore["remove"]= 'DO_NOT_USE'
'''
#########################################################
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
@@ -125,7 +109,25 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
print("JSON generated at:", os.path.abspath(json_path))
print("CSV generated at:", os.path.abspath(csv_path))
logger.info("CSV generated at" + os.path.abspath(csv_path))
return(csv_path,json_path)

#Setting up argument parsing/flags
@click.command()
#TODO arguments don't have help messages, so consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)

def create_catalog_cli(**kwargs):
    return create_catalog(**kwargs)

if __name__ == '__main__':
    main()
    create_catalog_cli()
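With this split, create_catalog can be imported and called directly from Python, while create_catalog_cli preserves the original click-based command line. A hedged usage sketch follows; the paths are placeholders, not values from this change.

```python
# Sketch of programmatic use after the refactor; both paths below are placeholders.
from catalogbuilder.scripts import gen_intake_gfdl

csv_path, json_path = gen_intake_gfdl.create_catalog(
    input_path="/path/to/pp",  # placeholder: root of a GFDL post-processing tree
    output_path="mycatalog",   # suffix only: produces mycatalog.csv and mycatalog.json
)
print(csv_path, json_path)
```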
12 changes: 6 additions & 6 deletions catalogbuilder/scripts/gen_intake_gfdl_runner.py
@@ -1,13 +1,13 @@
#!/usr/bin/env python

#TODO test after conda pkg is published and make changes as needed
#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
from catalogbuilder.scripts import gen_intake_gfdl
import sys

input_path = "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
input_path = "archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp"
output_path = "test"
sys.argv = ['INPUT_PATH', input_path, output_path]
print(sys.argv)
gen_intake_gfdl.main()
try:
    gen_intake_gfdl.create_catalog(input_path,output_path)
except:
    sys.exit("Exception occurred calling gen_intake_gfdl.create_catalog")

18 changes: 10 additions & 8 deletions catalogbuilder/scripts/gen_intake_gfdl_runner_config.py
@@ -1,11 +1,13 @@
#!/usr/bin/env python

#from catalogbuilder.scripts import gen_intake_gfdl
from . import gen_intake_gfdl
import sys

# this will break at some point #TODO
sys.argv = ['input_path','--config', '/home/a1r/github/CatalogBuilder/scripts/configs/config-example.yml']
print(sys.argv)
gen_intake_gfdl.main()
from catalogbuilder.scripts import gen_intake_gfdl
import sys, os

#This is an example call to run catalog builder using a yaml config file.
package_dir = os.path.dirname(os.path.abspath(__file__))
configyaml = os.path.join(package_dir, 'configs/config-example.yml')

def create_catalog_from_config(config=configyaml):
    csv, json = gen_intake_gfdl.create_catalog(config=configyaml)
    return(csv,json)
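Because create_catalog_from_config defaults to the packaged config-example.yml, it can be called with no arguments, which is how the new test uses it. A brief sketch of that call:

```python
# Sketch: build a catalog from the packaged example config; no arguments needed
# because config defaults to configs/config-example.yml next to the script.
from catalogbuilder.scripts import gen_intake_gfdl_runner_config

csv_path, json_path = gen_intake_gfdl_runner_config.create_catalog_from_config()
print(csv_path, json_path)
```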

15 changes: 11 additions & 4 deletions catalogbuilder/tests/make_sample_data.py
@@ -12,17 +12,24 @@
"""
import os
import subdirs
from subdirs import *
from pathlib import Path

realm_mapping = [realm]
root_dir = 'archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp'
freq_mapping = [freq]
chunk_freq = '1yr'

def make_sample_data():
    # Create directory
    try:
        import subdirs
    except:
        import sys
        print((os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"tests")))
        sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),"tests"))
        import subdirs
    from subdirs import realm, freq, time,vars
    realm_mapping = [realm]
    freq_mapping = [freq]

    realm_ctr = (len(subdirs.realm))
    i = 0
    for j in range(0, realm_ctr):
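make_sample_data() now resolves the subdirs test module even when the tests directory is not on sys.path, so it can be invoked directly before a catalog build. A short sketch of that call, assuming the relative root_dir above resolves against a writable current working directory:

```python
# Sketch: generate the sample GFDL-style pp directory tree used by the catalog tests.
# Run from a writable working directory; root_dir in make_sample_data.py is relative.
from catalogbuilder.tests import make_sample_data

make_sample_data.make_sample_data()
```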
15 changes: 15 additions & 0 deletions catalogbuilder/tests/test_create_catalog.py
@@ -0,0 +1,15 @@
def test_create_catalog():
    from pathlib import Path
    import catalogbuilder
    from catalogbuilder.scripts import gen_intake_gfdl_runner_config
    from catalogbuilder.tests import make_sample_data
    make_sample_data.make_sample_data()
    csv, json = gen_intake_gfdl_runner_config.create_catalog_from_config()
    #to output success/failure in pytest run with conda pkg local install in extra-tests CI workflow#
    print(csv)
    csvpath = Path(csv)
    jsonpath = Path(json)
    assert csvpath.is_file()
    assert jsonpath.is_file()

