tmerlis additions to config, notebooks to build and use X-SHiELD data…

… catalog on stellar HPC
NOAA-GFDL · Nov 26, 2024 · 3d21a4e · 3d21a4e
1 parent 038de4d
commit 3d21a4e
Show file tree

Hide file tree

Showing 3 changed files with 2,629 additions and 0 deletions.
diff --git a/configs/config-xshield_stellar.yaml b/configs/config-xshield_stellar.yaml
@@ -0,0 +1,68 @@
+#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply set NA in output_path_template
+#for the fourth value.
+
+#catalog headers
+#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
+#with the ESM collection specification standards and the appropriate workflows.
+
+# default, failed because standard_name not compatible with X-SHiELD output
+# may be okay with updated diag table that uses cmip variable names
+#headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+#                  "frequency", "realm", "table_id",
+#                  "member_id", "grid_label", "variable_id",
+#                  "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]
+
+# working prototype v1
+#headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+#                  "frequency", "realm", "table_id",
+#                  "member_id", "grid_label", "variable_id",
+#                  "time_range", "chunk_freq","platform","dimensions","cell_methods","path"]
+
+# eliminated some unused headers, but
+# eliminating frequency, realm, table_id, chunk_freq, causes a problem
+# when col.search is called
+headerlist: ["source_id", "experiment_id",
+                  "frequency", "realm", "table_id",
+                  "member_id", "grid_label", "variable_id",
+                  "time_range", "chunk_freq","platform","path"]
+
+#headerlist: ["source_id","platform","activity_id", "experiment_id",
+#                  "frequency", "realm", "table_id",
+#                  "member_id", "grid_label", "variable_id",
+#                  "time_range", "chunk_freq","platform","dimensions","cell_methods","path"]
+
+#output_path_template: ['NA','NA','source_id','platform','activity_id','experiment_id','custom_pp','time_range']
+
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply set NA in output_path_template
+#for the fourth value.
+
+output_path_template: ['NA','NA','NA','source_id','platform','member_id','experiment_id','custom_pp','time_range']
+
+output_file_template: ['variable_id','NA']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
+
+#######################################################
+
+# note: this input path is built from softlinks to the directory /scratch/cimes/GLOBALFV3/stellar_run/
+# cp -as /scratch/cimes/GLOBALFV3/stellar_run/processed/  /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/processed
+# cp -as /scratch/cimes/GLOBALFV3/stellar_run/processed_new/  /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/processed_new
+# this avoids the other directories that contain other experiments that have not been 'processed'
+input_path:  "/scratch/cimes/tmerlis/GLOBALFV3/stellar_run/"
+output_path: "/home/tmerlis/hackathon/catbuild/pire4" # Enter the base name of the CSV and JSON outputs, without extension, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.
diff --git a/examples/build_xshield_catalog.ipynb b/examples/build_xshield_catalog.ipynb
@@ -0,0 +1,102 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2a5a9746-b56f-4462-b938-ae17f7129a8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import catalogbuilder\n",
+    "from catalogbuilder.scripts import gen_intake_gfdl\n",
+    "import sys, os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1b0f5306-0032-4e0a-8c4a-b5bcb1d2ab7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#This is an example call to run catalog builder using a yaml config file.\n",
+    "\n",
+    "configyaml = '/home/tmerlis/hackathon/catbuild/config-xshield_stellar.yaml' \n",
+    "# soft link to processed and processed_new **only** from within '/scratch/cimes/GLOBALFV3/stellar_run/' \n",
+    "input_path = '/scratch/cimes/tmerlis/GLOBALFV3/stellar_run/' \n",
+    "output_path = '/home/tmerlis/hackathon/catbuild/xshield_cat' "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e8d89aec-655e-4729-820b-a37ffb60454e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:local:[Mostly] silent log activated\n",
+      "INFO:local:Default schema: catalogbuilder/cats/gfdl_template.json\n",
+      "INFO:local:input path: /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/\n",
+      "INFO:local: output path: /home/tmerlis/hackathon/catbuild/xshield_cat\n",
+      "JSON generated at: /home/tmerlis/hackathon/catbuild/xshield_cat.json\n",
+      "CSV generated at: /home/tmerlis/hackathon/catbuild/xshield_cat.csv\n",
+      "INFO:local:CSV generated at/home/tmerlis/hackathon/catbuild/xshield_cat.csv\n",
+      "CPU times: user 4.76 s, sys: 12.6 s, total: 17.3 s\n",
+      "Wall time: 2min 5s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "def create_catalog_from_config(input_path=input_path,output_path=output_path,configyaml=configyaml):\n",
+    "    csv, json = gen_intake_gfdl.create_catalog(input_path=input_path,output_path=output_path,config=configyaml)\n",
+    "    return(csv,json)\n",
+    "\n",
+    "if __name__ == '__main__':\n",
+    "    create_catalog_from_config(input_path,output_path,configyaml)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "69de4f3a-c622-444f-9bdf-cb3459d3261c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No traceback available to show.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%tb"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "catalogbuilder [~/.conda/envs/catalogbuilder/]",
+   "language": "python",
+   "name": "conda_catalogbuilder"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}