Skip to content

Commit

Permalink
tmerlis additions to config, notebooks to build and use X-SHiELD data…
Browse files Browse the repository at this point in the history
… catalog on stellar HPC
  • Loading branch information
Timothy Merlis committed Nov 26, 2024
1 parent 038de4d commit 3d21a4e
Show file tree
Hide file tree
Showing 3 changed files with 2,629 additions and 0 deletions.
68 changes: 68 additions & 0 deletions configs/config-xshield_stellar.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

# Default headerlist: failed because standard_name is not compatible with X-SHiELD output.
# It may be okay with an updated diag table that uses CMIP variable names.
#headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
# "frequency", "realm", "table_id",
# "member_id", "grid_label", "variable_id",
# "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

# working prototype v1
#headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
# "frequency", "realm", "table_id",
# "member_id", "grid_label", "variable_id",
# "time_range", "chunk_freq","platform","dimensions","cell_methods","path"]

# Active headerlist: some unused headers from the prototype above were eliminated.
# frequency, realm, table_id and chunk_freq are kept because eliminating them
# causes a problem when col.search is called.
headerlist: ["source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","path"]

#headerlist: ["source_id","platform","activity_id", "experiment_id",
# "frequency", "realm", "table_id",
# "member_id", "grid_label", "variable_id",
# "time_range", "chunk_freq","platform","dimensions","cell_methods","path"]

#output_path_template: ['NA','NA','source_id','platform','activity_id','experiment_id','custom_pp','time_range']

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

# NOTE: unlike the /archive example described in the comments above, this template has
# three leading NA entries and 'source_id' in the fourth position.
output_path_template: ['NA','NA','NA','source_id','platform','member_id','experiment_id','custom_pp','time_range']

# Presumably maps file-name parts the same way: the first part is variable_id and the
# second is left unmapped (NA) -- TODO confirm against the builder.
output_file_template: ['variable_id','NA']

#OUTPUT FILE INFO is currently passed as a command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

# note: this input path is built from softlinks to the directory /scratch/cimes/GLOBALFV3/stellar_run/
# cp -as /scratch/cimes/GLOBALFV3/stellar_run/processed/ /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/processed
# cp -as /scratch/cimes/GLOBALFV3/stellar_run/processed_new/ /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/processed_new
# this avoids the other directories that contain other experiments that have not been 'processed'
input_path: "/scratch/cimes/tmerlis/GLOBALFV3/stellar_run/"
output_path: "/home/tmerlis/hackathon/catbuild/pire4" # Base name of the output CSV and JSON, without the file extension, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.
102 changes: 102 additions & 0 deletions examples/build_xshield_catalog.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2a5a9746-b56f-4462-b938-ae17f7129a8d",
"metadata": {},
"outputs": [],
"source": [
"import catalogbuilder\n",
"from catalogbuilder.scripts import gen_intake_gfdl\n",
"import sys, os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1b0f5306-0032-4e0a-8c4a-b5bcb1d2ab7f",
"metadata": {},
"outputs": [],
"source": [
"#This is an example call to run catalog builder using a yaml config file.\n",
"\n",
"configyaml = '/home/tmerlis/hackathon/catbuild/config-xshield_stellar.yaml' \n",
"# soft link to processed and processed_new **only** from within '/scratch/cimes/GLOBALFV3/stellar_run/' \n",
"input_path = '/scratch/cimes/tmerlis/GLOBALFV3/stellar_run/' \n",
"output_path = '/home/tmerlis/hackathon/catbuild/xshield_cat' "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e8d89aec-655e-4729-820b-a37ffb60454e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:local:[Mostly] silent log activated\n",
"INFO:local:Default schema: catalogbuilder/cats/gfdl_template.json\n",
"INFO:local:input path: /scratch/cimes/tmerlis/GLOBALFV3/stellar_run/\n",
"INFO:local: output path: /home/tmerlis/hackathon/catbuild/xshield_cat\n",
"JSON generated at: /home/tmerlis/hackathon/catbuild/xshield_cat.json\n",
"CSV generated at: /home/tmerlis/hackathon/catbuild/xshield_cat.csv\n",
"INFO:local:CSV generated at/home/tmerlis/hackathon/catbuild/xshield_cat.csv\n",
"CPU times: user 4.76 s, sys: 12.6 s, total: 17.3 s\n",
"Wall time: 2min 5s\n"
]
}
],
"source": [
"%%time\n",
"def create_catalog_from_config(input_path=input_path,output_path=output_path,configyaml=configyaml):\n",
" csv, json = gen_intake_gfdl.create_catalog(input_path=input_path,output_path=output_path,config=configyaml)\n",
" return(csv,json)\n",
"\n",
"if __name__ == '__main__':\n",
" create_catalog_from_config(input_path,output_path,configyaml)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "69de4f3a-c622-444f-9bdf-cb3459d3261c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"No traceback available to show.\n"
]
}
],
"source": [
"%tb"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "catalogbuilder [~/.conda/envs/catalogbuilder/]",
"language": "python",
"name": "conda_catalogbuilder"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 3d21a4e

Please sign in to comment.