Commit
moving scripts
aradhakrishnanGFDL committed Jul 19, 2024
1 parent 3f6c8a9 commit 81a5757
Showing 10 changed files with 5,148 additions and 0 deletions.
Empty file.
2 changes: 2 additions & 0 deletions catalogbuilder/scripts/configs/config-example.yml
@@ -0,0 +1,2 @@
input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
41 changes: 41 additions & 0 deletions catalogbuilder/scripts/configs/config-template.yaml
@@ -0,0 +1,41 @@

#catalog headers
#The headerlist is the list of expected column names in your catalog/CSV file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
             "frequency", "modeling_realm", "table_id",
             "member_id", "grid_label", "variable_id",
             "temporal_subset", "chunk_freq", "platform", "dimensions", "cell_methods", "path"]

#What kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
#the output_path_template is set as follows.
#We put NA in those slots that do not match up with any of the expected headerlist values (CSV columns); otherwise we
#simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply put NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_file_template: ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as a command-line argument.
#We will revisit adding csvfile, jsonfile and logfile settings to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
112 changes: 112 additions & 0 deletions catalogbuilder/scripts/gen_intake_gfdl.py
@@ -0,0 +1,112 @@
#!/usr/bin/env python

import json
import sys
import click
import os
from pathlib import Path
import logging

logger = logging.getLogger('local')
logger.setLevel(logging.INFO)

try:
    from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
except ModuleNotFoundError:
    print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
    print("Attempting again with adjusted sys.path ")
    try:
        sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    except:
        print("Unable to adjust sys.path")
    #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    try:
        from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
    except ModuleNotFoundError:
        sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")

package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')

#Setting up argument parsing/flags
@click.command()
#TODO: arguments don't have a help message, so consider changing arguments to options?
@click.argument('input_path',required=False,nargs=1)
#,help='The directory path with the datasets to be cataloged. E.g a GFDL PP path till /pp')
@click.argument('output_path',required=False,nargs=1)
#,help='Specify output filename suffix only. e.g. catalog')
@click.option('--config',required=False,type=click.Path(exists=True),nargs=1,help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
         overwrite=False, append=False):

    configyaml = None
    # TODO error catching
    #print("input path: ", input_path, " output path: ", output_path)
    if input_path is None or output_path is None:
        print("No paths given, using yaml configuration")
        configyaml = configparser.Config(config)
        if configyaml.input_path is None or not configyaml.input_path:
            sys.exit("Can't find paths, is yaml configured?")
        input_path = configyaml.input_path
        output_path = configyaml.output_path

    if not os.path.exists(input_path):
        sys.exit("Input path does not exist. Adjust configuration.")
    if not os.path.exists(Path(output_path).parent.absolute()):
        sys.exit("Output path parent directory does not exist. Adjust configuration.")
    project_dir = input_path
    csv_path = "{0}.csv".format(output_path)
    json_path = "{0}.json".format(output_path)

    ######### SEARCH FILTERS ###########################

    dictFilter = {}
    dictFilterIgnore = {}
    if filter_realm:
        dictFilter["modeling_realm"] = filter_realm
    if filter_freq:
        dictFilter["frequency"] = filter_freq
    if filter_chunk:
        dictFilter["chunk_freq"] = filter_chunk

    ''' Override config file if necessary for dev
    project_dir = "/archive/oar.gfdl.cmip6/ESM4/DECK/ESM4_1pctCO2_D1/gfdl.ncrc4-intel16-prod-openmp/pp/"
    #for dev csvfile = "/nbhome/$USER/intakebuilder_cats/intake_gfdl2.csv"
    dictFilterIgnore = {}
    dictFilter["modeling_realm"] = 'atmos_cmip'
    dictFilter["frequency"] = "monthly"
    dictFilter["chunk_freq"] = "5yr"
    dictFilterIgnore["remove"] = 'DO_NOT_USE'
    '''
    #########################################################
    dictInfo = {}
    project_dir = project_dir.rstrip("/")
    logger.info("Calling gfdlcrawler.crawlLocal")
    list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)
    # Grab data from the template JSON, point its CSV path at the output path, and dump it into the new JSON
    with open(template_path, "r") as jsonTemplate:
        data = json.load(jsonTemplate)
        data["catalog_file"] = os.path.abspath(csv_path)
    jsonFile = open(json_path, "w")
    json.dump(data, jsonFile, indent=2)
    jsonFile.close()
    headers = CSVwriter.getHeader(configyaml)

    # When we pass a relative path or just the filename, the following still needs to not choke,
    # so we check whether it is a directory first
    if os.path.isdir(os.path.dirname(csv_path)):
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append)
    print("JSON generated at:", os.path.abspath(json_path))
    print("CSV generated at:", os.path.abspath(csv_path))
    logger.info("CSV generated at " + os.path.abspath(csv_path))


if __name__ == '__main__':
    main()
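
As a hedged usage illustration (not part of the commit), the CLI above could also be exercised programmatically through click's test runner; the import path and config location below are assumptions:

# Illustrative only: drive the builder CLI from Python via click's CliRunner.
# Assumes gen_intake_gfdl is importable and the config file exists at that path.
from click.testing import CliRunner
from gen_intake_gfdl import main

runner = CliRunner()
result = runner.invoke(main, ["--config", "configs/config-example.yml", "--overwrite"])
print(result.exit_code)   # 0 on success
print(result.output)      # includes "JSON generated at: ..." and "CSV generated at: ..." on success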