generated from NOAA-GFDL/template-repository
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3f6c8a9
commit 81a5757
Showing
10 changed files
with
5,148 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" | ||
output_path: "catalog" # ENTER THE BASE NAME OF THE CSV AND JSON, WITHOUT EXTENSION, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#what kind of directory structure to expect? | ||
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp | ||
# the output_path_template is set as follows. | ||
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we | ||
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example | ||
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure | ||
#this is a valid value in headerlist as well. | ||
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply put NA in output_path_template | ||
#for the fourth value. | ||
|
||
#catalog headers | ||
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction | ||
#with the ESM collection specification standards and the appropriate workflows. | ||
|
||
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", | ||
"frequency", "modeling_realm", "table_id", | ||
"member_id", "grid_label", "variable_id", | ||
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] | ||
|
||
#what kind of directory structure to expect? | ||
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp | ||
# the output_path_template is set as follows. | ||
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we | ||
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example | ||
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure | ||
#this is a valid value in headerlist as well. | ||
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply put NA in output_path_template | ||
#for the fourth value. | ||
|
||
output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] | ||
|
||
output_file_template: ['modeling_realm','temporal_subset','variable_id'] | ||
|
||
#OUTPUT FILE INFO is currently passed as command-line argument. | ||
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. | ||
#csvfile = #jsonfile = #logfile = | ||
|
||
####################################################### | ||
|
||
input_path: "/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" | ||
output_path: "catalog" # ENTER THE BASE NAME OF THE CSV AND JSON, WITHOUT EXTENSION, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
#!/usr/bin/env python | ||
|
||
import json | ||
import sys | ||
import click | ||
import os | ||
from pathlib import Path | ||
import logging | ||
|
||
# Module-level logger named 'local'; its level is set to INFO so the
# logger.info() calls in this script are not filtered out by the logger itself.
logger = logging.getLogger('local')
logger.setLevel(logging.INFO)

# Import the intakebuilder package. If it is not importable, append the
# directory two levels above this file to sys.path and try once more before
# giving up with an explanatory exit message.
try:
    from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
except ModuleNotFoundError:
    print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
    print("Attempting again with adjusted sys.path ")
    try:
        sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    except Exception:
        # Best effort only: keep going so the retry below produces the final
        # diagnostic. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Unable to adjust sys.path")
    try:
        from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
    except ModuleNotFoundError:
        sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")

# Location of the JSON catalog template, resolved relative to this file.
package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
|
||
#Setting up argument parsing/flags | ||
@click.command()
# click arguments cannot carry their own help text, so INPUT_PATH and
# OUTPUT_PATH are described in the command docstring (shown by --help).
@click.argument('input_path', required=False, nargs=1)
@click.argument('output_path', required=False, nargs=1)
@click.option('--config', required=False, type=click.Path(exists=True), nargs=1,
              help='Path to your yaml config, Use the config_template in intakebuilder repo')
@click.option('--filter_realm', nargs=1)
@click.option('--filter_freq', nargs=1)
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
         overwrite=False, append=False):
    """Build an intake catalog (a CSV file plus a JSON descriptor).

    INPUT_PATH is the directory with the datasets to be cataloged, e.g. a
    GFDL PP path up to /pp. OUTPUT_PATH is the output file name without
    extension, e.g. "catalog"; ".csv" and ".json" are appended to it.

    If either path is omitted, both are read from the yaml file given with
    --config. The --filter_* options restrict the crawl to one modeling
    realm, frequency or chunk frequency.
    """
    configyaml = None
    # TODO error catching
    if input_path is None or output_path is None:
        print("No paths given, using yaml configuration")
        # NOTE(review): `config` is None here when --config was not passed;
        # confirm how configparser.Config handles that case.
        configyaml = configparser.Config(config)
        # Both paths are taken from the config, so both must be present;
        # a missing output_path would otherwise fail later inside Path().
        if not configyaml.input_path or not configyaml.output_path:
            sys.exit("Can't find paths, is yaml configured?")
        input_path = configyaml.input_path
        output_path = configyaml.output_path

    if not os.path.exists(input_path):
        sys.exit("Input path does not exist. Adjust configuration.")
    if not os.path.exists(Path(output_path).parent.absolute()):
        sys.exit("Output path parent directory does not exist. Adjust configuration.")
    csv_path = "{0}.csv".format(output_path)
    json_path = "{0}.json".format(output_path)

    ######### SEARCH FILTERS ###########################

    # Only filters that were actually supplied on the command line are passed on.
    requested_filters = {
        "modeling_realm": filter_realm,
        "frequency": filter_freq,
        "chunk_freq": filter_chunk,
    }
    dictFilter = {key: value for key, value in requested_filters.items() if value}
    dictFilterIgnore = {}

    # Override config file if necessary for dev:
    # project_dir = "/archive/oar.gfdl.cmip6/ESM4/DECK/ESM4_1pctCO2_D1/gfdl.ncrc4-intel16-prod-openmp/pp/"
    # #for dev csvfile = "/nbhome/$USER/intakebuilder_cats/intake_gfdl2.csv"
    # dictFilterIgnore = {}
    # dictFilter["modeling_realm"]= 'atmos_cmip'
    # dictFilter["frequency"] = "monthly"
    # dictFilter["chunk_freq"] = "5yr"
    # dictFilterIgnore["remove"]= 'DO_NOT_USE'
    #########################################################

    project_dir = input_path.rstrip("/")
    logger.info("Calling gfdlcrawler.crawlLocal")
    list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)

    # Load the template JSON, point its catalog_file entry at the CSV we are
    # about to write, and dump the result as the new JSON descriptor.
    with open(template_path, "r") as jsonTemplate:
        data = json.load(jsonTemplate)
    data["catalog_file"] = os.path.abspath(csv_path)
    # Context manager so the file is closed even if json.dump raises.
    with open(json_path, "w") as jsonFile:
        json.dump(data, jsonFile, indent=2)
    headers = CSVwriter.getHeader(configyaml)

    # dirname() is empty when only a file name (no directory) was given, and
    # makedirs('') would raise. The parent directory was validated above, so
    # this is a safeguard only.
    csv_dir = os.path.dirname(csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append)
    print("JSON generated at:", os.path.abspath(json_path))
    print("CSV generated at:", os.path.abspath(csv_path))
    logger.info("CSV generated at %s", os.path.abspath(csv_path))


if __name__ == '__main__':
    main()
Oops, something went wrong.