From 86b729f2024440bb5c13db593a70d359e20a2014 Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Fri, 26 Jul 2024 14:05:08 -0400 Subject: [PATCH 01/16] add slow option, cr. Ciheim. --- catalogbuilder/scripts/gen_intake_gfdl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 54fd2d0..306650d 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -42,8 +42,9 @@ @click.option('--filter_chunk', nargs=1) @click.option('--overwrite', is_flag=True, default=False) @click.option('--append', is_flag=True, default=False) +click.option('--slow','-s', is_flag=True, default=False) def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None, - overwrite=False, append=False): + overwrite=False, append=False, slow = False): configyaml = None # TODO error catching @@ -89,7 +90,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt dictInfo = {} project_dir = project_dir.rstrip("/") logger.info("Calling gfdlcrawler.crawlLocal") - list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml) + list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml,slow) #Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON with open(template_path, "r") as jsonTemplate: data = json.load(jsonTemplate) From 4dc949edca1393b26a99f1b1dc1a12bf6b32b3c7 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Fri, 26 Jul 2024 15:14:39 -0400 Subject: [PATCH 02/16] prelim changes to include cfname, not 100 complete.. 
--- catalogbuilder/intakebuilder/getinfo.py | 20 +++++++++- catalogbuilder/intakebuilder/gfdlcrawler.py | 17 +++++++-- catalogbuilder/scripts/gen_intake_gfdl.py | 4 +- catalogbuilder/tests/config-cfname.yaml | 41 +++++++++++++++++++++ 4 files changed, 75 insertions(+), 7 deletions(-) create mode 100644 catalogbuilder/tests/config-cfname.yaml diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 5cbabaa..1b03c83 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -178,7 +178,25 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo): def return_xr(fname): filexr = (xr.open_dataset(fname)) filexra = filexr.attrs - return filexra + return filexr,filexra +def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=None): + ''' + Returns info from the filename and xarray dataset object + :param fname: filename + :param filexr: Xarray dataset object + :return: dictInfo with all variable atts + ''' + #try: + filexr,filexra = return_xr(fname) + print("Variable atts from file:",filexr[variable_id]) + if (dictInfo[att] == "na"): + try: + cfname = filexr[variable_id].attrs["standard_name"] + except KeyError: + cfname = "NA" + dictInfo["standard_name"] = cfname + print("standard_name found",dictInfo["standard_name"]) + return dictInfo def getInfoFromGlobalAtts(fname,dictInfo,filexra=None): ''' Returns info from the filename and xarray dataset object diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 0c451fe..6dc4641 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -1,17 +1,16 @@ import os #from intakebuilder import getinfo, builderconfig -from . import getinfo, builderconfig +from . 
import getinfo, builderconfig, CSVwriter import sys import re import operator as op ''' localcrawler crawls through the local file path, then calls helper functions in the package to getinfo. It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'} - ''' -def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml): +def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): ''' - Craw through the local directory and run through the getInfo.. functions + crawl through the local directory and run through the getInfo.. functions :param projectdir: :return:listfiles which has a dictionary of all key/value pairs for each file to be added to the csv ''' @@ -74,6 +73,16 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml): rmkeys = list(set(rmkeys)) for k in rmkeys: dictInfo.pop(k,None) + # todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out + # we can scan filenames or config etc + #here, we will see if there are missing header values and compare with file attributes if slow option is turned on + if (slow == True) & (bool(dictInfo) == True) : + print("Slow option turned on.. 
lets open some files using xarray and lookup atts",filename) + headers = CSVwriter.getHeader(configyaml) + if "standard_name" in headers: + dictInfo["standard_name"] = "na" + getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo) listfiles.append(dictInfo) + return listfiles diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 306650d..6fbeca1 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -11,7 +11,6 @@ logger.setLevel(logging.INFO) try: - #from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser except ModuleNotFoundError: print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ") @@ -23,6 +22,7 @@ #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) try: from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser + print(gfdlcrawler.__file__) except ModuleNotFoundError: sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? 
") @@ -42,7 +42,7 @@ @click.option('--filter_chunk', nargs=1) @click.option('--overwrite', is_flag=True, default=False) @click.option('--append', is_flag=True, default=False) -click.option('--slow','-s', is_flag=True, default=False) +@click.option('--slow','-s', is_flag=True, default=False) def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None, overwrite=False, append=False, slow = False): diff --git a/catalogbuilder/tests/config-cfname.yaml b/catalogbuilder/tests/config-cfname.yaml new file mode 100644 index 0000000..c44ab0b --- /dev/null +++ b/catalogbuilder/tests/config-cfname.yaml @@ -0,0 +1,41 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template +#for the fourth value. + +#catalog headers +#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjuction +#with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "modeling_realm", "table_id", + "member_id", "grid_label", "variable_id", + "temporal_subset", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + +#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template +#for the fourth value. + +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['modeling_realm','temporal_subset','variable_id'] + +#OUTPUT FILE INFO is currently passed as command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = + +####################################################### + +input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: /Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. 
This can also be an absolute path) From c61b732741e8f0df112446b4183ed5ed12a88cf8 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Fri, 26 Jul 2024 17:28:04 -0400 Subject: [PATCH 03/16] standard name when you dont pass slow mode will use look up table, if standard name exists in the headerlist of your config file --- catalogbuilder/intakebuilder/getinfo.py | 22 ++++++++++++ catalogbuilder/intakebuilder/gfdlcrawler.py | 38 ++++++++++++++++----- catalogbuilder/tests/config-cfname.yaml | 8 ++--- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 1b03c83..e0040b0 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -223,3 +223,25 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None): dictInfo["frequency"] = frequency return dictInfo +def getStandardName(variable_id): + ''' + Returns standard name for the variable in question + ''' + unique_cf = "na" + try: + url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv" + df = pd.read_csv(url, sep=",", header=0,index_col=False) + except IOError: + print("Unable to open file") + sys.exit(1) + #search for variable and its cf name + cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) + #cfname.to_string(index=False).tolist() + list_cfname = cfname.tolist() + if not list_cfname: + print("what if the names correspond to CMOR_varname") + cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) + list_cfname = cfname.tolist() + if len(list_cfname) > 0: + unique_cf = list(set(list_cfname))[0] + return (unique_cf) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 6dc4641..909aaaa 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -1,6 +1,6 @@ 
import os #from intakebuilder import getinfo, builderconfig -from . import getinfo, builderconfig, CSVwriter +from . import getinfo, builderconfig import sys import re import operator as op @@ -21,6 +21,27 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): orig_pat = pat + if configyaml: + headerlist = configyaml.headerlist + else: + headerlist = builderconfig.headerlist + + #For those columns that we cannot find in output path template or output file template from config yaml, we have hooks + #now to look up the netcdf dataset if slow option is True + #todo catch exceptions upon furhter testing + list_ptemplate = configyaml.output_path_template + list_ftemplate = configyaml.output_file_template + set_ptemplate = set(list_ptemplate) + set_ftemplate = set(list_ftemplate) + #print(headerlist) + #print(list_ptemplate) + #print(list_ftemplate) + diffcols = [x for x in headerlist if x not in set_ptemplate] + missingcols = [col for col in diffcols if col not in set_ftemplate] + missingcols.remove("path") #because we get this anyway + print("Missing cols from metadata sources:", missingcols) + + #TODO INCLUDE filter in traversing through directories at the top for dirpath, dirs, files in os.walk(projectdir): searchpath = dirpath @@ -59,11 +80,6 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if(dictInfo["chunk_freq"] in list_bad_chunklabel): logger.debug("Found bad chunk, skipping this possibly bad DRS filename",filepath) continue - - if configyaml: - headerlist = configyaml.headerlist - else: - headerlist = builderconfig.headerlist # remove those keys that are not CSV headers # move it so its one time rmkeys = [] @@ -71,15 +87,19 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if dkeys not in headerlist: rmkeys.append(dkeys) rmkeys = list(set(rmkeys)) - for k in rmkeys: dictInfo.pop(k,None) + if (bool(dictInfo)) & ("standard_name" in missingcols) & (slow == False): + #If 
we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs + cfname = getinfo.getStandardName(dictInfo["variable_id"]) + dictInfo["standard_name"] = cfname + print("standard name from look-up table-", cfname) # todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out # we can scan filenames or config etc #here, we will see if there are missing header values and compare with file attributes if slow option is turned on if (slow == True) & (bool(dictInfo) == True) : print("Slow option turned on.. lets open some files using xarray and lookup atts",filename) - headers = CSVwriter.getHeader(configyaml) - if "standard_name" in headers: + #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missngcols or if header info is not in config yaml + if "standard_name" in missingcols: dictInfo["standard_name"] = "na" getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo) diff --git a/catalogbuilder/tests/config-cfname.yaml b/catalogbuilder/tests/config-cfname.yaml index c44ab0b..95e3217 100644 --- a/catalogbuilder/tests/config-cfname.yaml +++ b/catalogbuilder/tests/config-cfname.yaml @@ -13,9 +13,9 @@ #with the ESM collection specification standards and the appropriate workflows. headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", - "frequency", "modeling_realm", "table_id", + "frequency", "realm", "table_id", "member_id", "grid_label", "variable_id", - "temporal_subset", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] #what kind of directory structure to expect? 
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp @@ -27,9 +27,9 @@ headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", #The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template #for the fourth value. -output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq'] -output_file_template: ['modeling_realm','temporal_subset','variable_id'] +output_file_template: ['realm','time_range','variable_id'] #OUTPUT FILE INFO is currently passed as command-line argument. #We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. From e53a238beb36193be15ffd2f13f67f426eff667c Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Fri, 26 Jul 2024 18:44:53 -0400 Subject: [PATCH 04/16] not working yet - fast opt --- catalogbuilder/scripts/gen_intake_gfdl.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 6fbeca1..525a4ef 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import json -import sys +import sys,pandas as pd import click import os from pathlib import Path @@ -11,7 +11,7 @@ logger.setLevel(logging.INFO) try: - from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser + from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser, getinfo except ModuleNotFoundError: print("The module intakebuilder is not installed. 
Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ") print("Attempting again with adjusted sys.path ") @@ -21,7 +21,7 @@ print("Unable to adjust sys.path") #print(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) try: - from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser + from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser,getinfo print(gfdlcrawler.__file__) except ModuleNotFoundError: sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ") @@ -104,7 +104,21 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt # so we check if it's a directory first if os.path.isdir(os.path.dirname(csv_path)): os.makedirs(os.path.dirname(csv_path), exist_ok=True) - CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append) + CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append,slow) + if(slow == False): + #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. 
Useful for MDTF runs + df = pd.read_csv(os.path.abspath(csv_path), sep=",", header=0,index_col=False) + list_variable_id = [] + list_variable_id = df["variable_id"].tolist() + dictVarCF = getinfo.getStandardName(list_variable_id) + print("standard name from look-up table-", dictVarCF) + for k, v in dictVarCF.items(): + #if(df['variable_id'].eq(k)).any(): + df['standard_name'].loc[(df['variable_id'].eq(k)).any()] = v + #df['standard_name'] = v + with open(csv_path, 'w') as csvfile: + df.to_csv(csvfile) + print("JSON generated at:", os.path.abspath(json_path)) print("CSV generated at:", os.path.abspath(csv_path)) logger.info("CSV generated at" + os.path.abspath(csv_path)) From 59b9e44983d6cf62567fc12bfc78fc88e2c56996 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Fri, 26 Jul 2024 18:45:24 -0400 Subject: [PATCH 05/16] fast opt incompleted --- catalogbuilder/intakebuilder/CSVwriter.py | 4 +++- catalogbuilder/intakebuilder/getinfo.py | 26 +++++++++++---------- catalogbuilder/intakebuilder/gfdlcrawler.py | 6 ----- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/catalogbuilder/intakebuilder/CSVwriter.py b/catalogbuilder/intakebuilder/CSVwriter.py index 021bf9c..f231bfd 100644 --- a/catalogbuilder/intakebuilder/CSVwriter.py +++ b/catalogbuilder/intakebuilder/CSVwriter.py @@ -1,5 +1,6 @@ import os.path import csv +import pandas as pd from csv import writer #from intakebuilder import builderconfig, configparser from . 
import builderconfig, configparser @@ -40,7 +41,7 @@ def file_appender(dictinputs, csvfile): # add contents of list as last row in the csv file csv_writer.writerow(dictinputs) -def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append): +def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append,slow): try: #Open the CSV file in write mode and add any data with atleast 3 values associated with it if overwrite: @@ -95,5 +96,6 @@ def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append): for data in dict_info: if len(data.keys()) > 2: writer.writerow(data) + except IOError: print("I/O error") diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index e0040b0..e343296 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -223,11 +223,12 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None): dictInfo["frequency"] = frequency return dictInfo -def getStandardName(variable_id): +def getStandardName(list_variable_id): ''' - Returns standard name for the variable in question + Returns dict standard name for the variable in question ''' unique_cf = "na" + dictCF = {} try: url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv" df = pd.read_csv(url, sep=",", header=0,index_col=False) @@ -235,13 +236,14 @@ def getStandardName(variable_id): print("Unable to open file") sys.exit(1) #search for variable and its cf name - cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) - #cfname.to_string(index=False).tolist() - list_cfname = cfname.tolist() - if not list_cfname: - print("what if the names correspond to CMOR_varname") - cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) - list_cfname = cfname.tolist() - if len(list_cfname) > 0: - unique_cf = list(set(list_cfname))[0] - return (unique_cf) + for variable_id in list_variable_id: 
+ cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) + list_cfname = cfname.tolist() + if not list_cfname: + print("what if the names correspond to CMOR_varname") + cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) + list_cfname = cfname.tolist() + if len(list_cfname) > 0: + unique_cf = list(set(list_cfname))[0] + dictCF[variable_id] = unique_cf + return (dictCF) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 909aaaa..d095569 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -88,11 +88,6 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): rmkeys.append(dkeys) rmkeys = list(set(rmkeys)) for k in rmkeys: dictInfo.pop(k,None) - if (bool(dictInfo)) & ("standard_name" in missingcols) & (slow == False): - #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. Useful for MDTF runs - cfname = getinfo.getStandardName(dictInfo["variable_id"]) - dictInfo["standard_name"] = cfname - print("standard name from look-up table-", cfname) # todo do the reverse if slow is on. 
Open file no matter what and populate dictionary values and if there is something missed out # we can scan filenames or config etc #here, we will see if there are missing header values and compare with file attributes if slow option is turned on @@ -104,5 +99,4 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo) listfiles.append(dictInfo) - return listfiles From 05ef32a4fb582af5fefea42f23263996058ff194 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Fri, 26 Jul 2024 18:48:13 -0400 Subject: [PATCH 06/16] fast option seems to be ok --- catalogbuilder/scripts/gen_intake_gfdl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 525a4ef..c0d5c12 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -114,7 +114,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt print("standard name from look-up table-", dictVarCF) for k, v in dictVarCF.items(): #if(df['variable_id'].eq(k)).any(): - df['standard_name'].loc[(df['variable_id'].eq(k)).any()] = v + df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v with open(csv_path, 'w') as csvfile: df.to_csv(csvfile) From 5374491c123d720e6a4bb240978f7ee37402191f Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 11:30:35 -0400 Subject: [PATCH 07/16] rm print --- catalogbuilder/intakebuilder/getinfo.py | 4 ++-- catalogbuilder/scripts/gen_intake_gfdl.py | 2 +- catalogbuilder/tests/config-cfname.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index e343296..5406d0c 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -188,7 +188,7 
@@ def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=No ''' #try: filexr,filexra = return_xr(fname) - print("Variable atts from file:",filexr[variable_id]) + #print("Variable atts from file:",filexr[variable_id]) if (dictInfo[att] == "na"): try: cfname = filexr[variable_id].attrs["standard_name"] @@ -240,7 +240,7 @@ def getStandardName(list_variable_id): cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) list_cfname = cfname.tolist() if not list_cfname: - print("what if the names correspond to CMOR_varname") + #print("what if the names correspond to CMOR_varname") cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) list_cfname = cfname.tolist() if len(list_cfname) > 0: diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index c0d5c12..946b2cc 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -111,7 +111,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt list_variable_id = [] list_variable_id = df["variable_id"].tolist() dictVarCF = getinfo.getStandardName(list_variable_id) - print("standard name from look-up table-", dictVarCF) + #print("standard name from look-up table-", dictVarCF) for k, v in dictVarCF.items(): #if(df['variable_id'].eq(k)).any(): df['standard_name'].loc[(df['variable_id'] == k)] = v diff --git a/catalogbuilder/tests/config-cfname.yaml b/catalogbuilder/tests/config-cfname.yaml index 95e3217..21d8ceb 100644 --- a/catalogbuilder/tests/config-cfname.yaml +++ b/catalogbuilder/tests/config-cfname.yaml @@ -37,5 +37,5 @@ output_file_template: ['realm','time_range','variable_id'] ####################################################### -input_path: "/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" #"ENTER INPUT PATH HERE" #Example: 
/Users/ar46/archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp/" -output_path: "catalog" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) +input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) From 411cda466b609f3e485992eaab52bc2ccd3d42a2 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 11:34:48 -0400 Subject: [PATCH 08/16] when no config is passed --- catalogbuilder/intakebuilder/gfdlcrawler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index d095569..ff49925 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -29,10 +29,13 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): #For those columns that we cannot find in output path template or output file template from config yaml, we have hooks #now to look up the netcdf dataset if slow option is True #todo catch exceptions upon furhter testing - list_ptemplate = configyaml.output_path_template - list_ftemplate = configyaml.output_file_template - set_ptemplate = set(list_ptemplate) - set_ftemplate = set(list_ftemplate) + list_ptemplate = [] + list_ftemplate = [] + if( configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) : + list_ptemplate = configyaml.output_path_template + list_ftemplate = configyaml.output_file_template + set_ptemplate = set(list_ptemplate) + set_ftemplate = set(list_ftemplate) #print(headerlist) 
#print(list_ptemplate) #print(list_ftemplate) From 5bdef42e1a2988b34a308def8ddfb1308f311a8f Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 11:39:30 -0400 Subject: [PATCH 09/16] referencing None object, fixing --- catalogbuilder/intakebuilder/gfdlcrawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index ff49925..3cafb02 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -31,9 +31,10 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): #todo catch exceptions upon furhter testing list_ptemplate = [] list_ftemplate = [] - if( configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) : - list_ptemplate = configyaml.output_path_template - list_ftemplate = configyaml.output_file_template + if( configyaml is not None): + if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) : + list_ptemplate = configyaml.output_path_template + list_ftemplate = configyaml.output_file_template set_ptemplate = set(list_ptemplate) set_ftemplate = set(list_ftemplate) #print(headerlist) From 05193137d7318d5e71eb3ab3e254ceeedb1bf05d Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 11:58:18 -0400 Subject: [PATCH 10/16] minor fix to let ci pass --- catalogbuilder/intakebuilder/gfdlcrawler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 3cafb02..e6bee89 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -40,10 +40,12 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): #print(headerlist) #print(list_ptemplate) #print(list_ftemplate) - diffcols = [x for 
x in headerlist if x not in set_ptemplate] - missingcols = [col for col in diffcols if col not in set_ftemplate] - missingcols.remove("path") #because we get this anyway - print("Missing cols from metadata sources:", missingcols) + if (len(set_ptemplate) > 0): + diffcols = [x for x in headerlist if x not in set_ptemplate] + if ( len(set_ftemplate) > 0 ): + missingcols = [col for col in diffcols if col not in set_ftemplate] + missingcols.remove("path") #because we get this anyway + print("Missing cols from metadata sources:", missingcols) #TODO INCLUDE filter in traversing through directories at the top From 78f00f67c71995f02fc91c4c9fefa20e3739010a Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:01:19 -0400 Subject: [PATCH 11/16] pandas future warnings off --- catalogbuilder/intakebuilder/getinfo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 5406d0c..628193d 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -6,7 +6,8 @@ import xarray as xr #from intakebuilder import builderconfig, configparser from . 
import builderconfig, configparser - +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) ''' getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog From 4fa54e7c6d8828fef19540c4a8592dc4a1979847 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:03:42 -0400 Subject: [PATCH 12/16] sphinx url updated in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 896e4e0..d9dbf0d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)](https://doi.org/10.5281/zenodo.10787602) -See our [project documentation site ](https://aradhakrishnangfdl.github.io/CatalogBuilder/). +See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/). This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
From 8e9de1238a70ca729395985fb9108db95352eedd Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:05:22 -0400 Subject: [PATCH 13/16] set init --- catalogbuilder/intakebuilder/gfdlcrawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index e6bee89..79164ad 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -31,6 +31,9 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): #todo catch exceptions upon furhter testing list_ptemplate = [] list_ftemplate = [] + set_ptemplate = set() + set_ftemplate = set() + if( configyaml is not None): if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) : list_ptemplate = configyaml.output_path_template list_ftemplate = configyaml.output_file_template From f2e68b2de2641c6b1c46e8073a82083a14be5b5a Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:11:49 -0400 Subject: [PATCH 14/16] gen_intake_gfdl if cf name exists do stuff --- catalogbuilder/scripts/gen_intake_gfdl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 946b2cc..5d6cd99 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -105,7 +105,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt if os.path.isdir(os.path.dirname(csv_path)): os.makedirs(os.path.dirname(csv_path), exist_ok=True) CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append,slow) - if(slow == False): + if(slow == False) & ('standard_name' in headers ): #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option.
Useful for MDTF runs df = pd.read_csv(os.path.abspath(csv_path), sep=",", header=0,index_col=False) list_variable_id = [] From 2ba2fb5beea4f6f50ef9fd366af3e980e66911ab Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:21:03 -0400 Subject: [PATCH 15/16] df checks --- catalogbuilder/scripts/gen_intake_gfdl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 5d6cd99..639982b 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -105,6 +105,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt if os.path.isdir(os.path.dirname(csv_path)): os.makedirs(os.path.dirname(csv_path), exist_ok=True) CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append,slow) + df = None if(slow == False) & ('standard_name' in headers ): #If we badly need standard name, we use gfdl cmip mapping tables especially when one does not prefer the slow option. 
Useful for MDTF runs df = pd.read_csv(os.path.abspath(csv_path), sep=",", header=0,index_col=False) @@ -116,8 +117,11 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt #if(df['variable_id'].eq(k)).any(): df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v - with open(csv_path, 'w') as csvfile: - df.to_csv(csvfile) + + if(slow == False) & ('standard_name' in headers ): + if ((df is not None) & (len(df) != 0) ): + with open(csv_path, 'w') as csvfile: + df.to_csv(csvfile) print("JSON generated at:", os.path.abspath(json_path)) print("CSV generated at:", os.path.abspath(csv_path)) From 776a5e73985e13eac958255f70e2c4496237b617 Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Mon, 29 Jul 2024 12:27:14 -0400 Subject: [PATCH 16/16] testing with no config so CI passes --- catalogbuilder/scripts/gen_intake_gfdl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 639982b..b744f0d 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -118,7 +118,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v - if(slow == False) & ('standard_name' in headers ): + if(slow == False) & ('standard_name' in headers): if ((df is not None) & (len(df) != 0) ): with open(csv_path, 'w') as csvfile: df.to_csv(csvfile)