add standard_name to catalog #19

Merged · 16 commits · Jul 29, 2024
README.md (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@

Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)](https://doi.org/10.5281/zenodo.10787602)

See our [project documentation site ](https://aradhakrishnangfdl.github.io/CatalogBuilder/).
See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/).


This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
catalogbuilder/intakebuilder/CSVwriter.py (4 changes: 3 additions & 1 deletion)
@@ -1,5 +1,6 @@
import os.path
import csv
import pandas as pd
from csv import writer
#from intakebuilder import builderconfig, configparser
from . import builderconfig, configparser
@@ -40,7 +41,7 @@ def file_appender(dictinputs, csvfile):
# add contents of list as last row in the csv file
csv_writer.writerow(dictinputs)

def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append):
def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append,slow):
try:
#Open the CSV file in write mode and add any data with at least 3 values associated with it
if overwrite:
@@ -95,5 +96,6 @@ def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append):
for data in dict_info:
if len(data.keys()) > 2:
writer.writerow(data)

except IOError:
print("I/O error")
catalogbuilder/intakebuilder/getinfo.py (47 changes: 45 additions & 2 deletions)
@@ -6,7 +6,8 @@
import xarray as xr
#from intakebuilder import builderconfig, configparser
from . import builderconfig, configparser

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

'''
getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog
@@ -178,7 +179,25 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo):
def return_xr(fname):
filexr = (xr.open_dataset(fname))
filexra = filexr.attrs
return filexra
return filexr,filexra
def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=None):
'''
Returns info from the filename and xarray dataset object
:param fname: filename
:param filexr: Xarray dataset object
:return: dictInfo with all variable atts
'''
#try:
filexr,filexra = return_xr(fname)
#print("Variable atts from file:",filexr[variable_id])
if (dictInfo[att] == "na"):
try:
cfname = filexr[variable_id].attrs["standard_name"]
except KeyError:
cfname = "NA"
dictInfo["standard_name"] = cfname
print("standard_name found",dictInfo["standard_name"])
return dictInfo
def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
'''
Returns info from the filename and xarray dataset object
@@ -205,3 +224,27 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
dictInfo["frequency"] = frequency
return dictInfo

def getStandardName(list_variable_id):
'''
Returns dict standard name for the variable in question
'''
unique_cf = "na"
dictCF = {}
try:
url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv"
df = pd.read_csv(url, sep=",", header=0,index_col=False)
except IOError:
print("Unable to open file")
sys.exit(1)
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
if not list_cfname:
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
return (dictCF)
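
For reviewers trying the two new lookups by hand, a sketch of how they compose (the file path and variable names are illustrative; `getStandardName` assumes pandas and `sys` are imported at the top of `getinfo.py`, which this hunk does not show, and it needs network access to fetch the MDTF mapping table):

```python
from catalogbuilder.intakebuilder import getinfo

# Fast path: map GFDL/CMOR variable names to CF standard names via the lookup CSV.
dictCF = getinfo.getStandardName(["tas", "zg"])
# e.g. {"tas": "air_temperature", "zg": "geopotential_height"} when both are in the table

# Slow path: open the file with xarray and read the variable's own attribute.
dictInfo = {"standard_name": "na"}  # "na" marks the column as still unresolved
dictInfo = getinfo.getInfoFromVarAtts("/archive/example/zg.nc", "zg", dictInfo)
print(dictInfo["standard_name"])    # attribute value, or "NA" if the file lacks one
```
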
catalogbuilder/intakebuilder/gfdlcrawler.py (50 changes: 41 additions & 9 deletions)
@@ -7,11 +7,10 @@
'''
localcrawler crawls through the local file path, then calls helper functions in the package to getinfo.
It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'}

'''
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
'''
Craw through the local directory and run through the getInfo.. functions
crawl through the local directory and run through the getInfo.. functions
:param projectdir:
:return:listfiles which has a dictionary of all key/value pairs for each file to be added to the csv
'''
@@ -22,6 +21,36 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):

orig_pat = pat

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist

#For those columns that we cannot find in output path template or output file template from config yaml, we have hooks
#now to look up the netcdf dataset if slow option is True
#todo catch exceptions upon further testing
list_ptemplate = []
list_ftemplate = []
set_ptemplate = set()
set_ftemplate = set()

if( configyaml is not None):
if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) :
list_ptemplate = configyaml.output_path_template
list_ftemplate = configyaml.output_file_template
set_ptemplate = set(list_ptemplate)
set_ftemplate = set(list_ftemplate)
#print(headerlist)
#print(list_ptemplate)
#print(list_ftemplate)
if (len(set_ptemplate) > 0):
diffcols = [x for x in headerlist if x not in set_ptemplate]
if ( len(set_ftemplate) > 0 ):
missingcols = [col for col in diffcols if col not in set_ftemplate]
missingcols.remove("path") #because we get this anyway
print("Missing cols from metadata sources:", missingcols)


#TODO INCLUDE filter in traversing through directories at the top
for dirpath, dirs, files in os.walk(projectdir):
searchpath = dirpath
@@ -60,20 +89,23 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
if(dictInfo["chunk_freq"] in list_bad_chunklabel):
logger.debug("Found bad chunk, skipping this possibly bad DRS filename",filepath)
continue

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist
# remove those keys that are not CSV headers
# move it so its one time
rmkeys = []
for dkeys in dictInfo.keys():
if dkeys not in headerlist:
rmkeys.append(dkeys)
rmkeys = list(set(rmkeys))

for k in rmkeys: dictInfo.pop(k,None)
# todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out
# we can scan filenames or config etc
#here, we will see if there are missing header values and compare with file attributes if slow option is turned on
if (slow == True) & (bool(dictInfo) == True) :
print("Slow option turned on.. lets open some files using xarray and lookup atts",filename)
#todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

listfiles.append(dictInfo)
return listfiles
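
The set arithmetic above decides which catalog headers can come from neither the path template nor the file template, and must therefore be looked up elsewhere (file attributes under `--slow`, or the mapping table otherwise). A standalone sketch with illustrative values:

```python
headerlist = ["experiment_id", "frequency", "variable_id", "standard_name", "path"]
set_ptemplate = {"experiment_id", "frequency"}  # from output_path_template
set_ftemplate = {"variable_id"}                 # from output_file_template

# Headers satisfied by neither template are "missing" from metadata sources.
diffcols = [x for x in headerlist if x not in set_ptemplate]
missingcols = [col for col in diffcols if col not in set_ftemplate]
missingcols.remove("path")  # the crawler always records the path itself
print(missingcols)          # ['standard_name']
```
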
catalogbuilder/scripts/gen_intake_gfdl.py (33 changes: 26 additions & 7 deletions)
@@ -1,7 +1,7 @@
#!/usr/bin/env python

import json
import sys
import sys,pandas as pd
import click
import os
from pathlib import Path
@@ -11,8 +11,7 @@
logger.setLevel(logging.INFO)

try:
#from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser, getinfo
except ModuleNotFoundError:
print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
print("Attempting again with adjusted sys.path ")
@@ -22,7 +21,8 @@
print("Unable to adjust sys.path")
#print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
try:
from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser,getinfo
print(gfdlcrawler.__file__)
except ModuleNotFoundError:
sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")

@@ -42,8 +42,9 @@
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False):
overwrite=False, append=False, slow = False):

configyaml = None
# TODO error catching
@@ -89,7 +90,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml,slow)
#Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON
with open(template_path, "r") as jsonTemplate:
data = json.load(jsonTemplate)
@@ -103,7 +104,25 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
# so we check if it's a directory first
if os.path.isdir(os.path.dirname(csv_path)):
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append)
CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append,slow)
df = None
if(slow == False) & ('standard_name' in headers ):
#If standard_name is still needed and the slow option was not chosen, fall back to the GFDL-CMIP mapping tables. Useful for MDTF runs
df = pd.read_csv(os.path.abspath(csv_path), sep=",", header=0,index_col=False)
list_variable_id = []
list_variable_id = df["variable_id"].tolist()
dictVarCF = getinfo.getStandardName(list_variable_id)
#print("standard name from look-up table-", dictVarCF)
for k, v in dictVarCF.items():
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
df.to_csv(csvfile)

print("JSON generated at:", os.path.abspath(json_path))
print("CSV generated at:", os.path.abspath(csv_path))
logger.info("CSV generated at" + os.path.abspath(csv_path))
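
One pandas note: `df['standard_name'].loc[(df['variable_id'] == k)] = v` is chained indexing, which newer pandas flags with a FutureWarning (perhaps why `getinfo.py` now suppresses that category). A single `.loc` assignment is the usual equivalent; a sketch with made-up rows:

```python
import pandas as pd

df = pd.DataFrame({"variable_id": ["tas", "zg"],
                   "standard_name": ["na", "na"]})
dictVarCF = {"tas": "air_temperature", "zg": "geopotential_height"}

# Single-step .loc assignment avoids the chained-indexing warning.
for k, v in dictVarCF.items():
    df.loc[df["variable_id"] == k, "standard_name"] = v
```

Behavior is otherwise driven by the new `--slow`/`-s` flag: without it, `standard_name` is backfilled from the lookup table after the CSV is written; with it, the crawler reads each file's attributes during the walk.
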
catalogbuilder/tests/config-cfname.yaml (41 changes: 41 additions & 0 deletions)
@@ -0,0 +1,41 @@

#catalog headers
#The headerlist is the expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
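
Once the builder has run with this config, the generated catalog can be opened to confirm the new column is populated (a sketch; assumes intake-esm is installed and that the JSON landed at the `output_path` prefix above):

```python
import intake  # intake-esm provides the esm_datastore driver

cat = intake.open_esm_datastore(
    "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip.json")
print(cat.df[["variable_id", "standard_name"]].head())
```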