Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mdtf support #25

Merged
merged 12 commits into from
Aug 1, 2024
7 changes: 2 additions & 5 deletions .github/workflows/conda-env-create-run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,8 @@ jobs:
with:
name: workflow-artifacts1
path: |
gfdl_autotest.csv
gfdl_autotest.json
cats/gfdl_autotest_from_yaml.csv
cats/gfdl_autotest_from_yaml.json

catalogbuilder/cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.csv
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4

Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
recursive-include catalogbuilder/cats *

recursive-include catalogbuilder/intakebuilder/dat *
2 changes: 1 addition & 1 deletion catalogbuilder/intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down
10 changes: 10 additions & 0 deletions catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
monthly:
frequency: mon
daily:
frequency: day
hourly:
frequency: 1hr
annual:
  frequency: yr
3hr:
frequency: 3hr
67 changes: 44 additions & 23 deletions catalogbuilder/intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
dictInfo["realm"] = "NA"
return(dictInfo)

def getFreqFromYAML(yamlfile, gfdlfreq=None):
    """Return the CMIP frequency string for a GFDL post-processing frequency.

    Parameters
    ----------
    yamlfile : str
        Path to a YAML mapping of GFDL frequency labels to
        ``{"frequency": <cmip freq>}`` entries (e.g. dat/gfdlcmipfreq.yaml).
    gfdlfreq : str, optional
        GFDL frequency label (e.g. "monthly"). When falsy, no lookup is
        attempted.

    Returns
    -------
    str or None
        The mapped CMIP frequency, or None when no mapping exists.
    """
    import yaml
    cmipfreq = None
    with open(yamlfile) as f:
        # safe_load is sufficient for this plain-data mapping and avoids
        # constructing arbitrary Python objects from the file.
        mappings = yaml.safe_load(f)
    if gfdlfreq:
        try:
            cmipfreq = mappings[gfdlfreq]["frequency"]
        except (KeyError, TypeError):
            # KeyError: unknown GFDL frequency label.
            # TypeError: empty/malformed YAML (safe_load returned None).
            cmipfreq = None
    return cmipfreq

def getStem(dirpath,projectdir):
'''
return stem from the project directory passed and the files crawled within
Expand Down Expand Up @@ -81,29 +94,35 @@ def getInfoFromFilename(filename,dictInfo,logger):
return dictInfo

#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename, dictInfo, logger, configyaml):
    """Populate dictInfo from a GFDL post-processing netCDF filename.

    The dot-separated fields of a name such as
    ``atmos.200501-200912.t_ref.nc`` are matched right-to-left against the
    configured ``output_file_template`` (e.g.
    ``['realm', 'time_range', 'variable_id']``).

    Parameters
    ----------
    filename : str
        Basename of the crawled file; only ".nc" files are parsed.
    dictInfo : dict
        Accumulated catalog metadata for this file; updated in place.
    logger : logging.Logger
        Logger used for debug diagnostics.
    configyaml : object or None
        Parsed configuration exposing ``output_file_template``; when None
        we fall back to the builderconfig module defaults.

    Returns
    -------
    dict
        The (possibly updated) dictInfo.
    """
    if filename.endswith(".nc"):
        stemdir = filename.split(".")
        # Walk template and filename fields backwards: stemdir[-1] is the
        # "nc" suffix, so the last template field lives at stemdir[-2].
        j = -2
        if configyaml:
            output_file_template = configyaml.output_file_template
        else:
            logger.debug("Filename not compatible with this version of the builder:" + filename)
            try:
                output_file_template = builderconfig.output_file_template
            except (AttributeError, NameError):
                # Narrowed from a bare except: only the missing-config
                # cases should abort, not arbitrary errors.
                sys.exit("No output_path_template found. Check configuration.")
        for i in range(len(output_file_template) - 1, -1, -1):
            # "NA" marks a template slot that should not be harvested from
            # the filename; the matching filename field is still consumed.
            if output_file_template[i] != "NA":
                try:
                    dictInfo[output_file_template[i]] = stemdir[j]
                except IndexError:
                    # Filename has fewer dot-separated fields than the
                    # template; record an empty value rather than abort.
                    dictInfo[output_file_template[i]] = ""
            j = j - 1
    return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
Expand Down Expand Up @@ -239,12 +258,14 @@ def getStandardName(list_variable_id):
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
#print(cfname,variable_id)
list_cfname = cfname.tolist()
if not list_cfname:
if(len(list_cfname) == 0):
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
#print(list_cfname)
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
dictCF[variable_id] = unique_cf
return (dictCF)
13 changes: 11 additions & 2 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if (op.countOf(filename,".") == 1):
dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger)
else:
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand Down Expand Up @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

#replace frequency as needed
if 'frequency' in dictInfo.keys():
package_dir = os.path.dirname(os.path.abspath(__file__))
yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml')
cmipfreq = None
gfdlfreq = dictInfo['frequency']
cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency'])
if(cmipfreq is not None):
dictInfo['frequency'] = cmipfreq
#print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq)
listfiles.append(dictInfo)
return listfiles
1 change: 0 additions & 1 deletion catalogbuilder/scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
Expand Down
2 changes: 1 addition & 1 deletion catalogbuilder/tests/config-cfname.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id']
#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
2 changes: 1 addition & 1 deletion catalogbuilder/tests/subdirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
'uas'
]
time = [
'000101-000112'
'000101-000112',
'000201-000212'
]
2 changes: 1 addition & 1 deletion catalogbuilder/tests/test_ci_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_loadcat():
#todo check if its readable etc
#we are using the dynamically generated csv and json for testing in this routine
#leveraging GitHub actions CI workflow and manifests and caches
catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest.json'
catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest_from_yaml.json'
cat = load_cat((str(catspec)))
try:
assert isinstance(cat.df, pd.DataFrame),"test failed"
Expand Down
2 changes: 1 addition & 1 deletion catalogbuilder/tests/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down