diff --git a/.github/workflows/conda-env-create-run-pytest.yml b/.github/workflows/conda-env-create-run-pytest.yml index f1714f7..038e2a1 100644 --- a/.github/workflows/conda-env-create-run-pytest.yml +++ b/.github/workflows/conda-env-create-run-pytest.yml @@ -57,11 +57,8 @@ jobs: with: name: workflow-artifacts1 path: | - gfdl_autotest.csv - gfdl_autotest.json - cats/gfdl_autotest_from_yaml.csv - cats/gfdl_autotest_from_yaml.json - + catalogbuilder/cats/gfdl_autotest_from_yaml.json + catalogbuilder/cats/gfdl_autotest_from_yaml.csv - name: Download all workflow run artifacts uses: actions/download-artifact@v4 diff --git a/MANIFEST.in b/MANIFEST.in index 01fba10..e42ebf6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ recursive-include catalogbuilder/cats * - +recursive-include catalogbuilder/intakebuilder/dat * diff --git a/catalogbuilder/intakebuilder/builderconfig.py b/catalogbuilder/intakebuilder/builderconfig.py index 2eb95ef..97d5cd1 100644 --- a/catalogbuilder/intakebuilder/builderconfig.py +++ b/catalogbuilder/intakebuilder/builderconfig.py @@ -15,7 +15,7 @@ headerlist = ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "realm", "table_id", "member_id", "grid_label", "variable_id", - "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] + "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] #what kind of directory structure to expect? #For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp diff --git a/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml new file mode 100644 index 0000000..240c103 --- /dev/null +++ b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml @@ -0,0 +1,10 @@ +monthly: + frequency: mon +daily: + frequency: day +hourly: + frequency: 1hr +annual: + frequency: yearly +3hr: + frequency: 3hr diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 628193d..1a207d1 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None): dictInfo["realm"] = "NA" return(dictInfo) +def getFreqFromYAML(yamlfile,gfdlfreq=None): + #returns cmip freq for gfdl pp freq + import yaml + cmipfreq = None + with open(yamlfile) as f: + mappings = yaml.load(f, Loader=yaml.FullLoader) + if(gfdlfreq): + try: + cmipfreq = mappings[gfdlfreq]["frequency"] + except KeyError: + cmipfreq = None + return(cmipfreq) + def getStem(dirpath,projectdir): ''' return stem from the project directory passed and the files crawled within @@ -81,29 +94,35 @@ def getInfoFromFilename(filename,dictInfo,logger): return dictInfo #adding this back to trace back some old errors -def getInfoFromGFDLFilename(filename,dictInfo,logger): +def getInfoFromGFDLFilename(filename,dictInfo,logger,configyaml): # 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc - if(filename.endswith(".nc")): #and not filename.startswith(".")): - ncfilename = filename.split(".") - varname = ncfilename[-2] - dictInfo["variable_id"] = varname - #miptable = "" #ncfilename[1] - #dictInfo["mip_table"] = miptable - #modelname = ncfilename[2] - #dictInfo["model"] = modelname - #expname = ncfilename[3] - #dictInfo["experiment_id"] = expname - #ens = ncfilename[4] - #dictInfo["ensemble_member"] = ens - #grid = ncfilename[5] - #dictInfo["grid_label"] = grid - try: - tsubset = ncfilename[1] - except IndexError: - tsubset = "null" #For fx fields - dictInfo["temporal_subset"] = tsubset + if ( (filename.endswith(".nc"))): # & ("static" not in filename)) ): + stemdir = filename.split(".") + #lets go backwards and match given input directory to the template, add things to dictInfo + j = -2 + cnt = 1 #'variable_id': 'static', 'time_range': 'land'} + if configyaml: + output_file_template = configyaml.output_file_template else: - logger.debug("Filename not compatible with this version of the builder:"+filename) + try: + output_file_template = builderconfig.output_file_template + except: + sys.exit("No output_path_template found. Check configuration.") + #output_file_template.reverse() + nlen = len(output_file_template) + for i in range(nlen-1,-1,-1): #nlen = 3 + try: + if(output_file_template[i] != "NA"): + try: + #print(output_file_template[i], "=" , stemdir[(j)]) + dictInfo[output_file_template[i]] = stemdir[(j)] + except IndexError: + #print("Check configuration. Is output file template set correctly?") + dictInfo[output_file_template[i]] = "" + except IndexError: + sys.exit("oops in getInfoFromGFDLFilename"+str(i)+str(j)+output_file_template[i]+stemdir[j]) + j = j - 1 + cnt = cnt + 1 return dictInfo def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml): @@ -239,12 +258,14 @@ def getStandardName(list_variable_id): #search for variable and its cf name for variable_id in list_variable_id: cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) + #print(cfname,variable_id) list_cfname = cfname.tolist() - if not list_cfname: + if(len(list_cfname) == 0): #print("what if the names correspond to CMOR_varname") cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) list_cfname = cfname.tolist() + #print(list_cfname) if len(list_cfname) > 0: unique_cf = list(set(list_cfname))[0] - dictCF[variable_id] = unique_cf + dictCF[variable_id] = unique_cf return (dictCF) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 79164ad..cc1bdad 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if (op.countOf(filename,".") == 1): dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger) else: - dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger) + dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml) dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml) list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"] list_bad_chunklabel = ['DO_NOT_USE'] @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if "standard_name" in missingcols: dictInfo["standard_name"] = "na" getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo) - + #replace frequency as needed + if 'frequency' in dictInfo.keys(): + package_dir = os.path.dirname(os.path.abspath(__file__)) + yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml') + cmipfreq = None + gfdlfreq = dictInfo['frequency'] + cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency']) + if(cmipfreq is not None): + dictInfo['frequency'] = cmipfreq + #print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq) listfiles.append(dictInfo) return listfiles diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index b744f0d..b784610 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -117,7 +117,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt #if(df['variable_id'].eq(k)).any(): df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v - if(slow == False) & ('standard_name' in headers): if ((df is not None) & (len(df) != 0) ): with open(csv_path, 'w') as csvfile: diff --git a/catalogbuilder/tests/config-cfname.yaml b/catalogbuilder/tests/config-cfname.yaml index 21d8ceb..06d3f46 100644 --- a/catalogbuilder/tests/config-cfname.yaml +++ b/catalogbuilder/tests/config-cfname.yaml @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id'] ####################################################### input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/" -output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) +output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) diff --git a/catalogbuilder/tests/subdirs.py b/catalogbuilder/tests/subdirs.py index bd40b00..b833bb2 100644 --- a/catalogbuilder/tests/subdirs.py +++ b/catalogbuilder/tests/subdirs.py @@ -15,6 +15,6 @@ 'uas' ] time = [ -'000101-000112' +'000101-000112', '000201-000212' ] diff --git a/catalogbuilder/tests/test_ci_dynamic.py b/catalogbuilder/tests/test_ci_dynamic.py index 9ba2a70..681fa3b 100644 --- a/catalogbuilder/tests/test_ci_dynamic.py +++ b/catalogbuilder/tests/test_ci_dynamic.py @@ -32,7 +32,7 @@ def test_loadcat(): #todo check if its readable etc #we are using the dynamically generated csv and json for testing in this routine #leveraging GitHub actions CI workflow and manifests and caches - catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest.json' + catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest_from_yaml.json' cat = load_cat((str(catspec))) try: assert isinstance(cat.df, pd.DataFrame),"test failed" diff --git a/catalogbuilder/tests/test_config.yaml b/catalogbuilder/tests/test_config.yaml index 9d9bbd1..41c1288 100644 --- a/catalogbuilder/tests/test_config.yaml +++ b/catalogbuilder/tests/test_config.yaml @@ -15,7 +15,7 @@ headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "realm", "table_id", "member_id", "grid_label", "variable_id", - "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] + "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","standard_name","path"] #what kind of directory structure to expect? #For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp