From 9cd320ef17ad1da93c908a8f3e36ac2f34d7049b Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Tue, 30 Jul 2024 17:12:04 -0400 Subject: [PATCH 01/12] rm hardcode filenmae parsing --- catalogbuilder/intakebuilder/getinfo.py | 48 ++++++++++++--------- catalogbuilder/intakebuilder/gfdlcrawler.py | 2 +- catalogbuilder/scripts/gen_intake_gfdl.py | 2 +- catalogbuilder/tests/config-cfname.yaml | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 628193d..9421125 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -81,29 +81,35 @@ def getInfoFromFilename(filename,dictInfo,logger): return dictInfo #adding this back to trace back some old errors -def getInfoFromGFDLFilename(filename,dictInfo,logger): +def getInfoFromGFDLFilename(filename,dictInfo,logger,configyaml): # 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc - if(filename.endswith(".nc")): #and not filename.startswith(".")): - ncfilename = filename.split(".") - varname = ncfilename[-2] - dictInfo["variable_id"] = varname - #miptable = "" #ncfilename[1] - #dictInfo["mip_table"] = miptable - #modelname = ncfilename[2] - #dictInfo["model"] = modelname - #expname = ncfilename[3] - #dictInfo["experiment_id"] = expname - #ens = ncfilename[4] - #dictInfo["ensemble_member"] = ens - #grid = ncfilename[5] - #dictInfo["grid_label"] = grid - try: - tsubset = ncfilename[1] - except IndexError: - tsubset = "null" #For fx fields - dictInfo["temporal_subset"] = tsubset + if ( (filename.endswith(".nc"))): # & ("static" not in filename)) ): + stemdir = filename.split(".") + #lets go backwards and match given input directory to the template, add things to dictInfo + j = -2 + cnt = 1 #'variable_id': 'static', 'time_range': 'land'} + if configyaml: + output_file_template = configyaml.output_file_template else: - logger.debug("Filename not compatible with this version of the builder:"+filename) + try: + output_file_template = builderconfig.output_file_template + except: + sys.exit("No output_path_template found. Check configuration.") + #output_file_template.reverse() + nlen = len(output_file_template) + for i in range(nlen-1,-1,-1): #nlen = 3 + try: + if(output_file_template[i] != "NA"): + try: + #print(output_file_template[i], "=" , stemdir[(j)]) + dictInfo[output_file_template[i]] = stemdir[(j)] + except IndexError: + #print("Check configuration. Is output file template set correctly?") + dictInfo[output_file_template[i]] = "" + except IndexError: + sys.exit("oops in getInfoFromGFDLFilename"+str(i)+str(j)+output_file_template[i]+stemdir[j]) + j = j - 1 + cnt = cnt + 1 return dictInfo def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml): diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 79164ad..7543b20 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if (op.countOf(filename,".") == 1): dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger) else: - dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger) + dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml) dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml) list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"] list_bad_chunklabel = ['DO_NOT_USE'] diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index b744f0d..1552bbb 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -117,7 +117,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt #if(df['variable_id'].eq(k)).any(): df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v - + print(headers) if(slow == False) & ('standard_name' in headers): if ((df is not None) & (len(df) != 0) ): with open(csv_path, 'w') as csvfile: diff --git a/catalogbuilder/tests/config-cfname.yaml b/catalogbuilder/tests/config-cfname.yaml index 21d8ceb..06d3f46 100644 --- a/catalogbuilder/tests/config-cfname.yaml +++ b/catalogbuilder/tests/config-cfname.yaml @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id'] ####################################################### input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/" -output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) +output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) From b58219790423c8867081a0faf7029ba1b91925ea Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Tue, 30 Jul 2024 17:44:49 -0400 Subject: [PATCH 02/12] bug fix cf name addition --- catalogbuilder/intakebuilder/getinfo.py | 6 ++++-- catalogbuilder/scripts/gen_intake_gfdl.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 9421125..2191b85 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -245,12 +245,14 @@ def getStandardName(list_variable_id): #search for variable and its cf name for variable_id in list_variable_id: cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"]) + #print(cfname,variable_id) list_cfname = cfname.tolist() - if not list_cfname: + if(len(list_cfname) == 0): #print("what if the names correspond to CMOR_varname") cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"]) list_cfname = cfname.tolist() + #print(list_cfname) if len(list_cfname) > 0: unique_cf = list(set(list_cfname))[0] - dictCF[variable_id] = unique_cf + dictCF[variable_id] = unique_cf return (dictCF) diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 1552bbb..b784610 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -117,7 +117,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt #if(df['variable_id'].eq(k)).any(): df['standard_name'].loc[(df['variable_id'] == k)] = v #df['standard_name'] = v - print(headers) if(slow == False) & ('standard_name' in headers): if ((df is not None) & (len(df) != 0) ): with open(csv_path, 'w') as csvfile: From 15823b617fe2080b5be95251d0c39a1f5789d5cd Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Tue, 30 Jul 2024 18:26:42 -0400 Subject: [PATCH 03/12] gfdl to cmip freq --- catalogbuilder/intakebuilder/getinfo.py | 13 +++++++++++++ catalogbuilder/intakebuilder/gfdlcrawler.py | 11 ++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 2191b85..1a207d1 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None): dictInfo["realm"] = "NA" return(dictInfo) +def getFreqFromYAML(yamlfile,gfdlfreq=None): + #returns cmip freq for gfdl pp freq + import yaml + cmipfreq = None + with open(yamlfile) as f: + mappings = yaml.load(f, Loader=yaml.FullLoader) + if(gfdlfreq): + try: + cmipfreq = mappings[gfdlfreq]["frequency"] + except KeyError: + cmipfreq = None + return(cmipfreq) + def getStem(dirpath,projectdir): ''' return stem from the project directory passed and the files crawled within diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index 7543b20..cc1bdad 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if "standard_name" in missingcols: dictInfo["standard_name"] = "na" getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo) - + #replace frequency as needed + if 'frequency' in dictInfo.keys(): + package_dir = os.path.dirname(os.path.abspath(__file__)) + yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml') + cmipfreq = None + gfdlfreq = dictInfo['frequency'] + cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency']) + if(cmipfreq is not None): + dictInfo['frequency'] = cmipfreq + #print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq) listfiles.append(dictInfo) return listfiles From f739c4a5bb3346d08939f6182c9edb53c14364ea Mon Sep 17 00:00:00 2001 From: aradhakrishnanGFDL Date: Tue, 30 Jul 2024 18:31:26 -0400 Subject: [PATCH 04/12] push dat file --- .../intakebuilder/dat/gfdlcmipfreq.json | 20 +++++++++++++++++++ .../intakebuilder/dat/gfdlcmipfreq.yaml | 10 ++++++++++ 2 files changed, 30 insertions(+) create mode 100644 catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json create mode 100644 catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml diff --git a/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json new file mode 100644 index 0000000..4dcf599 --- /dev/null +++ b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json @@ -0,0 +1,20 @@ +{ + "frequency":{ + "1hr":"1hr", + "1hrCM":"1hrCM", + "1hrPt":"1hrPt", + "3hr":"3hr", + "3hrPt":"3hrPt", + "6hr":"6hr", + "6hrPt":"6hrPt", + "daily":"day", + "dec":"dec", + "fx":"fx", + "monthly":"mon" + "monC":"monC", + "monPt":"monPt", + "subhrPt":"subhrPt", + "yr":"yr", + "yrPt":"yrPt" + } +} diff --git a/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml new file mode 100644 index 0000000..240c103 --- /dev/null +++ b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml @@ -0,0 +1,10 @@ +monthly: + frequency: mon +daily: + frequency: day +hourly: + frequency: 1hr +annual: + frequency: yearly +3hr: + frequency: 3hr From 30b060fcbd725b882112094f003b85f914297ef9 Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:30:57 -0400 Subject: [PATCH 05/12] Delete catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json --- .../intakebuilder/dat/gfdlcmipfreq.json | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json diff --git a/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json b/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json deleted file mode 100644 index 4dcf599..0000000 --- a/catalogbuilder/intakebuilder/dat/gfdlcmipfreq.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "frequency":{ - "1hr":"1hr", - "1hrCM":"1hrCM", - "1hrPt":"1hrPt", - "3hr":"3hr", - "3hrPt":"3hrPt", - "6hr":"6hr", - "6hrPt":"6hrPt", - "daily":"day", - "dec":"dec", - "fx":"fx", - "monthly":"mon" - "monC":"monC", - "monPt":"monPt", - "subhrPt":"subhrPt", - "yr":"yr", - "yrPt":"yrPt" - } -} From df7873813c128b3be9485b86d48e9219bf3de07a Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:33:52 -0400 Subject: [PATCH 06/12] Update MANIFEST.in --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 01fba10..e42ebf6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ recursive-include catalogbuilder/cats * - +recursive-include catalogbuilder/intakebuilder/dat * From 3ac409869d94d2a059d67eaa2f255ccbd4d8cc4d Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:43:12 -0400 Subject: [PATCH 07/12] Update conda-env-create-run-pytest.yml --- .github/workflows/conda-env-create-run-pytest.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-env-create-run-pytest.yml b/.github/workflows/conda-env-create-run-pytest.yml index f1714f7..038e2a1 100644 --- a/.github/workflows/conda-env-create-run-pytest.yml +++ b/.github/workflows/conda-env-create-run-pytest.yml @@ -57,11 +57,8 @@ jobs: with: name: workflow-artifacts1 path: | - gfdl_autotest.csv - gfdl_autotest.json - cats/gfdl_autotest_from_yaml.csv - cats/gfdl_autotest_from_yaml.json - + catalogbuilder/cats/gfdl_autotest_from_yaml.json + catalogbuilder/cats/gfdl_autotest_from_yaml.csv - name: Download all workflow run artifacts uses: actions/download-artifact@v4 From c31147a411f14948d52dc277ff6bb80b8506521d Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:49:33 -0400 Subject: [PATCH 08/12] Update test_ci_dynamic.py --- catalogbuilder/tests/test_ci_dynamic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/tests/test_ci_dynamic.py b/catalogbuilder/tests/test_ci_dynamic.py index 9ba2a70..1d46ae8 100644 --- a/catalogbuilder/tests/test_ci_dynamic.py +++ b/catalogbuilder/tests/test_ci_dynamic.py @@ -32,7 +32,7 @@ def test_loadcat(): #todo check if its readable etc #we are using the dynamically generated csv and json for testing in this routine #leveraging GitHub actions CI workflow and manifests and caches - catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest.json' + catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/catalogbuilder/cats/gfdl_autotest_from_yaml.json' cat = load_cat((str(catspec))) try: assert isinstance(cat.df, pd.DataFrame),"test failed" From 04bb2e2d0647ba8f0c9aba014969e4fb0ad2985b Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:49:53 -0400 Subject: [PATCH 09/12] Update builderconfig.py --- catalogbuilder/intakebuilder/builderconfig.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/intakebuilder/builderconfig.py b/catalogbuilder/intakebuilder/builderconfig.py index 2eb95ef..97d5cd1 100644 --- a/catalogbuilder/intakebuilder/builderconfig.py +++ b/catalogbuilder/intakebuilder/builderconfig.py @@ -15,7 +15,7 @@ headerlist = ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "realm", "table_id", "member_id", "grid_label", "variable_id", - "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] + "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] #what kind of directory structure to expect? #For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp From e32831f19bf3430616bac68ac087963283de524b Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:50:54 -0400 Subject: [PATCH 10/12] Update subdirs.py --- catalogbuilder/tests/subdirs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/tests/subdirs.py b/catalogbuilder/tests/subdirs.py index bd40b00..b833bb2 100644 --- a/catalogbuilder/tests/subdirs.py +++ b/catalogbuilder/tests/subdirs.py @@ -15,6 +15,6 @@ 'uas' ] time = [ -'000101-000112' +'000101-000112', '000201-000212' ] From 7ef2d8d4cd39aa45f5250ca59b854f8cb8432e54 Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 21:59:01 -0400 Subject: [PATCH 11/12] Update test_ci_dynamic.py --- catalogbuilder/tests/test_ci_dynamic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/tests/test_ci_dynamic.py b/catalogbuilder/tests/test_ci_dynamic.py index 1d46ae8..681fa3b 100644 --- a/catalogbuilder/tests/test_ci_dynamic.py +++ b/catalogbuilder/tests/test_ci_dynamic.py @@ -32,7 +32,7 @@ def test_loadcat(): #todo check if its readable etc #we are using the dynamically generated csv and json for testing in this routine #leveraging GitHub actions CI workflow and manifests and caches - catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/catalogbuilder/cats/gfdl_autotest_from_yaml.json' + catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest_from_yaml.json' cat = load_cat((str(catspec))) try: assert isinstance(cat.df, pd.DataFrame),"test failed" From 54f76e8c2b03e145f248b49978618765be797a2a Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Tue, 30 Jul 2024 22:03:56 -0400 Subject: [PATCH 12/12] Update test_config.yaml --- catalogbuilder/tests/test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/catalogbuilder/tests/test_config.yaml b/catalogbuilder/tests/test_config.yaml index 9d9bbd1..41c1288 100644 --- a/catalogbuilder/tests/test_config.yaml +++ b/catalogbuilder/tests/test_config.yaml @@ -15,7 +15,7 @@ headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "realm", "table_id", "member_id", "grid_label", "variable_id", - "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] + "time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","standard_name","path"] #what kind of directory structure to expect? #For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp