Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mdtf support #25

Merged
merged 12 commits into from
Aug 1, 2024
7 changes: 2 additions & 5 deletions .github/workflows/conda-env-create-run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,8 @@ jobs:
with:
name: workflow-artifacts1
path: |
gfdl_autotest.csv
gfdl_autotest.json
cats/gfdl_autotest_from_yaml.csv
cats/gfdl_autotest_from_yaml.json

catalogbuilder/cats/gfdl_autotest_from_yaml.json
catalogbuilder/cats/gfdl_autotest_from_yaml.csv
- name: Download all workflow run artifacts
uses: actions/download-artifact@v4

Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
recursive-include catalogbuilder/cats *

recursive-include catalogbuilder/intakebuilder/dat *
2 changes: 1 addition & 1 deletion catalogbuilder/intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down
10 changes: 10 additions & 0 deletions catalogbuilder/intakebuilder/dat/gfdlcmipfreq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
monthly:
frequency: mon
daily:
frequency: day
hourly:
frequency: 1hr
annual:
  frequency: yr
3hr:
frequency: 3hr
67 changes: 44 additions & 23 deletions catalogbuilder/intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ def getinfoFromYAML(dictInfo,yamlfile,miptable=None):
dictInfo["realm"] = "NA"
return(dictInfo)

def getFreqFromYAML(yamlfile, gfdlfreq=None):
    """Return the CMIP frequency string for a GFDL post-processing frequency.

    Parameters
    ----------
    yamlfile : str
        Path to a YAML mapping of GFDL frequency labels to
        ``{"frequency": <cmip freq>}`` entries (e.g. dat/gfdlcmipfreq.yaml).
    gfdlfreq : str, optional
        GFDL frequency label (e.g. "monthly"). When falsy, no lookup is
        attempted.

    Returns
    -------
    str or None
        The mapped CMIP frequency, or None when no mapping exists.
    """
    import yaml
    cmipfreq = None
    with open(yamlfile) as f:
        # safe_load is sufficient for this plain-data mapping and avoids
        # constructing arbitrary Python objects from the file.
        mappings = yaml.safe_load(f)
    if gfdlfreq:
        try:
            cmipfreq = mappings[gfdlfreq]["frequency"]
        except (KeyError, TypeError):
            # KeyError: unknown GFDL frequency label.
            # TypeError: empty/malformed YAML (safe_load returned None).
            cmipfreq = None
    return cmipfreq

def getStem(dirpath,projectdir):
'''
return stem from the project directory passed and the files crawled within
Expand Down Expand Up @@ -81,29 +94,35 @@ def getInfoFromFilename(filename,dictInfo,logger):
return dictInfo

#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename, dictInfo, logger, configyaml):
    """Populate dictInfo from a GFDL post-processing netCDF filename.

    The dot-separated fields of a name such as
    ``atmos.200501-200912.t_ref.nc`` are matched right-to-left against the
    configured ``output_file_template`` (e.g.
    ``['realm', 'time_range', 'variable_id']``).

    Parameters
    ----------
    filename : str
        Basename of the crawled file; only ".nc" files are parsed.
    dictInfo : dict
        Accumulated catalog metadata for this file; updated in place.
    logger : logging.Logger
        Logger used for debug diagnostics.
    configyaml : object or None
        Parsed configuration exposing ``output_file_template``; when None
        we fall back to the builderconfig module defaults.

    Returns
    -------
    dict
        The (possibly updated) dictInfo.
    """
    if filename.endswith(".nc"):
        stemdir = filename.split(".")
        # Walk template and filename fields backwards: stemdir[-1] is the
        # "nc" suffix, so the last template field lives at stemdir[-2].
        j = -2
        if configyaml:
            output_file_template = configyaml.output_file_template
        else:
            logger.debug("Filename not compatible with this version of the builder:" + filename)
            try:
                output_file_template = builderconfig.output_file_template
            except (AttributeError, NameError):
                # Narrowed from a bare except: only the missing-config
                # cases should abort, not arbitrary errors.
                sys.exit("No output_path_template found. Check configuration.")
        for i in range(len(output_file_template) - 1, -1, -1):
            # "NA" marks a template slot that should not be harvested from
            # the filename; the matching filename field is still consumed.
            if output_file_template[i] != "NA":
                try:
                    dictInfo[output_file_template[i]] = stemdir[j]
                except IndexError:
                    # Filename has fewer dot-separated fields than the
                    # template; record an empty value rather than abort.
                    dictInfo[output_file_template[i]] = ""
            j = j - 1
    return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo,configyaml):
Expand Down Expand Up @@ -239,12 +258,14 @@ def getStandardName(list_variable_id):
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
#print(cfname,variable_id)
list_cfname = cfname.tolist()
if not list_cfname:
if(len(list_cfname) == 0):
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
#print(list_cfname)
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
dictCF[variable_id] = unique_cf
return (dictCF)
13 changes: 11 additions & 2 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if (op.countOf(filename,".") == 1):
dictInfo = getinfo.getInfoFromFilename(filename,dictInfo, logger)
else:
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger)
dictInfo = getinfo.getInfoFromGFDLFilename(filename,dictInfo, logger,configyaml)
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo,configyaml)
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
Expand Down Expand Up @@ -106,6 +106,15 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

#replace frequency as needed
if 'frequency' in dictInfo.keys():
package_dir = os.path.dirname(os.path.abspath(__file__))
yamlfile = os.path.join(package_dir, 'dat/gfdlcmipfreq.yaml')
cmipfreq = None
gfdlfreq = dictInfo['frequency']
cmipfreq = getinfo.getFreqFromYAML(yamlfile,gfdlfreq=dictInfo['frequency'])
if(cmipfreq is not None):
dictInfo['frequency'] = cmipfreq
#print("Adjusting frequency from ", gfdlfreq ," to ",cmipfreq)
listfiles.append(dictInfo)
return listfiles
1 change: 0 additions & 1 deletion catalogbuilder/scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
Expand Down
2 changes: 1 addition & 1 deletion catalogbuilder/tests/config-cfname.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ output_file_template: ['realm','time_range','variable_id']
#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
2 changes: 1 addition & 1 deletion catalogbuilder/tests/subdirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
'uas'
]
time = [
'000101-000112'
'000101-000112',
'000201-000212'
]
2 changes: 1 addition & 1 deletion catalogbuilder/tests/test_ci_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_loadcat():
#todo check if its readable etc
#we are using the dynamically generated csv and json for testing in this routine
#leveraging GitHub actions CI workflow and manifests and caches
catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest.json'
catspec = pathlib.Path(os.path.dirname(__file__)).parent / '../workflow-artifacts1/gfdl_autotest_from_yaml.json'
cat = load_cat((str(catspec)))
try:
assert isinstance(cat.df, pd.DataFrame),"test failed"
Expand Down
2 changes: 1 addition & 1 deletion catalogbuilder/tests/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
"time_range", "chunk_freq","grid_label","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
Expand Down