add standard_name to catalog #19

Merged · 16 commits · Jul 29, 2024
README.md (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@

Cite our work: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5196586.svg)](https://doi.org/10.5281/zenodo.10787602)

See our [project documentation site ](https://aradhakrishnangfdl.github.io/CatalogBuilder/).
See our [project documentation site ](https://noaa-gfdl.github.io/CatalogBuilder/).


This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
catalogbuilder/intakebuilder/CSVwriter.py (4 changes: 3 additions & 1 deletion)
@@ -1,5 +1,6 @@
import os.path
import csv
import pandas as pd
from csv import writer
#from intakebuilder import builderconfig, configparser
from . import builderconfig, configparser
@@ -40,7 +41,7 @@ def file_appender(dictinputs, csvfile):
# add contents of list as last row in the csv file
csv_writer.writerow(dictinputs)

def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append):
def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append,slow):
try:
#Open the CSV file in write mode and add any data with at least 3 values associated with it
if overwrite:
@@ -95,5 +96,6 @@ def listdict_to_csv(dict_info,headerlist, csvfile, overwrite, append):
for data in dict_info:
if len(data.keys()) > 2:
writer.writerow(data)

except IOError:
print("I/O error")
catalogbuilder/intakebuilder/getinfo.py (47 changes: 45 additions & 2 deletions)
@@ -6,7 +6,8 @@
import xarray as xr
#from intakebuilder import builderconfig, configparser
from . import builderconfig, configparser

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

'''
getinfo.py provides helper functions to get information (from filename, DRS, file/global attributes) needed to populate the catalog
@@ -178,7 +179,25 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo):
def return_xr(fname):
filexr = (xr.open_dataset(fname))
filexra = filexr.attrs
return filexra
return filexr,filexra
def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=None):
'''
Returns info from the filename and xarray dataset object
:param fname: filename
:param filexr: Xarray dataset object
:return: dictInfo with all variable atts
'''
#try:
filexr,filexra = return_xr(fname)
#print("Variable atts from file:",filexr[variable_id])
if (dictInfo[att] == "na"):
try:
cfname = filexr[variable_id].attrs["standard_name"]
except KeyError:
cfname = "NA"
dictInfo["standard_name"] = cfname
print("standard_name found",dictInfo["standard_name"])
return dictInfo
def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
'''
Returns info from the filename and xarray dataset object
@@ -205,3 +224,27 @@ def getInfoFromGlobalAtts(fname,dictInfo,filexra=None):
dictInfo["frequency"] = frequency
return dictInfo

def getStandardName(list_variable_id):
'''
Returns dict standard name for the variable in question
'''
unique_cf = "na"
dictCF = {}
try:
url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv"
df = pd.read_csv(url, sep=",", header=0,index_col=False)
except IOError:
print("Unable to open file")
sys.exit(1)
#search for variable and its cf name
for variable_id in list_variable_id:
cfname = (df[df['GFDL_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
if not list_cfname:
#print("what if the names correspond to CMOR_varname")
cfname = (df[df['CMOR_varname'] == variable_id]["standard_name"])
list_cfname = cfname.tolist()
if len(list_cfname) > 0:
unique_cf = list(set(list_cfname))[0]
dictCF[variable_id] = unique_cf
return (dictCF)
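
For reviewers trying the two new lookups by hand, a sketch of how they compose (the file path and variable names are illustrative; `getStandardName` assumes pandas and `sys` are imported at the top of `getinfo.py`, which this hunk does not show, and it needs network access to fetch the MDTF mapping table):

```python
from catalogbuilder.intakebuilder import getinfo

# Fast path: map GFDL/CMOR variable names to CF standard names via the lookup CSV.
dictCF = getinfo.getStandardName(["tas", "zg"])
# e.g. {"tas": "air_temperature", "zg": "geopotential_height"} when both are in the table

# Slow path: open the file with xarray and read the variable's own attribute.
dictInfo = {"standard_name": "na"}  # "na" marks the column as still unresolved
dictInfo = getinfo.getInfoFromVarAtts("/archive/example/zg.nc", "zg", dictInfo)
print(dictInfo["standard_name"])    # attribute value, or "NA" if the file lacks one
```
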
catalogbuilder/intakebuilder/gfdlcrawler.py (50 changes: 41 additions & 9 deletions)
@@ -7,11 +7,10 @@
'''
localcrawler crawls through the local file path, then calls helper functions in the package to getinfo.
It finally returns a list of dict. eg {'project': 'CMIP6', 'path': '/uda/CMIP6/CDRMIP/NCC/NorESM2-LM/esm-pi-cdr-pulse/r1i1p1f1/Emon/zg/gn/v20191108/zg_Emon_NorESM2-LM_esm-pi-cdr-pulse_r1i1p1f1_gn_192001-192912.nc', 'variable': 'zg', 'mip_table': 'Emon', 'model': 'NorESM2-LM', 'experiment_id': 'esm-pi-cdr-pulse', 'ensemble_member': 'r1i1p1f1', 'grid_label': 'gn', 'temporal subset': '192001-192912', 'institute': 'NCC', 'version': 'v20191108'}

'''
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
'''
Craw through the local directory and run through the getInfo.. functions
crawl through the local directory and run through the getInfo.. functions
:param projectdir:
:return:listfiles which has a dictionary of all key/value pairs for each file to be added to the csv
'''
@@ -22,6 +21,36 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):

orig_pat = pat

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist

#For those columns that we cannot find in output path template or output file template from config yaml, we have hooks
#now to look up the netcdf dataset if slow option is True
#todo catch exceptions upon further testing
list_ptemplate = []
list_ftemplate = []
set_ptemplate = set()
set_ftemplate = set()

if( configyaml is not None):
if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) :
list_ptemplate = configyaml.output_path_template
list_ftemplate = configyaml.output_file_template
set_ptemplate = set(list_ptemplate)
set_ftemplate = set(list_ftemplate)
#print(headerlist)
#print(list_ptemplate)
#print(list_ftemplate)
if (len(set_ptemplate) > 0):
diffcols = [x for x in headerlist if x not in set_ptemplate]
if ( len(set_ftemplate) > 0 ):
missingcols = [col for col in diffcols if col not in set_ftemplate]
missingcols.remove("path") #because we get this anyway
print("Missing cols from metadata sources:", missingcols)


#TODO INCLUDE filter in traversing through directories at the top
for dirpath, dirs, files in os.walk(projectdir):
searchpath = dirpath
@@ -60,20 +89,23 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml):
if(dictInfo["chunk_freq"] in list_bad_chunklabel):
logger.debug("Found bad chunk, skipping this possibly bad DRS filename",filepath)
continue

if configyaml:
headerlist = configyaml.headerlist
else:
headerlist = builderconfig.headerlist
# remove those keys that are not CSV headers
# move it so its one time
rmkeys = []
for dkeys in dictInfo.keys():
if dkeys not in headerlist:
rmkeys.append(dkeys)
rmkeys = list(set(rmkeys))

for k in rmkeys: dictInfo.pop(k,None)
# todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out
# we can scan filenames or config etc
#here, we will see if there are missing header values and compare with file attributes if slow option is turned on
if (slow == True) & (bool(dictInfo) == True) :
print("Slow option turned on.. lets open some files using xarray and lookup atts",filename)
#todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
if "standard_name" in missingcols:
dictInfo["standard_name"] = "na"
getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)

listfiles.append(dictInfo)
return listfiles
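
The set arithmetic above decides which catalog headers can come from neither the path template nor the file template, and must therefore be looked up elsewhere (file attributes under `--slow`, or the mapping table otherwise). A standalone sketch with illustrative values:

```python
headerlist = ["experiment_id", "frequency", "variable_id", "standard_name", "path"]
set_ptemplate = {"experiment_id", "frequency"}  # from output_path_template
set_ftemplate = {"variable_id"}                 # from output_file_template

# Headers satisfied by neither template are "missing" from metadata sources.
diffcols = [x for x in headerlist if x not in set_ptemplate]
missingcols = [col for col in diffcols if col not in set_ftemplate]
missingcols.remove("path")  # the crawler always records the path itself
print(missingcols)          # ['standard_name']
```
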
catalogbuilder/scripts/gen_intake_gfdl.py (33 changes: 26 additions & 7 deletions)
@@ -1,7 +1,7 @@
#!/usr/bin/env python

import json
import sys
import sys,pandas as pd
import click
import os
from pathlib import Path
@@ -11,8 +11,7 @@
logger.setLevel(logging.INFO)

try:
#from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from catalogbuilder.intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser, getinfo
except ModuleNotFoundError:
print("The module intakebuilder is not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
print("Attempting again with adjusted sys.path ")
@@ -22,7 +21,8 @@
print("Unable to adjust sys.path")
#print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
try:
from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser
from intakebuilder import gfdlcrawler, CSVwriter, builderconfig, configparser,getinfo
print(gfdlcrawler.__file__)
except ModuleNotFoundError:
sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")

@@ -42,8 +42,9 @@
@click.option('--filter_chunk', nargs=1)
@click.option('--overwrite', is_flag=True, default=False)
@click.option('--append', is_flag=True, default=False)
@click.option('--slow','-s', is_flag=True, default=False)
def main(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False):
overwrite=False, append=False, slow = False):

configyaml = None
# TODO error catching
@@ -89,7 +90,7 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
dictInfo = {}
project_dir = project_dir.rstrip("/")
logger.info("Calling gfdlcrawler.crawlLocal")
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml)
list_files = gfdlcrawler.crawlLocal(project_dir, dictFilter, dictFilterIgnore, logger, configyaml,slow)
#Grabbing data from template JSON, changing CSV path to match output path, and dumping data in new JSON
with open(template_path, "r") as jsonTemplate:
data = json.load(jsonTemplate)
@@ -103,7 +104,25 @@ def main(input_path=None, output_path=None, config=None, filter_realm=None, filt
# so we check if it's a directory first
if os.path.isdir(os.path.dirname(csv_path)):
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append)
CSVwriter.listdict_to_csv(list_files, headers, csv_path, overwrite, append,slow)
df = None
if(slow == False) & ('standard_name' in headers ):
#If standard_name is still needed and the slow option was not chosen, fall back to the GFDL-CMIP mapping tables. Useful for MDTF runs
df = pd.read_csv(os.path.abspath(csv_path), sep=",", header=0,index_col=False)
list_variable_id = []
list_variable_id = df["variable_id"].tolist()
dictVarCF = getinfo.getStandardName(list_variable_id)
#print("standard name from look-up table-", dictVarCF)
for k, v in dictVarCF.items():
#if(df['variable_id'].eq(k)).any():
df['standard_name'].loc[(df['variable_id'] == k)] = v
#df['standard_name'] = v

if(slow == False) & ('standard_name' in headers):
if ((df is not None) & (len(df) != 0) ):
with open(csv_path, 'w') as csvfile:
df.to_csv(csvfile)

print("JSON generated at:", os.path.abspath(json_path))
print("CSV generated at:", os.path.abspath(csv_path))
logger.info("CSV generated at" + os.path.abspath(csv_path))
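
One pandas note: `df['standard_name'].loc[(df['variable_id'] == k)] = v` is chained indexing, which newer pandas flags with a FutureWarning (perhaps why `getinfo.py` now suppresses that category). A single `.loc` assignment is the usual equivalent; a sketch with made-up rows:

```python
import pandas as pd

df = pd.DataFrame({"variable_id": ["tas", "zg"],
                   "standard_name": ["na", "na"]})
dictVarCF = {"tas": "air_temperature", "zg": "geopotential_height"}

# Single-step .loc assignment avoids the chained-indexing warning.
for k, v in dictVarCF.items():
    df.loc[df["variable_id"] == k, "standard_name"] = v
```

Behavior is otherwise driven by the new `--slow`/`-s` flag: without it, `standard_name` is backfilled from the lookup table after the CSV is written; with it, the crawler reads each file's attributes during the walk.
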
catalogbuilder/tests/config-cfname.yaml (41 changes: 41 additions & 0 deletions)
@@ -0,0 +1,41 @@

#catalog headers
#The headerlist is the expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
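
Once the builder has run with this config, the generated catalog can be opened to confirm the new column is populated (a sketch; assumes intake-esm is installed and that the JSON landed at the `output_path` prefix above):

```python
import intake  # intake-esm provides the esm_datastore driver

cat = intake.open_esm_datastore(
    "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip.json")
print(cat.df[["variable_id", "standard_name"]].head())
```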