Merge pull request #26 from ScottWales/handling-args

Add support for CMIP6
coecms · Aug 24, 2018 · 1fa7371 · 1fa7371
2 parents 16ec6ac + 3904b35
commit 1fa7371
Show file tree

Hide file tree

Showing 23 changed files with 15,070 additions and 213 deletions.
diff --git a/arccssive2/cli.py b/arccssive2/cli.py
diff --git a/arccssive2/esgf.py b/arccssive2/esgf.py
@@ -15,108 +15,96 @@
 # limitations under the License.
 from __future__ import print_function
 import requests
+import json
 from sqlalchemy.sql import column, label
 from sqlalchemy.orm import aliased
 from sqlalchemy import String, Float, Integer, or_, func
 from .pgvalues import values
-from .model import Path, Checksum, metadata_dataset_link
-
-def esgf_query(query, fields, limit=1000, offset=0, distrib=True, replica=False, latest=None,
-        cf_standard_name=None,
-        ensemble=None,
-        experiment=None,
-        experiment_family=None,
-        institute=None,
-        cmor_table=None,
-        model=None,
-        project=None,
-        product=None,
-        realm=None,
-        time_frequency=None,
-        variable=None,
-        variable_long_name=None,
-        source_id=None,
-        ):
+from .model import Path, Checksum, c5_metadata_dataset_link, c6_metadata_dataset_link
+
+def define_facets(project, facets_file='../db/facets.json'):
+    """
+    Define the available search facets for a project
+
+    The facets file holds a JSON object. Its keys are used as the CMIP6
+    facet names and its values as the CMIP5 facet names; values equal to the
+    string 'None' are skipped (presumably facets with no CMIP5 equivalent).
+
+    project: 'CMIP5' or 'CMIP6' (exact, case-sensitive match)
+    facets_file: path of the JSON facets file. The default is resolved
+        against the current working directory, not against this module.
+
+    Returns a dict mapping each facet name to None, plus a 'project' entry
+    holding the project name.
+
+    Raises ValueError if project is neither 'CMIP5' nor 'CMIP6'.
+    """
+    # Reject unknown projects up front: otherwise `facets` would never be
+    # assigned and the function would die with an UnboundLocalError.
+    if project not in ('CMIP5', 'CMIP6'):
+        raise ValueError("Unsupported project %r, expected 'CMIP5' or 'CMIP6'" % (project,))
+
+    with open(facets_file, 'r') as f:
+        fdict = json.load(f)
+
+    if project == 'CMIP5':
+        facets = {v: None for v in fdict.values() if v != 'None'}
+    else:
+        facets = {k: None for k in fdict}
+    facets['project'] = project
+    return facets
+
+def esgf_query(query, fields, limit=1000, offset=0, distrib=True, replica=False, latest=None, **kwargs):
     """
     Search the ESGF
+
+    Sends a GET request to the esg-search endpoint on esgf-node.llnl.gov for
+    results of type 'File' and returns the decoded solr+json response.
+
+    query: free-text query; an empty one is replaced by None
+    fields: value of the 'fields' search parameter
+    limit, offset: paging of the results
+    distrib, replica: passed through unchanged as search parameters
+    latest: passed through, except 'all' which is replaced by None
+    **kwargs: any further search parameters (e.g. facet constraints); they
+        are merged last, so they replace a default parameter of the same name
+
+    Raises whatever r.raise_for_status() raises for an error response.
     """
+    #facets = define_facets(project)
+    # 'all' is the caller's way of asking for no 'latest' filter
     if latest == 'all':
         latest = None
 
     if query is not None and len(query) == 0:
         query = None
 
+    params = {
+          'query': query,
+          'fields': fields,
+          'offset': offset,
+          'limit': limit,
+          'distrib': distrib,
+          'replica': replica,
+          'latest': latest, 
+          'type': 'File',
+          'format': 'application/solr+json',
+          } 
+    # Merged after the defaults: a kwarg named like one of the keys above wins
+    params.update(kwargs)
     r = requests.get('https://esgf-node.llnl.gov/esg-search/search',
-            params = {
-                'query': query,
-                'fields': fields,
-                'offset': offset,
-                'limit': limit,
-                'distrib': distrib,
-                'replica': replica,
-                'latest': latest,
-                'cf_standard_name':cf_standard_name,
-                'ensemble':ensemble,
-                'experiment':experiment,
-                'experiment_family':experiment_family,
-                'institute':institute,
-                'cmor_table':cmor_table,
-                'model':model,
-                'project':project,
-                'product':product,
-                'realm':realm,
-                'time_frequency':time_frequency,
-                'variable':variable,
-                'variable_long_name':variable_long_name,
-                'source_id':source_id,
-                'type': 'File',
-                'format': 'application/solr+json',
-                })
+                     params = params )
 
     r.raise_for_status()
 
     return r.json()
 
 def link_to_esgf(query, **kwargs):
-    r = requests.Request('GET','https://esgf-node.llnl.gov/search/esgf-llnl',
-            params = {
+    """
+    Return the URL of an ESGF web search for the given query
+
+    The URL points at https://esgf-node.llnl.gov/search/cmip6 when the
+    'project' constraint is CMIP6 (case-insensitive), otherwise at
+    .../search/cmip5. The request is only prepared, never sent.
+
+    query: free-text query
+    **kwargs: the search options 'fields', 'offset', 'limit', 'distrib',
+        'replica' and 'latest', plus any further constraints. Constraints
+        whose value is an empty tuple are dropped.
+    """
+    # These options are encoded explicitly below ('on' or None for the
+    # flags); keep them out of the constraints so that the final update()
+    # cannot overwrite the encoded values with the raw ones.
+    reserved = ('fields', 'offset', 'limit', 'distrib', 'replica', 'latest')
+    constraints = {k: v for k, v in kwargs.items()
+                   if v != () and k not in reserved}
+    params = {
             'query': query,
             'fields': kwargs.get('fields',None),
             'offset': kwargs.get('offset',None),
             'limit': kwargs.get('limit',None),
             'distrib': 'on' if kwargs.get('distrib',True) else None,
             'replica': 'on' if kwargs.get('replica',False) else None,
-            'latest': 'on' if kwargs.get('latest',None) else None,
-            'cf_standard_name': kwargs.get('cf_standard_name',None),
-            'ensemble': kwargs.get('ensemble',None),
-            'experiment': kwargs.get('experiment',None),
-            'experiment_family': kwargs.get('experiment_family',None),
-            'institute': kwargs.get('institute',None),
-            'cmor_table': kwargs.get('cmor_table',None),
-            'model': kwargs.get('model',None),
-            'project': kwargs.get('project',None),
-            'product': kwargs.get('product',None),
-            'realm': kwargs.get('realm',None),
-            'time_frequency': kwargs.get('time_frequency',None),
-            'variable': kwargs.get('variable',None),
-            'variable_long_name': kwargs.get('variable_long_name',None),
-            'source_id': kwargs.get('source_id',None),
-            })
+            'latest': 'on' if kwargs.get('latest',None) else None
+            }
+    params.update(constraints)
+
+    # `or ''` also covers an explicit project=None
+    endpoint = 'cmip5'
+    if (params.get('project') or '').lower() == 'cmip6':
+        endpoint = 'cmip6'
+
+    r = requests.Request('GET','https://esgf-node.llnl.gov/search/%s'%endpoint,
+            params=params,
+            )
     return r.prepare().url
 
 
-def find_checksum_id(query, **kwargs):
+def find_checksum_id(query, project, **kwargs):
     """
     Returns a sqlalchemy selectable containing the ESGF id and checksum for
     each query match
     """
-    response = esgf_query(query, 'checksum,id,dataset_id,title,version', **kwargs)
+    constraints = {k: v for k,v in kwargs.items() if v != ()}
+    constraints['project'] = project
+    response = esgf_query(query, 'checksum,id,dataset_id,title,version', **constraints)
 
     if response['response']['numFound'] == 0:
-        raise Exception('No matches found on ESGF, check at %s'%link_to_esgf(query, **kwargs))
+        raise Exception('No matches found on ESGF, check at %s'%link_to_esgf(query, **constraints))
 
     if response['response']['numFound'] > int(response['responseHeader']['params']['rows']):
-        raise Exception('Too many results (%d), try limiting your search %s'%(response['response']['numFound'], link_to_esgf(query, **kwargs)))
+        raise Exception('Too many results (%d), try limiting your search %s'%(response['response']['numFound'], link_to_esgf(query, **constraints)))
 
     table = values([
             column('checksum', String),

diff --git a/arccssive2/model.py b/arccssive2/model.py
@@ -38,20 +38,28 @@ def expr(self, model):
         expr = super(pg_json_property, self).expr(model)
         return expr.astext.cast(self.cast_type)
 
-metadata_dataset_link = Table('esgf_metadata_dataset_link', Base.metadata,
+# Link table between file ids and CMIP5 datasets; used as the `secondary`
+# table of the Path.c5dataset relationship. file_id carries foreign keys to
+# esgf_paths, metadata and checksums at once (presumably so the link can be
+# joined from any of those tables -- confirm against the database schema).
+c5_metadata_dataset_link = Table('c5_metadata_dataset_link', Base.metadata,
     Column('file_id', 
         ForeignKey('esgf_paths.file_id'), 
         ForeignKey('metadata.md_hash'),
         ForeignKey('checksums.ch_hash')),
-    Column('dataset_id', ForeignKey('esgf_dataset.dataset_id')))
+    Column('dataset_id', ForeignKey('cmip5_dataset.dataset_id')))
+
+# Same layout for CMIP6: dataset_id references cmip6_dataset instead; used as
+# the `secondary` table of the Path.c6dataset relationship.
+c6_metadata_dataset_link = Table('c6_metadata_dataset_link', Base.metadata,
+    Column('file_id', 
+        ForeignKey('esgf_paths.file_id'), 
+        ForeignKey('metadata.md_hash'),
+        ForeignKey('checksums.ch_hash')),
+    Column('dataset_id', ForeignKey('cmip6_dataset.dataset_id')))
 
 class Path(Base):
     __tablename__ = 'esgf_paths'
 
     id = Column('file_id', UUID, primary_key=True)
     path = Column('path', Text)
 
-    dataset = relationship('Dataset', secondary=metadata_dataset_link, viewonly=True)
+    c5dataset = relationship('C5Dataset', secondary=c5_metadata_dataset_link, viewonly=True)
+    c6dataset = relationship('C6Dataset', secondary=c6_metadata_dataset_link, viewonly=True)
     netcdf = relationship('Netcdf', viewonly=True)
     checksum = relationship('Checksum', viewonly=True)
     extended = relationship('ExtendedMetadata', viewonly=True)
@@ -108,11 +116,11 @@ class ExtendedMetadata(Base):
     variable = Column(Text)
     period = Column(INT4RANGE)
 
-class Dataset(Base):
+class C5Dataset(Base):
     """
-    An ESGF dataset
+    A CMIP5 dataset
     """
-    __tablename__ = 'esgf_dataset'
+    __tablename__ = 'cmip5_dataset'
 
     dataset_id = Column(Text, primary_key=True)
     project = Column(Text)
@@ -126,3 +134,31 @@ class Dataset(Base):
     p = Column(Integer)
     ensemble = Column(Text)
     cmor_table = Column(Text)
+
+class C6Dataset(Base):
+    """
+    ORM mapping of the 'cmip6_dataset' table: a CMIP6 ESGF dataset,
+    identified by its dataset_id
+    """
+    __tablename__ = 'cmip6_dataset'
+
+    # Column names are left implicit: declarative names each column after
+    # the attribute it is assigned to.
+    dataset_id = Column(Text, primary_key=True)
+    project = Column(Text)
+    activity_id = Column(Text)
+    institution_id = Column(Text)
+    source_id = Column(Text)
+    source_type = Column(Text)
+    experiment_id = Column(Text)
+    sub_experiment_id = Column(Text)
+    frequency = Column(Text)
+    realm = Column(Text)
+    # presumably the integer indices making up variant_label -- TODO confirm
+    r = Column(Integer)
+    i = Column(Integer)
+    p = Column(Integer)
+    f = Column(Integer)
+    variant_label = Column(Text)
+    member_id = Column(Text)
+    variable_id = Column(Text)
+    grid_label = Column(Text)
+    nominal_resolution = Column(Text)
+    table_id = Column(Text)
+
diff --git a/db/CMIP6_activity_id.json b/db/CMIP6_activity_id.json
@@ -0,0 +1,38 @@
+{
+    "activity_id":{
+        "AerChemMIP":"Aerosols and Chemistry Model Intercomparison Project",
+        "C4MIP":"Coupled Climate Carbon Cycle Model Intercomparison Project",
+        "CDRMIP":"Carbon Dioxide Removal Model Intercomparison Project",
+        "CFMIP":"Cloud Feedback Model Intercomparison Project",
+        "CMIP":"CMIP DECK: 1pctCO2, abrupt4xCO2, amip, esm-piControl, esm-historical, historical, and piControl experiments",
+        "CORDEX":"Coordinated Regional Climate Downscaling Experiment",
+        "DAMIP":"Detection and Attribution Model Intercomparison Project",
+        "DCPP":"Decadal Climate Prediction Project",
+        "DynVarMIP":"Dynamics and Variability Model Intercomparison Project",
+        "FAFMIP":"Flux-Anomaly-Forced Model Intercomparison Project",
+        "GMMIP":"Global Monsoons Model Intercomparison Project",
+        "GeoMIP":"Geoengineering Model Intercomparison Project",
+        "HighResMIP":"High-Resolution Model Intercomparison Project",
+        "ISMIP6":"Ice Sheet Model Intercomparison Project for CMIP6",
+        "LS3MIP":"Land Surface, Snow and Soil Moisture",
+        "LUMIP":"Land-Use Model Intercomparison Project",
+        "OMIP":"Ocean Model Intercomparison Project",
+        "PAMIP":"Polar Amplification Model Intercomparison Project",
+        "PMIP":"Palaeoclimate Modelling Intercomparison Project",
+        "RFMIP":"Radiative Forcing Model Intercomparison Project",
+        "SIMIP":"Sea Ice Model Intercomparison Project",
+        "ScenarioMIP":"Scenario Model Intercomparison Project",
+        "VIACSAB":"Vulnerability, Impacts, Adaptation and Climate Services Advisory Board",
+        "VolMIP":"Volcanic Forcings Model Intercomparison Project"
+    },
+    "version_metadata":{
+        "CV_collection_modified":"Tue Aug  7 07:31:46 2018 -0700",
+        "CV_collection_version":"6.2.12.0",
+        "activity_id_CV_modified":"Mon Mar  5 16:39:09 2018 -0800",
+        "activity_id_CV_note":"Update activity_id to include CDRMIP and PAMIP",
+        "author":"Paul J. Durack <durack1@llnl.gov>",
+        "institution_id":"PCMDI",
+        "previous_commit":"b27ce275adcf31632cbafb9b9f92af4235b3fff2",
+        "specs_doc":"v6.2.6 (20th December 2017; https://goo.gl/v1drZl)"
+    }
+}