Merge pull request #279 from ckan/new-cli

New `ckan dcat consume` and `ckan dcat produce` commands
ckan · May 7, 2024 · 48b5e61 · 48b5e61
2 parents 6fb1780 + 576e26c
commit 48b5e61
Show file tree

Hide file tree

Showing 4 changed files with 186 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -36,6 +36,7 @@ It also offers other features related to Semantic Data like exposing the necessa
 - [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated)
 - [Translation of fields](#translation-of-fields)
 - [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing)
+- [CLI](#cli)
 - [Running the Tests](#running-the-tests)
 - [Releases](#releases)
 - [Acknowledgements](#acknowledgements)
@@ -944,6 +945,25 @@ Example output of structured data in JSON-LD:
     </html>
 
 
+## CLI
+
+The `ckan dcat` command offers utilities to transform DCAT RDF serializations into CKAN datasets (`ckan dcat consume`) and
+vice versa (`ckan dcat produce`). In both cases the input can be provided as a path to a file:
+
+    ckan dcat consume -f ttl examples/dataset.ttl
+
+    ckan dcat produce -f jsonld examples/ckan_datasets.json
+
+or be read from stdin:
+
+    ckan dcat consume -
+
+The latter form allows chaining commands for more complex metadata processing, e.g.:
+
+    curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld -
+
+For the full list of options, check `ckan dcat consume --help` and `ckan dcat produce --help`.
+
 ## Running the Tests
 
 To run the tests do:

diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py
@@ -1,25 +1,132 @@
 # -*- coding: utf-8 -*-
+import json
 
 import click
+
 import ckan.plugins.toolkit as tk
+
 import ckanext.dcat.utils as utils
+from ckanext.dcat.processors import RDFParser, RDFSerializer, DEFAULT_RDF_PROFILES
 
-@click.group()
-def generate_static():
-    """Generates static files containing all datasets.
 
-    """
+@click.group()
+def dcat():
+    """DCAT utilities for CKAN"""
     pass
 
-@generate_static.command()
-@click.argument('output', type=click.File(mode="w"))
-def json(output):
-    """The generate command will generate a static file containing all of
-    the datasets in the catalog in JSON format.
 
+@dcat.command()
+@click.argument("output", type=click.File(mode="w"))
+def generate_static(output):
+    """[Deprecated] Generate a static datasets file in JSON format
+    (requires the dcat_json_interface plugin).
     """
     utils.generate_static_json(output)
 
 
+@dcat.command(context_settings={"show_default": True})
+@click.argument("input", type=click.File(mode="r"))
+@click.option(
+    "-o",
+    "--output",
+    type=click.File(mode="w"),
+    default="-",
+    help="By default the command will output the result to stdout, "
+    "alternatively you can provide a file path with this option",
+)
+@click.option(
+    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
+)
+@click.option(
+    "-p",
+    "--profiles",
+    default=" ".join(DEFAULT_RDF_PROFILES),
+    help="RDF profiles to use",
+)
+@click.option(
+    "-P", "--pretty", is_flag=True, help="Make the output more human readable"
+)
+@click.option(
+    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
+)
+def consume(input, output, format, profiles, pretty, compat_mode):
+    """
+    Parses DCAT RDF graphs into CKAN dataset JSON objects.
+
+    The input serializations can be provided as a path to a file, e.g.:
+
+        ckan dcat consume examples/dataset.ttl
+
+    Or be read from stdin:
+
+        ckan dcat consume -
+    """
+    contents = input.read()
+
+    if profiles:
+        profiles = profiles.split()
+    parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode)
+    parser.parse(contents, _format=format)
+
+    ckan_datasets = [d for d in parser.datasets()]
+
+    indent = 4 if pretty else None
+    out = json.dumps(ckan_datasets, indent=indent)
+
+    output.write(out)
+
+
+@dcat.command(context_settings={"show_default": True})
+@click.argument("input", type=click.File(mode="r"))
+@click.option(
+    "-o",
+    "--output",
+    type=click.File(mode="w"),
+    default="-",
+    help="By default the command will output the result to stdout, "
+    "alternatively you can provide a file path with this option",
+)
+@click.option(
+    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
+)
+@click.option(
+    "-p",
+    "--profiles",
+    default=" ".join(DEFAULT_RDF_PROFILES),
+    help="RDF profiles to use",
+)
+@click.option(
+    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
+)
+def produce(input, output, format, profiles, compat_mode):
+    """
+    Transforms CKAN dataset JSON objects into DCAT RDF serializations.
+
+    The input datasets can be provided as a path to a file, e.g.:
+
+        ckan dcat produce examples/ckan_dataset.json
+
+    Or be read from stdin:
+
+        ckan dcat produce -
+    """
+    contents = input.read()
+
+    if profiles:
+        profiles = profiles.split()
+    serializer = RDFSerializer(
+        profiles=profiles,
+        compatibility_mode=compat_mode
+    )
+
+    dataset = json.loads(contents)
+    if isinstance(dataset, list):
+        out = serializer.serialize_datasets(dataset, _format=format)
+    else:
+        out = serializer.serialize_dataset(dataset, _format=format)
+
+    output.write(out)
+
+
 def get_commands():
-    return [generate_static]
+    return [dcat]
diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py
@@ -291,6 +291,22 @@ def serialize_dataset(self, dataset_dict, _format='xml'):
 
         return output
 
+    def serialize_datasets(self, dataset_dicts, _format='xml'):
+        '''
+        Given a list of CKAN dataset dicts, returns an RDF serialization
+
+        The serialization format can be defined using the `_format` parameter.
+        It must be one of the ones supported by RDFLib, defaults to `xml`.
+
+        Returns a string with the serialized datasets
+        '''
+        out = []
+        for dataset_dict in dataset_dicts:
+            out.append(self.serialize_dataset(dataset_dict, _format))
+        return '\n'.join(out)
+
+
+
     def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
                           _format='xml', pagination_info=None):
         '''
@@ -394,59 +410,3 @@ def _get_from_extra(key):
                     g.add((agent, predicate, _type(val)))
 
         return catalog_ref
-
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser(
-        description='DCAT RDF - CKAN operations')
-    parser.add_argument('mode',
-                        default='consume',
-                        help='''
-Operation mode.
-`consume` parses DCAT RDF graphs to CKAN dataset JSON objects.
-`produce` serializes CKAN dataset JSON objects into DCAT RDF.
-                        ''')
-    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
-                        default=sys.stdin,
-                        help='Input file. If omitted will read from stdin')
-    parser.add_argument('-f', '--format',
-                        default='xml',
-                        help='''Serialization format (as understood by rdflib)
-                                eg: xml, n3 ... Defaults to \'xml\'.''')
-    parser.add_argument('-P', '--pretty',
-                        action='store_true',
-                        help='Make the output more human readable')
-    parser.add_argument('-p', '--profile', nargs='*',
-                        action='store',
-                        help='RDF Profiles to use, defaults to euro_dcat_ap_2')
-    parser.add_argument('-m', '--compat-mode',
-                        action='store_true',
-                        help='Enable compatibility mode')
-
-    parser.add_argument('-s', '--subcatalogs', action='store_true', dest='subcatalogs',
-                        default=False,
-                        help="Enable subcatalogs handling (dct:hasPart support)")
-    args = parser.parse_args()
-
-    contents = args.file.read()
-
-    config.update({DCAT_EXPOSE_SUBCATALOGS: args.subcatalogs})
-
-    if args.mode == 'produce':
-        serializer = RDFSerializer(profiles=args.profile,
-                                   compatibility_mode=args.compat_mode)
-
-        dataset = json.loads(contents)
-        out = serializer.serialize_dataset(dataset, _format=args.format)
-        print(out)
-    else:
-        parser = RDFParser(profiles=args.profile,
-                           compatibility_mode=args.compat_mode)
-
-        parser.parse(contents, _format=args.format)
-
-        ckan_datasets = [d for d in parser.datasets()]
-
-        indent = 4 if args.pretty else None
-        print(json.dumps(ckan_datasets, indent=indent))
diff --git a/ckanext/dcat/tests/test_cli.py b/ckanext/dcat/tests/test_cli.py
@@ -0,0 +1,33 @@
+import json
+import os
+
+from ckanext.dcat.cli import dcat as dcat_cli
+
+
+def test_consume(cli):
+
+    path = os.path.join(
+        os.path.dirname(__file__), "..", "..", "..", "examples", "dataset_afs.ttl"
+    )
+
+    result = cli.invoke(dcat_cli, ["consume", "-f", "ttl", path])
+    assert result.exit_code == 0
+
+    assert json.loads(result.stdout)[0]["title"] == "A test dataset on your catalogue"
+
+
+def test_produce(cli):
+
+    path = os.path.join(
+        os.path.dirname(__file__),
+        "..",
+        "..",
+        "..",
+        "examples",
+        "full_ckan_dataset.json",
+    )
+
+    result = cli.invoke(dcat_cli, ["produce", "-f", "jsonld", path])
+    assert result.exit_code == 0
+
+    assert json.loads(result.stdout)["@context"]["dcat"] == "http://www.w3.org/ns/dcat#"