Merge pull request #32 from HEPData/schema-v1.0.2

Schema v1.1.0 - bug fixes and cli
HEPData · Sep 23, 2021 · de48d30 · de48d30
2 parents 94fecde + 0d67d29
commit de48d30
Show file tree

Hide file tree

Showing 91 changed files with 76,863 additions and 116 deletions.
diff --git a/README.rst b/README.rst
@@ -39,7 +39,7 @@ Installation
 ------------
 
 If you can, install `LibYAML <https://pyyaml.org/wiki/LibYAML>`_ (a C library for parsing and emitting YAML) on your machine.
-This will allow for the use of CLoader for faster loading of YAML files.
+This will allow for the use of ``CSafeLoader`` (instead of Python ``SafeLoader``) for faster loading of YAML files.
 Not a big deal for small files, but performs markedly better on larger documents.
 
 Via pip:
@@ -61,18 +61,130 @@ Via GitHub (for developers):
 Usage
 -----
 
+The ``hepdata-validator`` package allows you to validate (via the command line or Python):
+
+* A full directory of submission and data files
+* An archive file (.zip, .tar, .tar.gz, .tgz) containing all of the files (`full details <https://hepdata-submission.readthedocs.io/en/latest/introduction.html>`_)
+* A `single .yaml or .yaml.gz file <https://hepdata-submission.readthedocs.io/en/latest/single_yaml.html>`_ (but *not* ``submission.yaml`` or a YAML data file)
+* A ``submission.yaml`` file or individual YAML data file (via Python only, not via the command line)
+
+The same package is used for validating uploads made to `hepdata.net <https://www.hepdata.net>`_, so
+validating offline first can be a more efficient way to check that your submission is valid before uploading.
+
+
+Command line
+============
+
+Installing the ``hepdata-validator`` package adds the command ``hepdata-validate`` to your path, which allows you to validate a
+`HEPData submission <https://hepdata-submission.readthedocs.io/en/latest/introduction.html>`_ offline.
+
+Examples
+^^^^^^^^
+
+To validate a submission comprising multiple files in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate
+
+To validate a submission comprising multiple files in another directory:
+
+.. code:: bash
+
+    $ hepdata-validate -d ../TestHEPSubmission
+
+To validate an archive file (.zip, .tar, .tar.gz, .tgz) in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate -a TestHEPSubmission.zip
+
+To validate a single YAML file in the current directory:
+
+.. code:: bash
+
+    $ hepdata-validate -f single_yaml_file.yaml
+
+Usage options
+^^^^^^^^^^^^^
+
+.. code:: bash
+
+    $ hepdata-validate --help
+    Usage: hepdata-validate [OPTIONS]
+
+      Offline validation of submission.yaml and YAML data files. Can check either
+      a directory, an archive file, or the single YAML file format.
+
+    Options:
+      -d, --directory TEXT  Directory to check (defaults to current working
+                            directory)
+      -f, --file TEXT       Single .yaml or .yaml.gz file (but not submission.yaml
+                            or a YAML data file) to check - see https://hepdata-
+                            submission.readthedocs.io/en/latest/single_yaml.html.
+                            (Overrides directory)
+      -a, --archive TEXT    Archive file (.zip, .tar, .tar.gz, .tgz) to check.
+                            (Overrides directory and file)
+      --help                Show this message and exit.
+
+
+Python
+======
+
+Validating a full submission
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To validate a full submission, instantiate a ``FullSubmissionValidator`` object:
+
+.. code:: python
+
+    from hepdata_validator.full_submission_validator import FullSubmissionValidator, SchemaType
+    full_submission_validator = FullSubmissionValidator()
+
+    # validate a directory
+    is_dir_valid = full_submission_validator.validate(directory='TestHEPSubmission')
+
+    # or uncomment to validate an archive file
+    # is_archive_valid = full_submission_validator.validate(archive='TestHEPSubmission.zip')
+
+    # or uncomment to validate a single file
+    # is_file_valid = full_submission_validator.validate(file='single_yaml_file.yaml')
+
+    # if there are any error messages, they are retrievable through this call
+    full_submission_validator.get_messages()
+
+    # the error messages can be printed for each file
+    full_submission_validator.print_errors('submission.yaml')
+
+    # the list of valid files can be retrieved via the valid_files property, which is a
+    # dict mapping SchemaType (e.g. SUBMISSION, DATA, SINGLE_YAML, REMOTE) to lists of
+    # valid files
+    full_submission_validator.valid_files[SchemaType.SUBMISSION]
+    full_submission_validator.valid_files[SchemaType.DATA]
+    # full_submission_validator.valid_files[SchemaType.SINGLE_YAML]
+
+    # if a remote schema is used, valid_files[SchemaType.REMOTE] is a list of tuples (schema, file)
+    # full_submission_validator.valid_files[SchemaType.REMOTE]
+
+    # the list of valid files can be printed
+    full_submission_validator.print_valid_files()
+
+
+Validating individual files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 To validate submission files, instantiate a ``SubmissionFileValidator`` object:
 
 .. code:: python
 
     from hepdata_validator.submission_file_validator import SubmissionFileValidator
-    
+
     submission_file_validator = SubmissionFileValidator()
     submission_file_path = 'submission.yaml'
-    
+
     # the validate method takes a string representing the file path
     is_valid_submission_file = submission_file_validator.validate(file_path=submission_file_path)
-    
+
     # if there are any error messages, they are retrievable through this call
     submission_file_validator.get_messages()
 
@@ -83,14 +195,14 @@ To validate submission files, instantiate a ``SubmissionFileValidator`` object:
 To validate data files, instantiate a ``DataFileValidator`` object:
 
 .. code:: python
-    
+
     from hepdata_validator.data_file_validator import DataFileValidator
-    
+
     data_file_validator = DataFileValidator()
-    
+
     # the validate method takes a string representing the file path
     data_file_validator.validate(file_path='data.yaml')
-    
+
     # if there are any error messages, they are retrievable through this call
     data_file_validator.get_messages()
 
@@ -106,12 +218,12 @@ for the error message lookup map.
 
     from hepdata_validator.data_file_validator import DataFileValidator
     import yaml
-    
+
     file_contents = yaml.safe_load(open('data.yaml', 'r'))
     data_file_validator = DataFileValidator()
-    
+
     data_file_validator.validate(file_path='data.yaml', data=file_contents)
-    
+
     data_file_validator.get_messages('data.yaml')
 
     data_file_validator.print_errors('data.yaml')
@@ -131,10 +243,6 @@ For the analogous case of the ``SubmissionFileValidator``:
     is_valid_submission_file = submission_file_validator.validate(file_path=submission_file_path, data=docs)
     submission_file_validator.print_errors(submission_file_path)
 
-An example `offline validation script <https://github.com/HEPData/hepdata-submission/blob/master/scripts/check.py>`_
-uses the ``hepdata_validator`` package to validate the ``submission.yaml`` file and all YAML data files of a
-HEPData submission.
-
 
 Schema Versions
 ---------------
@@ -196,7 +304,7 @@ download them. However, in principle, for testing purposes, note that the same m
 
 .. code:: python
 
-    schema_path = 'https://hepdata.net/submission/schemas/1.0.1/'
+    schema_path = 'https://hepdata.net/submission/schemas/1.1.0/'
     schema_name = 'data_schema.json'
 
-and passing a HEPData YAML data file as the ``file_path`` argument of the ``validate`` method.
+and passing a HEPData YAML data file as the ``file_path`` argument of the ``validate`` method.
diff --git a/hepdata_validator/__init__.py b/hepdata_validator/__init__.py
@@ -25,11 +25,16 @@
 import abc
 import os
 
+from jsonschema import validate as json_validate, ValidationError
+from jsonschema.validators import validator_for
+from jsonschema.exceptions import by_relevance
+from packaging import version as packaging_version
+
 from .version import __version__
 
 __all__ = ('__version__', )
 
-VALID_SCHEMA_VERSIONS = ['1.0.1', '1.0.0', '0.1.0']
+VALID_SCHEMA_VERSIONS = ['1.1.0', '1.0.1', '1.0.0', '0.1.0']
 LATEST_SCHEMA_VERSION = VALID_SCHEMA_VERSIONS[0]
 
 RAW_SCHEMAS_URL = 'https://raw.githubusercontent.com/HEPData/hepdata-validator/' \
@@ -48,22 +53,16 @@ def __init__(self, *args, **kwargs):
         self.default_schema_file = ''
         self.schemas = kwargs.get('schemas', {})
         self.schema_folder = kwargs.get('schema_folder', 'schemas')
-        self.schema_version = kwargs.get('schema_version', LATEST_SCHEMA_VERSION)
-        if self.schema_version not in VALID_SCHEMA_VERSIONS:
-            raise ValueError('Invalid schema version ' + self.schema_version)
+        self.schema_version_string = kwargs.get('schema_version', LATEST_SCHEMA_VERSION)
+        if self.schema_version_string not in VALID_SCHEMA_VERSIONS:
+            raise ValueError('Invalid schema version ' + self.schema_version_string)
+        self.schema_version = packaging_version.parse(self.schema_version_string)
 
-    def _get_major_version(self):
-        """
-        Parses the major version of the validator.
-
-        :return: integer corresponding to the validator major version
-        """
-        return int(self.schema_version.split('.')[0])
 
     def _get_schema_filepath(self, schema_filename):
         full_filepath = os.path.join(self.base_path,
                                      self.schema_folder,
-                                     self.schema_version,
+                                     self.schema_version_string,
                                      schema_filename)
 
         if not os.path.isfile(full_filepath):
@@ -81,6 +80,33 @@ def validate(self, **kwargs):
         :return: true if valid, false otherwise
         """
 
+    def _validate_json_against_schema(self, file_path, data, schema, sort_fn=None, **kwargs):
+        """
+        Validates data against the given schema.
+        Roughly follows the pattern of jsonschema.validate but adds errors to
+        self.messages, and will add multiple errors if they exist.
+
+        :param type file_path: path to file being checked
+        :param type data: JSON/YAML data to validate
+        :param type schema: schema to validate data against
+        :param type sort_fn: Function to sort error messages to get most
+            relevant (see docs for `jsonschema.exceptions.by_relevance`).
+        :param type **kwargs: Other kwargs to use when creating the
+            `jsonschema.IValidator` instance.
+        """
+        # Create validator ourselves so we can tweak the errors
+        cls = validator_for(schema)
+        cls.check_schema(schema)
+        v = cls(schema, **kwargs)
+
+        if not sort_fn:
+            sort_fn = by_relevance()
+
+        # Show all errors found, using best error in context for each
+        for error in v.iter_errors(data):
+            best = sorted([error] + error.context, key=sort_fn)[0]
+            self.add_validation_error(file_path, best)
+
     def has_errors(self, file_name):
         """
         Returns true if the provided file name has error messages

diff --git a/hepdata_validator/cli.py b/hepdata_validator/cli.py
@@ -0,0 +1,30 @@
+import sys
+
+import click
+
+from .full_submission_validator import FullSubmissionValidator
+
+
+@click.command()
+@click.option('--directory', '-d', default='.', help='Directory to check (defaults to current working directory)')
+@click.option('--file', '-f', default=None, help='Single .yaml or .yaml.gz file (but not submission.yaml or a YAML data file) to check - see https://hepdata-submission.readthedocs.io/en/latest/single_yaml.html. (Overrides directory)')
+@click.option('--archive', '-a', default=None, help='Archive file (.zip, .tar, .tar.gz, .tgz) to check. (Overrides directory and file)')
+def validate(directory, file, archive):  # pragma: no cover
+    """
+    Offline validation of submission.yaml and YAML data files.
+    Can check either a directory, an archive file, or the single YAML file format.
+    """
+    file_or_dir_checked = archive if archive else (file if file else directory)
+    validator = FullSubmissionValidator()
+    is_valid = validator.validate(directory, file, archive)
+    if is_valid:
+        click.echo(f"{file_or_dir_checked} is valid.")
+    else:
+        click.echo(f"ERROR: {file_or_dir_checked} is invalid.")
+
+    validator.print_valid_files()
+    for f in validator.messages.keys():
+        validator.print_errors(f)
+
+    if not is_valid:
+        sys.exit(1)