Add named graphs to SPARQL interface. Add SPARQL diagnostics

hsolbrig · Feb 17, 2019 · c3e2d53 · c3e2d53
1 parent 6bf1c94
commit c3e2d53
Show file tree

Hide file tree

Showing 37 changed files with 3,453 additions and 2,210 deletions.
diff --git a/Pipfile b/Pipfile
@@ -13,7 +13,7 @@ urllib3 = "*"
 ShExJSG = ">=0.5.6"
 CFGraph = ">=0.2.1"
 PyShExC = ">=0.5.4"
-sparql-slurper = ">=0.2.0"
+sparql-slurper = ">=0.2.1"
 sparqlwrapper = "*"
 
 [requires]

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ This package is a reasonably literal implementation of the [Shape Expressions La
 * 0.7.1 -- Fix issue 26
 * 0.7.2 -- Upgrade error reporting
 * 0.7.3 -- Report using namespaces, enhance PrefixLib to inject into a module
+* 0.7.4 -- Added '-ps', '-pr', '-gn', '-pb' options to CLI
 
 ## Installation
 ```bash
@@ -53,7 +54,7 @@ Unfortunately, however, `rdflib-jsonld` is NOT compatible with the bleeding edge
 > shexeval -h
 usage: shexeval [-h] [-f FORMAT] [-s START] [-ut] [-sp STARTPREDICATE]
                 [-fn FOCUS] [-A] [-d] [-ss] [-cf] [-sq SPARQL] [-se]
-                [--stopafter STOPAFTER]
+                [--stopafter STOPAFTER] [-ps] [-pr] [-gn GRAPHNAME] [-pb]
                 rdf shex
 
 positional arguments:
@@ -81,6 +82,13 @@ optional arguments:
   -se, --stoponerror    Stop on an error
   --stopafter STOPAFTER
                         Stop after N nodes
+  -ps, --printsparql    Print SPARQL queries as they are executed
+  -pr, --printsparqlresults
+                        Print SPARQL query and results
+  -gn GRAPHNAME, --graphname GRAPHNAME
+                        Specific SPARQL graph to query - use '' for any named
+                        graph
+  -pb, --persistbnodes  Treat BNodes as persistent in SPARQL endpoint
 ```
 
 ## Documentation

diff --git a/notebooks/CLI.ipynb b/notebooks/CLI.ipynb
@@ -18,16 +18,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pyshex.shex_evaluator import evaluate_cli as shexeval"
+    "from pyshex.shex_evaluator import evaluate_cli as shexeval, PrefixLibrary"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -36,7 +36,7 @@
      "text": [
       "usage: shexeval [-h] [-f FORMAT] [-s START] [-ut] [-sp STARTPREDICATE]\n",
       "                [-fn FOCUS] [-A] [-d] [-ss] [-cf] [-sq SPARQL] [-se]\n",
-      "                [--stopafter STOPAFTER]\n",
+      "                [--stopafter STOPAFTER] [-ps] [-pr] [-gn GRAPHNAME] [-pb]\n",
       "                rdf shex\n",
       "\n",
       "positional arguments:\n",
@@ -63,7 +63,14 @@
       "                        SPARQL query to generate focus nodes\n",
       "  -se, --stoponerror    Stop on an error\n",
       "  --stopafter STOPAFTER\n",
-      "                        Stop after N nodes\n"
+      "                        Stop after N nodes\n",
+      "  -ps, --printsparql    Print SPARQL queries as they are executed\n",
+      "  -pr, --printsparqlresults\n",
+      "                        Print SPARQL query and results\n",
+      "  -gn GRAPHNAME, --graphname GRAPHNAME\n",
+      "                        Specific SPARQL graph to query - use '' for any named\n",
+      "                        graph\n",
+      "  -pb, --persistbnodes  Treat BNodes as persistent in SPARQL endpoint\n"
      ]
     },
     {
@@ -90,7 +97,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -99,7 +106,7 @@
        "0"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -119,7 +126,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -158,7 +165,7 @@
        "1"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -169,12 +176,121 @@
     "         \"-fn http://hl7.org/fhir/Observation/example-haplotype2\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Debug Biolink model against NCATS Red knowledge graph\n",
+    "The following example selects every item of type Protein and validates it against the biolink-model shex definition.\n",
+    "Parameters:\n",
+    "* http://graphdb.dumontierlab.com/repositories/ncats-red-kg - SPARQL endpoint\n",
+    "* https://raw.githubusercontent.com/biolink/biolink-model/master/shex/biolink-modelnc.shex - ShEx \n",
+    "* -ss      Use SparqlSlurper\n",
+    "* -gn ''   Don't slurp the default graph (if there is a name in the quotes, restrict it to that graph)\n",
+    "* -pr      Print Slurper query results\n",
+    "* -ps      Print Slurper queries\n",
+    "* -ut      The RDF types of the subjects are the start nodes in the ShEx\n",
+    "* -se      Stop on the first error\n",
+    "* -sq 'select ?item where{?item a <http://w3id.org/biolink/vocab/Protein>}'  - the initial SPARQL query"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SPARQL:\n",
+      "select ?item where{?item a <http://w3id.org/biolink/vocab/Protein>}\n",
+      "\thttp://identifiers.org/uniprot/P00734\n",
+      "\thttp://identifiers.org/uniprot/P00533\n",
+      "\thttp://identifiers.org/uniprot/O75015\n",
+      "\thttp://identifiers.org/uniprot/P00736\n",
+      "\thttp://identifiers.org/uniprot/P02745\n",
+      "\thttp://identifiers.org/uniprot/P02746\n",
+      "\thttp://identifiers.org/uniprot/P02747\n",
+      "\thttp://identifiers.org/uniprot/P08637\n",
+      "\thttp://identifiers.org/uniprot/P09871\n",
+      "\thttp://identifiers.org/uniprot/P12314\n",
+      "\n",
+      "\t     ...\n",
+      "\n",
+      "\n",
+      "SPARQL: (SELECT ?s ?p ?o {graph ?g {<http://identifiers.org/uniprot/P00734> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o}}) (0.25 secs) - 3 triples\n",
+      "RESULTS:\n",
+      "\t<http://identifiers.org/uniprot/P00734> a <http://w3id.org/biolink/vocab/Protein> .\n",
+      "SPARQL: (SELECT ?s ?p ?o {graph ?g {<http://identifiers.org/uniprot/P00734> ?p ?o}}) (0.25 secs) - 42 triples\n",
+      "RESULTS:\n",
+      "\t<http://identifiers.org/uniprot/P00734> a ns1:Protein ;\n",
+      "\t    ns1:id <http://identifiers.org/uniprot/P00734>,\n",
+      "\t        \"P00734\" ;\n",
+      "\t    ns1:name \"10xCbxE-F2(25-622)\",\n",
+      "\t        \"10xCbxE-F2(44-327)\",\n",
+      "\t        \"10xCbxE-F2(44-622)\",\n",
+      "\t        \"F2(25-622)\",\n",
+      "\t        \"prothrombin (factor II) propeptide\",\n",
+      "\t        \"thrombin heavy chain\",\n",
+      "\t        \"thrombin light chain\" ;\n",
+      "\t    ns1:part_of <http://identifiers.org/wikipathways/WP1818_r101320>,\n",
+      "\t        <http://identifiers.org/wikipathways/WP1884_r101399>,\n",
+      "\t        <http://identifiers.org/wikipathways/WP1929_r101361>,\n",
+      "\t        <http://identifiers.org/wikipathways/WP2762_r101409>,\n",
+      "\t        <http://identifiers.org/wikipathways/WP4419_r101720>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/a6d12>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/b607a>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/b9727>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/d6bb0>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/da774>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/e0f44>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/e7688>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/eddd5>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/f0c44>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/Complex/f7a7d>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/WP/Interaction/ccf28>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1818_r101320/WP/Interaction/f8e40>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1884_r101399/Complex/a8341>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1884_r101399/Complex/e1e8e>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP1929_r101361/Complex/eb678>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP2762_r101409/WP/Interaction/b538d>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP2762_r101409/WP/Interaction/da6fc>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP2762_r101409/WP/Interaction/fc444>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP4419_r101720/Complex/d0cd7>,\n",
+      "\t        <http://rdf.wikipathways.org/Pathway/WP4419_r101720/Complex/f1433> ;\n",
+      "\t    ns1:same_as <http://identifiers.org/ncbigene/2147>,\n",
+      "\t        <http://identifiers.org/uniprot/C9JV37>,\n",
+      "\t        <http://identifiers.org/uniprot/E9PIT3>,\n",
+      "\t        <http://identifiers.org/uniprot/P00734> ;\n",
+      "\t    ns1:systematic_synonym <http://identifiers.org/hgnc.symbol/F2> .\n",
+      "Errors:\n",
+      "  Focus: http://identifiers.org/uniprot/P00734\n",
+      "  Start: http://w3id.org/biolink/vocab/Protein\n",
+      "  Reason:   Testing <http://identifiers.org/uniprot/P00734> against shape http://w3id.org/biolink/vocab/NamedThing\n",
+      "    Triples:\n",
+      "      <http://identifiers.org/uniprot/P00734> ns1:id <http://identifiers.org/uniprot/P00734> .\n",
+      "      <http://identifiers.org/uniprot/P00734> ns1:id \"P00734\" .\n",
+      "   2 triples exceeds max {0,1}\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "shexeval([\"http://graphdb.dumontierlab.com/repositories/ncats-red-kg\",\n",
+    "          \"https://raw.githubusercontent.com/biolink/biolink-model/master/shex/biolink-modelnc.shex\",\n",
+    "          \"-ss\", \"-gn\", \"\", \"-pr\", \"-ps\", \"-ut\", \"-se\", \"-sq\",\n",
+    "          \"select ?item where{?item a <http://w3id.org/biolink/vocab/Protein>}\"])"
+   ]
   }
  ],
  "metadata": {

diff --git a/pyshex/shape_expressions_language/p5_5_shapes_and_triple_expressions.py b/pyshex/shape_expressions_language/p5_5_shapes_and_triple_expressions.py
@@ -216,7 +216,7 @@ def matchesCardinality(cntxt: Context, T: RDFGraph, expr: Union[ShExJ.tripleExpr
                 _fail_triples(cntxt, T)
                 cntxt.fail_reason = f"   {len(T)} triples less than {cardinality_text}"
             else:
-                cntxt.fail_reason = f"   No matching triples found for predicate {expr.predicate}"
+                cntxt.fail_reason = f"   No matching triples found for predicate {cntxt.n3_mapper.n3(expr.predicate)}"
             return False
         elif 0 <= max_ < len(T):
             _fail_triples(cntxt, T)
@@ -238,7 +238,7 @@ def _fail_triples(cntxt: Context, T: RDFGraph) -> None:
     tlist = list(T)
     if len(tlist):
         cntxt.fail_reason = "Triples:"
-        for t in tlist:
+        for t in sorted(tlist):
             cntxt.fail_reason = f"      {cntxt.n3_mapper.n3(t)}"
         if len(tlist) > 5:
             cntxt.fail_reason = "      ...   "

diff --git a/pyshex/shex_evaluator.py b/pyshex/shex_evaluator.py
@@ -6,7 +6,7 @@
 from ShExJSG import ShExJ, ShExC
 from rdflib import Graph, URIRef, RDF
 from rdflib.util import guess_format
-from sparql_slurper import SlurpyGraph
+from sparql_slurper import SlurpyGraph, QueryResultPrinter
 
 from pyshex import PrefixLibrary
 from pyshex.shape_expressions_language.p5_2_validation_definition import isValid
@@ -281,6 +281,11 @@ def genargs(prog: Optional[str] = None) -> ArgumentParser:
     parser.add_argument("-sq", "--sparql", help="SPARQL query to generate focus nodes")
     parser.add_argument("-se", "--stoponerror", help="Stop on an error", action="store_true")
     parser.add_argument("--stopafter", help="Stop after N nodes", type=int)
+    parser.add_argument("-ps", "--printsparql", help="Print SPARQL queries as they are executed", action="store_true")
+    parser.add_argument("-pr", "--printsparqlresults", help="Print SPARQL query and results", action="store_true")
+    parser.add_argument("-gn", "--graphname", help="Specific SPARQL graph to query - use '' for any named graph")
+    parser.add_argument("-pb", "--persistbnodes", help="Treat BNodes as persistent in SPARQL endpoint",
+                        action="store_true")
     return parser
 
 
@@ -291,16 +296,28 @@ def evaluate_cli(argv: Optional[Union[str, List[str]]] = None, prog: Optional[st
     if opts.sparql:
         opts.slurper = True
     if opts.slurper and opts.flattener:
-        print("Error: Cannot combine slurper and flattener graphs")
+        print("Error: Cannot combine slurper and flattener graphs", file=sys.stderr)
         return 2
+    if not opts.sparql and not opts.slurper and \
+            (opts.printsparql or opts.printsparqlresults or opts.graphname is not None or opts.persistbnodes):
+        print("Error: printsparql, pringsparqlresults, graphname and persistbnodes are SPQARQL only",
+              file=sys.stderr)
     if not opts.format:
         opts.format = guess_format(opts.rdf)
     if not opts.format:
-        print('Error: Cannot determine RDF format from file name - use "--format" option')
+        print('Error: Cannot determine RDF format from file name - use "--format" option', file=sys.stderr)
         return 3
     if opts.slurper:
         g = SlurpyGraph(opts.rdf)
-        g.persistent_bnodes = True
+        if opts.printsparql:
+            g.debug_slurps = True
+        if opts.printsparqlresults:
+            g.debug_slurps = True
+            g.add_result_hook(QueryResultPrinter)
+        if opts.graphname is not None:
+            g.graph_name = opts.graphname
+        if opts.persistbnodes:
+            g.persistent_bnodes = True
     else:
         g = CFGraph() if opts.flattener else Graph()
         if '\n' in opts.rdf or '\r' in opts.rdf:
@@ -309,7 +326,8 @@ def evaluate_cli(argv: Optional[Union[str, List[str]]] = None, prog: Optional[st
             g.load(opts.rdf, format=opts.format)
 
     if not (opts.focus or opts.allsubjects or opts.sparql):
-        print('Error: You must specify one or more graph focus nodes, supply a SPARQL query, or use the "-A" option')
+        print('Error: You must specify one or more graph focus nodes, supply a SPARQL query, or use the "-A" option',
+              file=sys.stderr)
         return 4
 
     start = []
@@ -327,7 +345,8 @@ def evaluate_cli(argv: Optional[Union[str, List[str]]] = None, prog: Optional[st
             opts.focus = []
         elif not isinstance(opts.focus, list):
             opts.focus = [opts.focus]
-        opts.focus += list(SPARQLQuery(opts.rdf, opts.sparql, ).focus_nodes())
+        opts.focus += list(SPARQLQuery(opts.rdf, opts.sparql, print_query=opts.printsparql,
+                                       print_results=opts.printsparqlresults).focus_nodes())
 
     def result_sink(rslt: EvaluationResult) -> bool:
         if not rslt.result:

diff --git a/pyshex/utils/sparql_query.py b/pyshex/utils/sparql_query.py
@@ -7,12 +7,16 @@
 
 
 class SPARQLQuery:
-    def __init__(self, sparql_endpoint: str, sparql_file_uri_or_text: str, ) -> None:
+    def __init__(self, sparql_endpoint: str, sparql_file_uri_or_text: str,
+                 print_query: bool=False, print_results: bool=False) -> None:
         """ Set up the query to run
 
         :param sparql_endpoint: URL of sparql endpoint
         :param sparql_file_uri_or_text: URI, filename or SPARQL text
+        :param print_query: Print the text of the SPARQL query
+        :param print_results: Print the query results (at most the first 10 rows)
         """
+        self.print_results = print_results
         if '\n' in sparql_file_uri_or_text or '\r' in sparql_file_uri_or_text or ' ' in sparql_file_uri_or_text:
             self.query = sparql_file_uri_or_text
         elif ':/' in sparql_file_uri_or_text:
@@ -23,6 +27,9 @@ def __init__(self, sparql_endpoint: str, sparql_file_uri_or_text: str, ) -> None
         else:
             with open(sparql_file_uri_or_text) as f:
                 self.query = f.read()
+        if print_query:
+            print("SPARQL:")
+            print(self.query)
         self.endpoint = SPARQLWrapper(sparql_endpoint)
         self.endpoint.setQuery(self.query)
         self.endpoint.setReturnFormat(JSON)
@@ -31,4 +38,9 @@ def focus_nodes(self) -> List[URIRef]:
         result = self.endpoint.query()
 
         processed_results = jsonasobj.load(result.response)
+        if self.print_results:
+            print('\t' + ('\n\t'.join([row.item.value for row in processed_results.results.bindings[:10]])))
+            if len(processed_results.results.bindings) > 10:
+                print('\n\t     ...')
+            print('\n')
         return [URIRef(row.item.value) for row in processed_results.results.bindings]