Skip to content

Commit

Permalink
various
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed May 9, 2021
1 parent 1d397cf commit b22d1ef
Show file tree
Hide file tree
Showing 31 changed files with 1,517 additions and 19 deletions.
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
.idea
owl/
download/
db/
docs/
load*-*
schemaload*-*
venv
bin/
target/
test.db
inferences/
112 changes: 103 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,23 +1,117 @@
ONTS = obi mondo go envo ro hp mp zfa wbphenotype ecto upheno uberon_cm doid chebi pr

OWL_SQL = rdf owl
OBO_SQL = $(OWL_SQL) obo-checks
RG_SQL = $(OWL_SQL) relation-graph
ALL_SQL = $(OWL_SQL) relation-graph obo-checks


all: $(patsubst %,all-%,$(ONTS))

all-%: target/%.load target/%.views inferences/%.load reports/%.problems.tsv
echo $*

# Remove the build artefacts of a single ontology: its stamp files under
# target/ and its SQLite database under db/.
# -f keeps the recipe from aborting when some of the files are already gone,
# so the database is still removed after a partial or repeated clean.
realclean-%:
	rm -f target/$*.*
	rm -f db/$*.db

ALL_SQL_FILES = $(patsubst %,sql/%.sql,$(ALL_SQL))
sql/all.sql: $(ALL_SQL_FILES)
cat $^ > $@

schemaload-%: db/%.db sql/all.sql
sqlite3 -cmd ".echo on" $< < sql/all.sql && touch $@
# ---
# sqlite db creation and loading
# ---
target/%.created:
touch db/$*.db
db/%.db: prefixes/prefix.sql sql/rdftab.sql
cat $^ | sqlite3 $@ && echo OK || echo ALREADY LOADED
.PRECIOUS: db/%.db

problems-%: db/%.db schemaload-%
sqlite3 $< -cmd "SELECT * FROM problems"
target/%.load: target/%.created owl/%.owl
./bin/rdftab db/$*.db < owl/$*.owl && touch $@
.PRECIOUS: target/%.load

db/%.db: prefixes/prefix.sql
sqlite3 $@ < $<
.PRECIOUS: db/%.db
# Apply the combined SQL definitions in sql/all.sql to the ontology database
# and record completion with the stamp file target/<ont>.views.
# NOTE(review): because of the `;` the stamp is touched even when sqlite3
# reports an error - presumably to tolerate definitions that already exist;
# confirm this is intended.
target/%.views: db/%.db sql/all.sql
	sqlite3 -cmd ".echo on" $< < sql/all.sql ; touch $@
# Mark this rule's own stamp as precious (previously named target/%.load).
.PRECIOUS: target/%.views


# ---
# Inferences
# ---
# We use relation-graph
inferences/%-inf.ttl: owl/%.owl
relation-graph --ontology-file $< --redundant-output-file $@ --non-redundant-output-file inferences/$*-nr.ttl --property http://purl.obolibrary.org/obo/BFO_0000050
.PRECIOUS: inferences/%-inf.ttl

inferences/%-inf.owl: inferences/%-inf.ttl
robot convert -i $< -o $@
.PRECIOUS: inferences/%-inf.owl
# Flatten the inferred-edge OWL file into a tab-separated
# subject/predicate/object table:
#   1. load the prefixes and the OWL file into a scratch database ($@.db),
#   2. dump the statements table (with a header row) to $@.tmp,
#   3. set the scratch database aside as $@.db.old,
#   4. move the finished dump into place.
inferences/%-inf.tsv: inferences/%-inf.owl
	sqlite3 $@.db < prefixes/prefix.sql && \
	./bin/rdftab $@.db < $< && \
	sqlite3 $@.db -cmd '.separator "\t"' -cmd '.header on' "SELECT subject,predicate,object FROM statements " > $@.tmp && \
	mv $@.db $@.db.old && \
	mv $@.tmp $@
.PRECIOUS: inferences/%-inf.tsv

inferences/%.load: db/%.db inferences/%-inf.tsv
sqlite3 $< -cmd '.separator "\t"' '.import inferences/$*-inf.tsv entailed_edge' && touch $@
.PRECIOUS: inferences/%.load


# ---
# Reports
# ---

# Dump every row of `problems` for one ontology into a report file.
# The output goes to a temporary file that is renamed only on success, so a
# failed query cannot leave an empty report that looks up to date.
reports/%.problems.tsv: db/%.db target/%.views
	@mkdir -p $(@D)
	sqlite3 $< "SELECT * FROM problems" > $@.tmp && mv $@.tmp $@


# ---
# Downloads
# ---

# Fetch an ontology from its OBO PURL.
# The download is written to a temporary file and renamed only after curl
# succeeds, so an interrupted transfer never leaves a truncated target.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the ontology.
owl/%.owl:
	@mkdir -p $(@D)
	curl -L -s --fail http://purl.obolibrary.org/obo/$*.owl > $@.tmp && mv $@.tmp $@
.PRECIOUS: owl/%.owl

# GO is fetched from the go-plus extension URL rather than the generic PURL;
# the download is handled the same way as in the pattern rule.
owl/go.owl:
	@mkdir -p $(@D)
	curl -L -s --fail http://purl.obolibrary.org/obo/go/extensions/go-plus.owl > $@.tmp && mv $@.tmp $@

# ---
# GO Demo
# ---
# Download a gzipped GAF annotation file, decompress it and convert it to TSV
# with ./utils/gaf2tsv.
# The result is written to a temporary file and renamed on success so a failed
# conversion does not leave a partial target behind.
# NOTE(review): only the exit status of the last pipeline stage is checked, so
# a failed download is still not detected here.
demo/gaf/%.gaf.tsv:
	@mkdir -p $(@D)
	curl -L -s http://current.geneontology.org/annotations/$*.gaf.gz | gzip -dc | ./utils/gaf2tsv > $@.tmp && mv $@.tmp $@
loadgaf-%: demo/gaf/%.gaf.tsv
sqlite3 db/go.db -cmd '.separator "\t"' '.import $< gaf' && touch $@

download/idmapping.dat.gz:
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz -O $@

CAMDIR = ../noctua-models/models/
loadcams:
find $(CAMDIR) -name "*.ttl" -exec sh -c "riot --out rdfxml {} | ./bin/rdftab db/go.db" \;

# ---
# Experimental: sqlalchemy bindings
# ---
semsql/sqlaviews.py: db/foo.db
sqlacodegen sqlite:///$< > $@

# ---
# Schema
# ---

# TODO: markdown gen should make modular output
markdown-%: src/schema/%.yaml
gen-markdown --no-mergeimports -d docs $< && mv docs/index.md docs/$*_index.md
markdown: markdown-rdf markdown-owl
gen-markdown --no-mergeimports -d docs src/schema/semsql.yaml

load-%: db/%.db owl/%.owl
./bin/rdftab $< < owl/$*.owl && touch $@
gen-ddl: ddl/rdf.sql
ddl/%.sql: src/schema/%.yaml
gen-sqlddl --no-use-foreign-keys $< > $@.tmp && \
python semsql/sqlutils.py $< >> $@.tmp && \
mv $@.tmp $@

semsql/sqla/%.py: src/schema/%.yaml
gen-sqlddl --no-use-foreign-keys --sqla-file $@ $<
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ views for querying ontologies such as GO, that incorporate critical
information in existential axioms, the view `edge` provides a union of
subclass between named classes and subclasses of existentials.

## Validation

The general philosophy is not to use foreign key constraints or
triggers to enforce integrity. Instead, we allow problematic data into
the database and provide transparent reporting on it, along with ways
to validate it. Individual use cases may call for more aggressive filtering.

The convention is to write rules/constraints as SQL views whose names start with `problem_`.

See also:

* ROBOT report
* GO Rules
* KGX validation

## Relationship to rdftab.rs

We use the same schema as rdftab.rs, and rdftab can be used as a performant robust loader.
Expand All @@ -78,6 +93,10 @@ rdftab provides a useful base standard that could be used e.g. for
distributing ontologies and semantic databases as sqlite, for which a
variety of performant tools can be written.

## Schema

See [LinkML Docs](https://cmungall.github.io/semantic-sql/)

## Design Philosophy

SPARQL has many appealing qualities and it was designed first and foremost for the semantic web. But there are problems:
Expand Down
5 changes: 5 additions & 0 deletions environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/sh
# Create the project virtualenv in ./venv, activate it, and put the current
# directory on PYTHONPATH.
# NOTE(review): activation and the export only persist in the caller's shell
# if this file is sourced (`. ./environment.sh`) - confirm intended usage.
python -m venv venv
# `.` is the POSIX spelling; `source` is not available in every /bin/sh.
. venv/bin/activate
# Append the previous value only when it is non-empty, avoiding a trailing ':'.
export PYTHONPATH=.${PYTHONPATH:+:$PYTHONPATH}

7 changes: 7 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
site_name: "Semantic SQL"
theme: readthedocs
nav:
- Home: index.md
- RDF: rdf_index.md
- OWL: owl_index.md

5 changes: 5 additions & 0 deletions prefixes/prefix.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,18 @@ INSERT OR IGNORE INTO prefix VALUES
("BFO", "http://purl.obolibrary.org/obo/BFO_"),
("CHEBI", "http://purl.obolibrary.org/obo/CHEBI_"),
("CL", "http://purl.obolibrary.org/obo/CL_"),
("RO", "http://purl.obolibrary.org/obo/RO_"),
("GO", "http://purl.obolibrary.org/obo/GO_"),
("UBERON", "http://purl.obolibrary.org/obo/UBERON_"),
("PATO", "http://purl.obolibrary.org/obo/PATO_"),
("IAO", "http://purl.obolibrary.org/obo/IAO_"),
("NCBITaxon", "http://purl.obolibrary.org/obo/NCBITaxon_"),
("OBI", "http://purl.obolibrary.org/obo/OBI_"),
("PR", "http://purl.obolibrary.org/obo/PR_"),

("obo", "http://purl.obolibrary.org/obo/"),

("gocam", "http://model.geneontology.org/"),
("UP", "http://purl.uniprot.org/uniprot/"),
("UC", "http://purl.uniprot.org/core/"),
("PRO", "http://www.uniprot.org/annotation/PRO_"),
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
linkml
mkdocs
sqlalchemy
sqlacodegen
Empty file added semsql/__init__.py
Empty file.
Empty file added semsql/sqla/__init__.py
Empty file.
101 changes: 101 additions & 0 deletions semsql/sqla/owl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from sqlalchemy import Column, Index, Table, Text
from sqlalchemy.sql.sqltypes import NullType
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
metadata = Base.metadata

class BlankNode(Base):
__tablename__ = 'blank_node'
id = Column(Text, primary_key=True)

class ClassNode(Base):
__tablename__ = 'class_node'
id = Column(Text, primary_key=True)

class IriNode(Base):
__tablename__ = 'iri_node'
id = Column(Text, primary_key=True)

class NamedIndividualNode(Base):
__tablename__ = 'named_individual_node'
id = Column(Text, primary_key=True)

class Node(Base):
__tablename__ = 'node'
id = Column(Text, primary_key=True)

class OntologyNode(Base):
__tablename__ = 'ontology_node'
id = Column(Text, primary_key=True)

class OwlEquivalentClassStatement(Base):
__tablename__ = 'owl_equivalent_class_statement'
stanza = Column(Text, primary_key=True)
predicate = Column(Text, primary_key=True)
value = Column(Text, primary_key=True)
language = Column(Text, primary_key=True)
subject = Column(Text, primary_key=True)
object = Column(Text, primary_key=True)

class OwlRestriction(Base):
__tablename__ = 'owl_restriction'
restriction = Column(Text, primary_key=True)
on_property = Column(Text, primary_key=True)
filler = Column(Text, primary_key=True)

class OwlSomeValuesFrom(Base):
__tablename__ = 'owl_some_values_from'
restriction = Column(Text, primary_key=True)
on_property = Column(Text, primary_key=True)
filler = Column(Text, primary_key=True)

class Prefix(Base):
"""
Maps CURIEs to URIs
"""
__tablename__ = 'prefix'
prefix = Column(Text, primary_key=True)
base = Column(Text, primary_key=True)

class RdfTypeStatement(Base):
"""
A statement that indicates the asserted type of the subject entity
"""
__tablename__ = 'rdf_type_statement'
stanza = Column(Text, primary_key=True)
subject = Column(Text, primary_key=True)
predicate = Column(Text, primary_key=True)
value = Column(Text, primary_key=True)
language = Column(Text, primary_key=True)
object = Column(Text, primary_key=True)

class RdfsLabelStatement(Base):
__tablename__ = 'rdfs_label_statement'
stanza = Column(Text, primary_key=True)
subject = Column(Text, primary_key=True)
predicate = Column(Text, primary_key=True)
object = Column(Text, primary_key=True)
value = Column(Text, primary_key=True)
language = Column(Text, primary_key=True)

class RdfsSubClassOfStatement(Base):
__tablename__ = 'rdfs_subClassOf_statement'
stanza = Column(Text, primary_key=True)
predicate = Column(Text, primary_key=True)
value = Column(Text, primary_key=True)
language = Column(Text, primary_key=True)
subject = Column(Text, primary_key=True)
object = Column(Text, primary_key=True)

class Statements(Base):
"""
Represents an RDF triple
"""
__tablename__ = 'statements'
stanza = Column(Text, primary_key=True)
subject = Column(Text, primary_key=True)
predicate = Column(Text, primary_key=True)
object = Column(Text, primary_key=True)
value = Column(Text, primary_key=True)
language = Column(Text, primary_key=True)
Empty file added semsql/sqla/rdf.py
Empty file.
44 changes: 44 additions & 0 deletions semsql/sqlutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import click
from linkml_model import SchemaDefinition
from linkml.utils.formatutils import underscore
from linkml.utils.schemaloader import load_raw_schema, SchemaLoader

VIEW_CODE = 'sqlview>>'

def generate_views_from_linkml(schema: SchemaDefinition, drop_tables=True) -> None:
    """
    Generates SQL VIEW statements from hints in LinkML schema

    View hints are encoded in the comments of each class: a comment that
    starts with ``VIEW_CODE`` carries the SQL for the view. When a class has
    several hints they are combined with UNION. The statements are printed
    to stdout; classes without hints produce no output.

    :param schema: LinkML schema containing hints
    :param drop_tables: if true, print a DROP TABLE for the class's table
        before the CREATE VIEW statement
    """
    for cn, c in schema.classes.items():
        sql_table = underscore(cn)
        views = []
        for cmt in c.comments:
            cmt = cmt.strip().rstrip(';')
            if cmt.startswith(VIEW_CODE):
                # Remove only the leading marker; the SQL itself is kept as written.
                views.append(cmt[len(VIEW_CODE):].strip())
        if views:
            print()
            if drop_tables:
                print(f'DROP TABLE {sql_table};')
            # Each hint is stripped, so UNION needs surrounding spaces to
            # stay a separate keyword between the SELECT statements.
            view_sql = ' UNION '.join(views)
            print(f'CREATE VIEW {sql_table} AS {view_sql};')

@click.command()
@click.argument('inputs', nargs=-1)
def cli(inputs):
    """
    Generates SQL VIEW commands from hints embedded in linkml schema
    """
    # Each command-line argument is a schema path, processed independently.
    for schema_path in inputs:
        with open(schema_path, 'r') as stream:
            schema = load_raw_schema(schema_path)
            # Header comments that precede the generated statements.
            print('-- ** REWRITE TABLES AS VIEWS **')
            print(f'-- SCHEMA: {schema.id}')
            # Resolve the schema with imports merged before emitting the views.
            resolver = SchemaLoader(schema, mergeimports=True)
            resolver.resolve()
            generate_views_from_linkml(schema)

if __name__ == '__main__':
cli()
17 changes: 17 additions & 0 deletions sql/go-annotation.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
These operate over NON-NORMALIZED gaf tables, for demo purposes
No attempt is yet made to parse pipe-separated fields like taxon, qualifier
*/


-- Annotations whose ontology term is listed in `deprecated`.
CREATE VIEW annotation_to_deprecated AS
  SELECT *
  FROM gaf
  WHERE ontology_class_ref IN (SELECT id FROM deprecated);

-- todo: include redundant
-- Each annotation paired with every entailed edge whose subject is the annotated term.
CREATE VIEW entailed_gaf AS
  SELECT gaf.*,
         e.predicate AS inferred_predicate,
         e.object AS ancestor_term
  FROM gaf
  JOIN entailed_edge AS e ON (gaf.ontology_class_ref = e.subject);

-- stats
-- Number of annotation rows per taxon.
CREATE VIEW num_annotation_by_taxon AS
  SELECT db_object_taxon,
         count(*) AS num_annotations
  FROM gaf
  GROUP BY db_object_taxon;

-- Number of distinct annotated terms per taxon.
CREATE VIEW num_term_by_taxon AS
  SELECT db_object_taxon,
         count(DISTINCT ontology_class_ref) AS num_terms_annotated
  FROM gaf
  GROUP BY db_object_taxon;
Loading

0 comments on commit b22d1ef

Please sign in to comment.