Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overcome backwards xl-id form by handling both #422

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
context="sys/context/base.jsonld",
system_base_iri="",
union="common.jsonld.lines",
last_backwards_id_time="2022-10-14T16:26:16Z"
)

if __name__ == "__main__":
Expand Down
60 changes: 45 additions & 15 deletions lxltools/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def __init__(self, *,
context=None,
record_thing_link='mainEntity',
system_base_iri=None,
union='all.jsonld.lines'):
union='all.jsonld.lines',
last_backwards_id_time=None):
self.datasets_description = datasets_description
self.datasets = {}
self.current_ds_resources = set()
Expand All @@ -49,6 +50,11 @@ def __init__(self, *,
self.current_ds_file = None
self.no_records = False

self.last_backwards_id_time = (
timeutil.w3c_dtz_to_ms(last_backwards_id_time)
if isinstance(last_backwards_id_time, str)
else None)

if datasets_description:
self._handlers_from_datasets_description(datasets_description)

Expand Down Expand Up @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result):
data = self.to_jsonld(data)

ds_url = urljoin(self.dataset_id, name)
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
self._create_dataset_description(
ds_url, ds_created_ms, ds_created_ms=ds_created_ms)

base_id = urljoin(self.dataset_id, base)

Expand All @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result):
modified_ms = None
fpath = urlparse(nodeid).path[1:]

if self.no_records:
self.write(node, fpath)
continue

meta = node.pop('meta', None)
if meta:
if 'created' in meta:
Expand All @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result):
node,
created_ms,
modified_ms,
datasets=[self.dataset_id, ds_url])
self.write(desc, fpath)
datasets=[self.dataset_id, ds_url],
ds_created_ms=ds_created_ms)

# Keep sameAs "forwards" form in meta even if no_records is used
if self.no_records:
meta = meta or {}
sameas = meta.setdefault('sameAs', [])
rec = desc['@graph'][0]
if 'sameAs' in rec:
sameas.append({"@id": rec['@id']})
for same in rec.get('sameAs', []):
sameas.append(same)
node['meta'] = meta
self.write(node, fpath)
else:
self.write(desc, fpath)

def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
label=None, ds_created_ms=None):
if not label:
label = ds_url.rsplit('/', 1)[-1]
ds = {
Expand All @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
return

desc = self._to_node_description(ds, created_ms, modified_ms,
datasets={self.dataset_id, ds_url})
datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)

record = desc['@graph'][0]
if self.tool_id:
Expand All @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
self.write(desc, ds_path)

def _to_node_description(self, node, created_ms,
modified_ms=None, datasets=None):
modified_ms=None, datasets=None, ds_created_ms=None):
assert self.record_thing_link not in node

node_id = node['@id']

record = OrderedDict()
record['@type'] = 'Record'
record['@id'] = self.generate_record_id(created_ms, node_id)

self.set_record_id(record, created_ms, node_id, ds_created_ms)

record[self.record_thing_link] = {'@id': node_id}

# Add provenance
Expand All @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms,

return {'@graph': items}

def generate_record_id(self, created_ms, node_id):
# FIXME: backwards_form=created_ms < 2015
slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
    """Assign ``record['@id']``, choosing between the two slug forms.

    The "backwards" slug form is used when the dataset creation time is
    strictly earlier than ``self.last_backwards_id_time``. In that case the
    id in the regular form is also recorded under ``record['sameAs']`` so
    that both forms refer to the same record.

    :param record: mutable mapping describing the record; updated in place.
    :param created_ms: creation time of the node, passed on to
        ``generate_record_id``.
    :param node_id: id of the described node, passed on to
        ``generate_record_id``.
    :param ds_created_ms: creation time of the containing dataset; falls
        back to ``created_ms`` when not given.
    """
    if ds_created_ms is None:
        ds_created_ms = created_ms

    threshold = self.last_backwards_id_time
    # The threshold is None unless the compiler was configured with a time
    # string. Ordering an int against None raises TypeError in Python 3, so
    # treat "no threshold" as "never use the backwards form".
    backwards_form = threshold is not None and ds_created_ms < threshold

    # TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
    record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
    if backwards_form:
        record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]

def generate_record_id(self, created_ms, node_id, backwards_form=False):
    """Return a record id built from a creation time and a node id.

    The slug comes from ``lxlslug.librisencode``, fed with ``created_ms``
    and the ``lxlslug.checksum`` of ``node_id``; ``backwards_form`` is
    forwarded to the encoder unchanged. The slug is then resolved against
    ``self.system_base_iri``.
    """
    node_checksum = lxlslug.checksum(node_id)
    record_slug = lxlslug.librisencode(
        created_ms, node_checksum, backwards_form=backwards_form
    )
    return urljoin(self.system_base_iri, record_slug)

def write(self, node, name):
Expand Down
12 changes: 7 additions & 5 deletions lxltools/lxlslug.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function

from typing import Any
from zlib import crc32
import string
import time
Expand Down Expand Up @@ -32,9 +31,12 @@ def rotate(c):
def checksum(data):
    """Return the CRC-32 of ``data`` (UTF-8 encoded), masked to 32 bits."""
    encoded = data.encode('utf-8')
    crc = crc32(encoded)
    # Mask so the result is always within the unsigned 32-bit range.
    return crc & 0xffffffff

def librisencode(a, b):
def librisencode(a, b, backwards_form=False):
alphabet = lower_consonants_numbers
timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
chars = caesarize(alphabet, tobase(alphabet, a))
if backwards_form:
chars = reversed(chars)
timepart = "".join(chars)
codepart = tobase(alphabet, b)
codelen = len(codepart)
if codelen < 7:
Expand All @@ -53,7 +55,7 @@ def librisencode(a, b):
print("Usage: %s TIMESTAMP IDENTIFIER" % (cmd), file=sys.stderr)
exit(1)

timestamp = args.pop(0)
timestamp: Any = args.pop(0)
identifiers = args

try:
Expand Down
5 changes: 3 additions & 2 deletions syscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def _get_repo_version():
context='sys/context/base.jsonld',
record_thing_link='mainEntity',
system_base_iri='',
union='syscore.jsonld.lines')
union='syscore.jsonld.lines',
last_backwards_id_time='2022-11-20T00:00:00Z')


@compiler.handler
Expand Down Expand Up @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
record = {'@type': 'SystemRecord'}
record[compiler.record_thing_link] = {'@id': entity['@id']}
graph.insert(0, record)
record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
compiler.set_record_id(record, created_ms, entity['@id'])
record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]


Expand Down