Choose the record id form (backwards or normal) from a configurable cutoff time.

Review notes (text before the first "diff --git" line is ignored by patch tools):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify the context lines against the tree before applying.
- Fix: _compile_dataset passes ds_modified_ms to _create_dataset_description
  again (the rewritten call had dropped it).
- Fix: set_record_id no longer compares an int with None when no
  last_backwards_id_time is configured; the normal id form is used instead.
- Docstrings/comments were added, so the "+" hunk offsets differ from the
  original patch; the "index" lines are kept as they were and do not describe
  the revised post-images.
- Assumes timeutil is already imported in lxltools/datacompiler.py -- confirm.

diff --git a/common.py b/common.py
index 7ad3199db..9d97a0b45 100644
--- a/common.py
+++ b/common.py
@@ -11,6 +11,7 @@
     context="sys/context/base.jsonld",
     system_base_iri="",
     union="common.jsonld.lines",
+    last_backwards_id_time="2022-10-14T16:26:16Z",
 )
 
 if __name__ == "__main__":
diff --git a/lxltools/datacompiler.py b/lxltools/datacompiler.py
index d94be9e25..64a47ed63 100644
--- a/lxltools/datacompiler.py
+++ b/lxltools/datacompiler.py
@@ -33,7 +33,8 @@ def __init__(self, *,
                  context=None,
                  record_thing_link='mainEntity',
                  system_base_iri=None,
-                 union='all.jsonld.lines'):
+                 union='all.jsonld.lines',
+                 last_backwards_id_time=None):
         self.datasets_description = datasets_description
         self.datasets = {}
         self.current_ds_resources = set()
@@ -49,6 +50,13 @@ def __init__(self, *,
         self.current_ds_file = None
         self.no_records = False
 
+        # Cutoff in ms, as converted by timeutil.w3c_dtz_to_ms; stays None
+        # unless last_backwards_id_time is given as a string.
+        self.last_backwards_id_time = (
+            timeutil.w3c_dtz_to_ms(last_backwards_id_time)
+            if isinstance(last_backwards_id_time, str)
+            else None)
+
         if datasets_description:
             self._handlers_from_datasets_description(datasets_description)
 
@@ -155,7 +163,8 @@ def _compile_dataset(self, name, result):
             data = self.to_jsonld(data)
 
         ds_url = urljoin(self.dataset_id, name)
-        self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
+        self._create_dataset_description(
+            ds_url, ds_created_ms, ds_modified_ms, ds_created_ms=ds_created_ms)
 
         base_id = urljoin(self.dataset_id, base)
 
@@ -172,10 +181,6 @@ def _compile_dataset(self, name, result):
             modified_ms = None
             fpath = urlparse(nodeid).path[1:]
 
-            if self.no_records:
-                self.write(node, fpath)
-                continue
-
             meta = node.pop('meta', None)
             if meta:
                 if 'created' in meta:
@@ -189,10 +194,25 @@ def _compile_dataset(self, name, result):
                 node,
                 created_ms,
                 modified_ms,
-                datasets=[self.dataset_id, ds_url])
-            self.write(desc, fpath)
+                datasets=[self.dataset_id, ds_url],
+                ds_created_ms=ds_created_ms)
+
+            # Keep sameAs "forwards" form in meta even if no_records is used
+            if self.no_records:
+                meta = meta or {}
+                sameas = meta.setdefault('sameAs', [])
+                rec = desc['@graph'][0]
+                if 'sameAs' in rec:
+                    sameas.append({"@id": rec['@id']})
+                for same in rec.get('sameAs', []):
+                    sameas.append(same)
+                node['meta'] = meta
+                self.write(node, fpath)
+            else:
+                self.write(desc, fpath)
 
-    def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
+    def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
+            label=None, ds_created_ms=None):
         if not label:
             label = ds_url.rsplit('/', 1)[-1]
         ds = {
@@ -211,7 +231,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
             return
 
         desc = self._to_node_description(ds, created_ms, modified_ms,
-                datasets={self.dataset_id, ds_url})
+                datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)
 
         record = desc['@graph'][0]
         if self.tool_id:
@@ -220,14 +240,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
         self.write(desc, ds_path)
 
     def _to_node_description(self, node, created_ms,
-            modified_ms=None, datasets=None):
+            modified_ms=None, datasets=None, ds_created_ms=None):
         assert self.record_thing_link not in node
 
         node_id = node['@id']
 
         record = OrderedDict()
         record['@type'] = 'Record'
-        record['@id'] = self.generate_record_id(created_ms, node_id)
+
+        self.set_record_id(record, created_ms, node_id, ds_created_ms)
+
         record[self.record_thing_link] = {'@id': node_id}
 
         # Add provenance
@@ -241,9 +263,31 @@ def _to_node_description(self, node, created_ms,
 
         return {'@graph': items}
 
-    def generate_record_id(self, created_ms, node_id):
-        # FIXME: backwards_form=created_ms < 2015
-        slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
+    def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
+        """Set the '@id' (and possibly 'sameAs') of record in place.
+
+        The id uses the backwards slug form when ds_created_ms (defaulting
+        to created_ms) is earlier than self.last_backwards_id_time; the
+        normal-form id is then kept as the single 'sameAs' entry. Without a
+        configured cutoff the normal form is used.
+        """
+        if ds_created_ms is None:
+            ds_created_ms = created_ms
+        cutoff_ms = self.last_backwards_id_time
+        # The cutoff is None unless a time string was configured; comparing
+        # an int with None would raise TypeError.
+        backwards_form = cutoff_ms is not None and ds_created_ms < cutoff_ms
+        # TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
+        record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
+        if backwards_form:
+            record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]
+
+    def generate_record_id(self, created_ms, node_id, backwards_form=False):
+        """Return a record IRI: a slug of created_ms and a checksum of node_id,
+        resolved against self.system_base_iri."""
+        slug = lxlslug.librisencode(
+            created_ms, lxlslug.checksum(node_id), backwards_form=backwards_form
+        )
         return urljoin(self.system_base_iri, slug)
 
     def write(self, node, name):
diff --git a/lxltools/lxlslug.py b/lxltools/lxlslug.py
index ee514925f..e6e72ef2e 100755
--- a/lxltools/lxlslug.py
+++ b/lxltools/lxlslug.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
-from __future__ import unicode_literals, print_function
-
+from typing import Any
 from zlib import crc32
 import string
 import time
@@ -32,9 +31,16 @@ def rotate(c):
 def checksum(data):
     return crc32(data.encode('utf-8')) & 0xffffffff
 
-def librisencode(a, b):
+def librisencode(a, b, backwards_form=False):
+    """Encode a (time part) and b (code part) for use in a slug.
+
+    With backwards_form, the encoded time part is reversed.
+    """
     alphabet = lower_consonants_numbers
-    timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
+    chars = caesarize(alphabet, tobase(alphabet, a))
+    if backwards_form:
+        chars = reversed(chars)
+    timepart = "".join(chars)
     codepart = tobase(alphabet, b)
     codelen = len(codepart)
     if codelen < 7:
@@ -53,7 +59,7 @@ def librisencode(a, b):
         print("Usage: %s TIMESTAMP IDENTIFIER" % (cmd), file=sys.stderr)
         exit(1)
 
-    timestamp = args.pop(0)
+    timestamp: Any = args.pop(0)
     identifiers = args
 
     try:
diff --git a/syscore.py b/syscore.py
index 246ae2451..8bd7ba977 100644
--- a/syscore.py
+++ b/syscore.py
@@ -31,7 +31,8 @@ def _get_repo_version():
         context='sys/context/base.jsonld',
         record_thing_link='mainEntity',
         system_base_iri='',
-        union='syscore.jsonld.lines')
+        union='syscore.jsonld.lines',
+        last_backwards_id_time='2022-11-20T00:00:00Z')
 
 
 @compiler.handler
@@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
     record = {'@type': 'SystemRecord'}
     record[compiler.record_thing_link] = {'@id': entity['@id']}
     graph.insert(0, record)
-    record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
+    compiler.set_record_id(record, created_ms, entity['@id'])
     record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]
 
 