diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index ce942720..08c78d0d 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -421,9 +421,10 @@ def _insert_or_update_temporal(self, dataset_dict, key, value): else: dataset_dict["extras"].append({"key": key, "value": value}) - def _agent_details(self, subject, predicate): + def _agents_details(self, subject, predicate): """ - Returns a dict with details about a dct:publisher or dct:creator entity, a foaf:Agent + Returns a list of dicts with details about a foaf:Agent property, e.g. + dct:publisher or dct:creator entity. Both subject and predicate must be rdflib URIRef or BNode objects @@ -441,21 +442,26 @@ def _agent_details(self, subject, predicate): an empty string if they could not be found. """ - agent_details = {} - + agents = [] for agent in self.g.objects(subject, predicate): + agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" agent_details["name"] = self._object_value(agent, FOAF.name) agent_details["email"] = self._object_value(agent, FOAF.mbox) + if not agent_details["email"]: + agent_details["email"] = self._without_mailto( + self._object_value(agent, VCARD.hasEmail) + ) agent_details["url"] = self._object_value(agent, FOAF.homepage) agent_details["type"] = self._object_value(agent, DCT.type) agent_details['identifier'] = self._object_value(agent, DCT.identifier) + agents.append(agent_details) - return agent_details + return agents def _contact_details(self, subject, predicate): """ - Returns a dict with details about a vcard expression + Returns a list of dicts with details about vcard expressions Both subject and predicate must be rdflib URIRef or BNode objects @@ -463,10 +469,10 @@ def _contact_details(self, subject, predicate): an empty string if they could not be found """ - contact = {} - + contacts = [] for agent in self.g.objects(subject, predicate): + contact = {} contact["uri"] = str(agent) if isinstance(agent, URIRef) else "" contact["name"] = self._get_vcard_property_value( @@ -478,8 +484,9 @@ def _contact_details(self, subject, predicate): ) contact["identifier"] = self._get_vcard_property_value(agent, VCARD.hasUID) + contacts.append(contact) - return contact + return contacts def _parse_geodata(self, spatial, datatype, cur_value): """ @@ -1148,10 +1155,13 @@ def _extract_catalog_dict(self, catalog_ref): if val: out.append({"key": key, "value": val}) + publishers = self._agents_details(catalog_ref, DCT.publisher) + if publishers: + publisher = publishers[0] out.append( { "key": "source_catalog_publisher", - "value": json.dumps(self._agent_details(catalog_ref, DCT.publisher)), + "value": json.dumps(publisher), } ) return out diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index ad40d988..57458c6a 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -108,33 +108,43 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): dataset_dict["extras"].append({"key": key, "value": json.dumps(values)}) # Contact details - contact = self._contact_details(dataset_ref, DCAT.contactPoint) - if not contact: - # adms:contactPoint was supported on the first version of DCAT-AP - contact = self._contact_details(dataset_ref, ADMS.contactPoint) - - if contact: - for key in ("uri", "name", "email", "identifier"): - if contact.get(key): - dataset_dict["extras"].append( - {"key": "contact_{0}".format(key), "value": contact.get(key)} - ) - - # Publisher - publisher = self._agent_details(dataset_ref, DCT.publisher) - for key in ("uri", "name", "email", "url", "type", "identifier"): - if publisher.get(key): - dataset_dict["extras"].append( - {"key": "publisher_{0}".format(key), "value": publisher.get(key)} - ) + if self._schema_field("contact"): + # This is a scheming field, will be hanlded in a separate profile + pass + else: + contact = self._contact_details(dataset_ref, DCAT.contactPoint) + if not contact: + # adms:contactPoint was supported on the first version of DCAT-AP + contact = self._contact_details(dataset_ref, ADMS.contactPoint) + if contact: + contact = contact[0] + for key in ("uri", "name", "email", "identifier"): + if contact.get(key): + dataset_dict["extras"].append( + { + "key": "contact_{0}".format(key), + "value": contact.get(key) + } + ) - # Creator - creator = self._agent_details(dataset_ref, DCT.creator) - for key in ("uri", "name", "email", "url", "type", "identifier"): - if creator.get(key): - dataset_dict["extras"].append( - {"key": "creator_{0}".format(key), "value": creator.get(key)} - ) + # Publishers and creators + for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]: + agent_key, predicate = item + if self._schema_field(agent_key): + # This is a scheming field, will be hanlded in a separate profile + pass + else: + agents = self._agents_details(dataset_ref, predicate) + if agents: + agent = agents[0] + for key in ("uri", "name", "email", "url", "type", "identifier"): + if agent.get(key): + dataset_dict["extras"].append( + { + "key": f"{agent_key}_{key}", + "value": agent.get(key) + } + ) # Temporal start, end = self._time_interval(dataset_ref, DCT.temporal) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index f87c94ca..ca72cd21 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -87,7 +87,7 @@ def _parse_list_value(data_dict, field_name): check_name = new_fields_mapping.get(field_name, field_name) for extra in dataset_dict.get("extras", []): if extra["key"].startswith(f"{check_name}_"): - subfield = extra["key"][extra["key"].index("_") + 1:] + subfield = extra["key"][extra["key"].index("_") + 1 :] if subfield in [ f["field_name"] for f in schema_field["repeating_subfields"] ]: @@ -100,6 +100,18 @@ def _parse_list_value(data_dict, field_name): dataset_dict[field_name] = [new_dict] dataset_dict["extras"] = new_extras + # Contact details + contacts = self._contact_details(dataset_ref, DCAT.contactPoint) + if contacts: + dataset_dict["contact"] = contacts + + # Publishers and creators + for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]: + key, predicate = item + agents = self._agents_details(dataset_ref, predicate) + if agents: + dataset_dict[key] = agents + # Repeating subfields: resources for schema_field in self._dataset_schema["resource_fields"]: if "repeating_subfields" in schema_field: @@ -124,7 +136,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): Add triples to the graph from new repeating subfields """ contact = dataset_dict.get("contact") - if isinstance(contact, list) and len(contact) and self._not_empty_dict(contact[0]): + if ( + isinstance(contact, list) + and len(contact) + and self._not_empty_dict(contact[0]) + ): for item in contact: contact_uri = item.get("uri") if contact_uri: @@ -150,11 +166,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): contact_details, VCARD.hasUID, "identifier", - _type=URIRefOrLiteral + _type=URIRefOrLiteral, ) - self._add_agent(dataset_ref, dataset_dict, "publisher", DCT.publisher) - self._add_agent(dataset_ref, dataset_dict, "creator", DCT.creator) + self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher) + self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator) temporal = dataset_dict.get("temporal_coverage") if ( @@ -172,7 +188,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): self.g.add((dataset_ref, DCT.temporal, temporal_ref)) spatial = dataset_dict.get("spatial_coverage") - if isinstance(spatial, list) and len(spatial) and self._not_empty_dict(spatial[0]): + if ( + isinstance(spatial, list) + and len(spatial) + and self._not_empty_dict(spatial[0]) + ): for item in spatial: if item.get("uri"): spatial_ref = CleanedURIRef(item["uri"]) @@ -205,55 +225,59 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): except ValueError: pass - def _add_agent(self, dataset_ref, dataset_dict, agent_key, rdf_predicate): + def _add_agents( + self, dataset_ref, dataset_dict, agent_key, rdf_predicate, first_only=False + ): """ - Adds an agent (publisher or creator) to the RDF graph. + Adds one or more agents (e.g. publisher or creator) to the RDF graph. :param dataset_ref: The RDF reference of the dataset :param dataset_dict: The dataset dictionary containing agent information - :param agent_key: 'publisher' or 'creator' to specify the agent - :param rdf_predicate: The RDF predicate (DCT.publisher or DCT.creator) + :param agent_key: field name in the CKAN dict (.e.g. "publisher", "creator", etc) + :param rdf_predicate: The RDF predicate (DCT.publisher, DCT.creator, etc) + :first_only: Add the first item found only (used for 0..1 properties) """ agent = dataset_dict.get(agent_key) - if ( - isinstance(agent, list) - and len(agent) - and self._not_empty_dict(agent[0]) - ): - agent = agent[0] - agent_uri = agent.get("uri") - if agent_uri: - agent_ref = CleanedURIRef(agent_uri) - else: - agent_ref = BNode() - - self.g.add((agent_ref, RDF.type, FOAF.Agent)) - self.g.add((dataset_ref, rdf_predicate, agent_ref)) - - self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") - self._add_triple_from_dict(agent, agent_ref, FOAF.homepage, "url", _type=URIRef) - self._add_triple_from_dict( - agent, - agent_ref, - DCT.type, - "type", - _type=URIRefOrLiteral, - ) - self._add_triple_from_dict( - agent, - agent_ref, - VCARD.hasEmail, - "email", - _type=URIRef, - value_modifier=self._add_mailto, - ) - self._add_triple_from_dict( - agent, - agent_ref, - DCT.identifier, - "identifier", - _type=URIRefOrLiteral - ) + if isinstance(agent, list) and len(agent) and self._not_empty_dict(agent[0]): + agents = [agent[0]] if first_only else agent + + for agent in agents: + + agent_uri = agent.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, RDF.type, FOAF.Agent)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + self._add_triple_from_dict( + agent, agent_ref, FOAF.homepage, "url", _type=URIRef + ) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.type, + "type", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + agent, + agent_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.identifier, + "identifier", + _type=URIRefOrLiteral, + ) @staticmethod def _not_empty_dict(data_dict): diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py index 9b341efc..fb08f51e 100644 --- a/ckanext/dcat/tests/profiles/base/test_base_profile.py +++ b/ckanext/dcat/tests/profiles/base/test_base_profile.py @@ -660,7 +660,7 @@ def test_publisher_foaf(self): p = RDFProfile(g) - publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) + publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0] assert publisher['uri'] == 'http://orgs.vocab.org/some-org' assert publisher['name'] == 'Publishing Organization for dataset 1' @@ -688,7 +688,7 @@ def test_publisher_ref(self): p = RDFProfile(g) - publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) + publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0] assert publisher['uri'] == 'http://orgs.vocab.org/some-org' @@ -721,6 +721,8 @@ def test_contact_details(self): contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint) + contact = contact[0] + assert contact['name'] == 'Point of Contact' # mailto gets removed for storage and is added again on output assert contact['email'] == 'contact@some.org' diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py index 3b1d99f5..383de651 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py @@ -190,7 +190,7 @@ def test_e2e_ckan_to_dcat(self): g, publisher[0][2], DCT.identifier, - URIRef(dataset_dict["publisher"][0]["identifier"]) + URIRef(dataset_dict["publisher"][0]["identifier"]), ) creator = [t for t in g.triples((dataset_ref, DCT.creator, None))] @@ -221,10 +221,9 @@ def test_e2e_ckan_to_dcat(self): g, creator[0][2], DCT.identifier, - URIRef(dataset_dict["creator"][0]["identifier"]) + URIRef(dataset_dict["creator"][0]["identifier"]), ) - temporal = [t for t in g.triples((dataset_ref, DCT.temporal, None))] assert len(temporal) == len(dataset["temporal_coverage"]) @@ -275,8 +274,8 @@ def test_e2e_ckan_to_dcat(self): # Statements for item in [ - ('access_rights', DCT.accessRights), - ('provenance', DCT.provenance), + ("access_rights", DCT.accessRights), + ("provenance", DCT.provenance), ]: statement = [s for s in g.objects(dataset_ref, item[1])][0] assert self._triple(g, statement, RDFS.label, dataset[item[0]]) @@ -388,7 +387,7 @@ def test_e2e_ckan_to_dcat(self): # Resources: statements statement = [s for s in g.objects(distribution_ref, DCT.rights)][0] - assert self._triple(g, statement, RDFS.label, resource['rights']) + assert self._triple(g, statement, RDFS.label, resource["rights"]) def test_publisher_fallback_org(self): @@ -839,6 +838,138 @@ def test_statement_literal(self): assert dataset["notes"] == "This is a dataset" assert dataset["access_rights"] == "Some statement" + def test_multiple_contacts(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix vcard: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dcat:contactPoint [ a vcard:Kind ; + vcard:fn "Test Contact 1" ; + vcard:hasEmail ; + vcard:hasUID "https://orcid.org/0000-0002-9095-9201" + ], + [ a vcard:Kind ; + vcard:fn "Test Contact 2" ; + vcard:hasEmail ; + vcard:hasUID "https://orcid.org/0000-0002-9095-9202" + ] ; + . + """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["contact"]) == 2 + assert dataset["contact"][0]["name"] == "Test Contact 1" + assert dataset["contact"][0]["email"] == "contact1@example.org" + assert ( + dataset["contact"][0]["identifier"] + == "https://orcid.org/0000-0002-9095-9201" + ) + assert dataset["contact"][1]["name"] == "Test Contact 2" + assert dataset["contact"][1]["email"] == "contact2@example.org" + assert ( + dataset["contact"][1]["identifier"] + == "https://orcid.org/0000-0002-9095-9202" + ) + + def test_multiple_publishers(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix org: . + @prefix skos: . + @prefix foaf: . + @prefix vcard: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dct:publisher [ a org:Organization ; + skos:prefLabel "Test Publisher 1" ; + vcard:hasEmail ; + dct:identifier "https://orcid.org/0000-0002-9095-9201" ; + foaf:name "Test Publisher 1" ], + [ a org:Organization ; + skos:prefLabel "Test Publisher 2" ; + vcard:hasEmail ; + dct:identifier "https://orcid.org/0000-0002-9095-9202" ; + foaf:name "Test Publisher 2" ] ; + . + """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["publisher"]) == 2 + assert dataset["publisher"][0]["name"] == "Test Publisher 1" + assert dataset["publisher"][0]["email"] == "publisher1@example.org" + assert ( + dataset["publisher"][0]["identifier"] + == "https://orcid.org/0000-0002-9095-9201" + ) + assert dataset["publisher"][1]["name"] == "Test Publisher 2" + assert dataset["publisher"][1]["email"] == "publisher2@example.org" + assert ( + dataset["publisher"][1]["identifier"] + == "https://orcid.org/0000-0002-9095-9202" + ) + + def test_multiple_creators(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix org: . + @prefix skos: . + @prefix foaf: . + @prefix vcard: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dct:creator [ a org:Organization ; + skos:prefLabel "Test Creator 1" ; + vcard:hasEmail ; + foaf:name "Test Creator 1" ], + [ a org:Organization ; + skos:prefLabel "Test Creator 2" ; + vcard:hasEmail ; + foaf:name "Test Creator 2" ] ; + . + """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["creator"]) == 2 + assert dataset["creator"][0]["name"] == "Test Creator 1" + assert dataset["creator"][0]["email"] == "creator1@example.org" + assert dataset["creator"][1]["name"] == "Test Creator 2" + assert dataset["creator"][1]["email"] == "creator2@example.org" + + @pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config(