diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index a520bcce..e10fda88 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -508,9 +508,10 @@ def _insert_or_update_temporal(self, dataset_dict, key, value): else: dataset_dict["extras"].append({"key": key, "value": value}) - def _agent_details(self, subject, predicate): + def _agents_details(self, subject, predicate): """ - Returns a dict with details about a dct:publisher or dct:creator entity, a foaf:Agent + Returns a list of dicts with details about a foaf:Agent property, e.g. + dct:publisher or dct:creator entity. Both subject and predicate must be rdflib URIRef or BNode objects @@ -528,21 +529,26 @@ def _agent_details(self, subject, predicate): an empty string if they could not be found. """ - agent_details = {} - + agents = [] for agent in self.g.objects(subject, predicate): + agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" agent_details["name"] = self._object_value(agent, FOAF.name) agent_details["email"] = self._object_value(agent, FOAF.mbox) + if not agent_details["email"]: + agent_details["email"] = self._without_mailto( + self._object_value(agent, VCARD.hasEmail) + ) agent_details["url"] = self._object_value(agent, FOAF.homepage) agent_details["type"] = self._object_value(agent, DCT.type) agent_details['identifier'] = self._object_value(agent, DCT.identifier) + agents.append(agent_details) - return agent_details + return agents def _contact_details(self, subject, predicate): """ - Returns a dict with details about a vcard expression + Returns a list of dicts with details about vcard expressions Both subject and predicate must be rdflib URIRef or BNode objects @@ -550,10 +556,10 @@ def _contact_details(self, subject, predicate): an empty string if they could not be found """ - contact = {} - + contacts = [] for agent in self.g.objects(subject, predicate): + contact = {} contact["uri"] = str(agent) if 
isinstance(agent, URIRef) else "" contact["name"] = self._get_vcard_property_value( @@ -565,8 +571,9 @@ def _contact_details(self, subject, predicate): ) contact["identifier"] = self._get_vcard_property_value(agent, VCARD.hasUID) + contacts.append(contact) - return contact + return contacts def _parse_geodata(self, spatial, datatype, cur_value): """ @@ -1277,10 +1284,13 @@ def _extract_catalog_dict(self, catalog_ref): if val: out.append({"key": key, "value": val}) + publishers = self._agents_details(catalog_ref, DCT.publisher) + if publishers: + publisher = publishers[0] out.append( { "key": "source_catalog_publisher", - "value": json.dumps(self._agent_details(catalog_ref, DCT.publisher)), + "value": json.dumps(publisher), } ) return out diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 0f5490d5..2356a2d4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -134,33 +134,43 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): dataset_dict["extras"].append({"key": key, "value": json.dumps(values)}) # Contact details - contact = self._contact_details(dataset_ref, DCAT.contactPoint) - if not contact: - # adms:contactPoint was supported on the first version of DCAT-AP - contact = self._contact_details(dataset_ref, ADMS.contactPoint) - - if contact: - for key in ("uri", "name", "email", "identifier"): - if contact.get(key): - dataset_dict["extras"].append( - {"key": "contact_{0}".format(key), "value": contact.get(key)} - ) - - # Publisher - publisher = self._agent_details(dataset_ref, DCT.publisher) - for key in ("uri", "name", "email", "url", "type", "identifier"): - if publisher.get(key): - dataset_dict["extras"].append( - {"key": "publisher_{0}".format(key), "value": publisher.get(key)} - ) + if self._schema_field("contact"): + # This is a scheming field, will be handled in a separate profile + pass + else: + contact = 
self._contact_details(dataset_ref, DCAT.contactPoint) + if not contact: + # adms:contactPoint was supported on the first version of DCAT-AP + contact = self._contact_details(dataset_ref, ADMS.contactPoint) + if contact: + contact = contact[0] + for key in ("uri", "name", "email", "identifier"): + if contact.get(key): + dataset_dict["extras"].append( + { + "key": "contact_{0}".format(key), + "value": contact.get(key) + } + ) - # Creator - creator = self._agent_details(dataset_ref, DCT.creator) - for key in ("uri", "name", "email", "url", "type", "identifier"): - if creator.get(key): - dataset_dict["extras"].append( - {"key": "creator_{0}".format(key), "value": creator.get(key)} - ) + # Publishers and creators + for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]: + agent_key, predicate = item + if self._schema_field(agent_key): + # This is a scheming field, will be handled in a separate profile + pass + else: + agents = self._agents_details(dataset_ref, predicate) + if agents: + agent = agents[0] + for key in ("uri", "name", "email", "url", "type", "identifier"): + if agent.get(key): + dataset_dict["extras"].append( + { + "key": f"{agent_key}_{key}", + "value": agent.get(key) + } + ) # Temporal start, end = self._time_interval(dataset_ref, DCT.temporal) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index f87c94ca..ca72cd21 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -87,7 +87,7 @@ def _parse_list_value(data_dict, field_name): check_name = new_fields_mapping.get(field_name, field_name) for extra in dataset_dict.get("extras", []): if extra["key"].startswith(f"{check_name}_"): - subfield = extra["key"][extra["key"].index("_") + 1:] + subfield = extra["key"][extra["key"].index("_") + 1 :] if subfield in [ f["field_name"] for f in schema_field["repeating_subfields"] ]: @@ -100,6 +100,18 @@ def _parse_list_value(data_dict, 
field_name): dataset_dict[field_name] = [new_dict] dataset_dict["extras"] = new_extras + # Contact details + contacts = self._contact_details(dataset_ref, DCAT.contactPoint) + if contacts: + dataset_dict["contact"] = contacts + + # Publishers and creators + for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]: + key, predicate = item + agents = self._agents_details(dataset_ref, predicate) + if agents: + dataset_dict[key] = agents + # Repeating subfields: resources for schema_field in self._dataset_schema["resource_fields"]: if "repeating_subfields" in schema_field: @@ -124,7 +136,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): Add triples to the graph from new repeating subfields """ contact = dataset_dict.get("contact") - if isinstance(contact, list) and len(contact) and self._not_empty_dict(contact[0]): + if ( + isinstance(contact, list) + and len(contact) + and self._not_empty_dict(contact[0]) + ): for item in contact: contact_uri = item.get("uri") if contact_uri: @@ -150,11 +166,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): contact_details, VCARD.hasUID, "identifier", - _type=URIRefOrLiteral + _type=URIRefOrLiteral, ) - self._add_agent(dataset_ref, dataset_dict, "publisher", DCT.publisher) - self._add_agent(dataset_ref, dataset_dict, "creator", DCT.creator) + self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher) + self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator) temporal = dataset_dict.get("temporal_coverage") if ( @@ -172,7 +188,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): self.g.add((dataset_ref, DCT.temporal, temporal_ref)) spatial = dataset_dict.get("spatial_coverage") - if isinstance(spatial, list) and len(spatial) and self._not_empty_dict(spatial[0]): + if ( + isinstance(spatial, list) + and len(spatial) + and self._not_empty_dict(spatial[0]) + ): for item in spatial: if item.get("uri"): spatial_ref = 
CleanedURIRef(item["uri"]) @@ -205,55 +225,59 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): except ValueError: pass - def _add_agent(self, dataset_ref, dataset_dict, agent_key, rdf_predicate): + def _add_agents( + self, dataset_ref, dataset_dict, agent_key, rdf_predicate, first_only=False + ): """ - Adds an agent (publisher or creator) to the RDF graph. + Adds one or more agents (e.g. publisher or creator) to the RDF graph. :param dataset_ref: The RDF reference of the dataset :param dataset_dict: The dataset dictionary containing agent information - :param agent_key: 'publisher' or 'creator' to specify the agent - :param rdf_predicate: The RDF predicate (DCT.publisher or DCT.creator) + :param agent_key: field name in the CKAN dict (e.g. "publisher", "creator", etc) + :param rdf_predicate: The RDF predicate (DCT.publisher, DCT.creator, etc) + :param first_only: Add the first item found only (used for 0..1 properties) """ agent = dataset_dict.get(agent_key) - if ( - isinstance(agent, list) - and len(agent) - and self._not_empty_dict(agent[0]) - ): - agent = agent[0] - agent_uri = agent.get("uri") - if agent_uri: - agent_ref = CleanedURIRef(agent_uri) - else: - agent_ref = BNode() - - self.g.add((agent_ref, RDF.type, FOAF.Agent)) - self.g.add((dataset_ref, rdf_predicate, agent_ref)) - - self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") - self._add_triple_from_dict(agent, agent_ref, FOAF.homepage, "url", _type=URIRef) - self._add_triple_from_dict( - agent, - agent_ref, - DCT.type, - "type", - _type=URIRefOrLiteral, - ) - self._add_triple_from_dict( - agent, - agent_ref, - VCARD.hasEmail, - "email", - _type=URIRef, - value_modifier=self._add_mailto, - ) - self._add_triple_from_dict( - agent, - agent_ref, - DCT.identifier, - "identifier", - _type=URIRefOrLiteral - ) + if isinstance(agent, list) and len(agent) and self._not_empty_dict(agent[0]): + agents = [agent[0]] if first_only else agent + + for agent in agents: + + agent_uri = 
agent.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, RDF.type, FOAF.Agent)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name") + self._add_triple_from_dict( + agent, agent_ref, FOAF.homepage, "url", _type=URIRef + ) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.type, + "type", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + agent, + agent_ref, + VCARD.hasEmail, + "email", + _type=URIRef, + value_modifier=self._add_mailto, + ) + self._add_triple_from_dict( + agent, + agent_ref, + DCT.identifier, + "identifier", + _type=URIRefOrLiteral, + ) @staticmethod def _not_empty_dict(data_dict): diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py index 1b47faf3..aa8daf1e 100644 --- a/ckanext/dcat/tests/profiles/base/test_base_profile.py +++ b/ckanext/dcat/tests/profiles/base/test_base_profile.py @@ -880,7 +880,7 @@ def test_publisher_foaf(self): p = RDFProfile(g) - publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) + publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0] assert publisher['uri'] == 'http://orgs.vocab.org/some-org' assert publisher['name'] == 'Publishing Organization for dataset 1' @@ -908,7 +908,7 @@ def test_publisher_ref(self): p = RDFProfile(g) - publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher) + publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0] assert publisher['uri'] == 'http://orgs.vocab.org/some-org' @@ -941,6 +941,8 @@ def test_contact_details(self): contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint) + contact = contact[0] + assert contact['name'] == 'Point of Contact' # mailto gets removed for storage and is added again on output assert contact['email'] == 'contact@some.org' diff --git 
a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py index 3837201d..20f87a2a 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py @@ -826,6 +826,137 @@ def test_statement_literal(self): assert dataset["notes"] == "This is a dataset" assert dataset["access_rights"] == "Some statement" + def test_multiple_contacts(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix vcard: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dcat:contactPoint [ a vcard:Kind ; + vcard:fn "Test Contact 1" ; + vcard:hasEmail ; + vcard:hasUID "https://orcid.org/0000-0002-9095-9201" + ], + [ a vcard:Kind ; + vcard:fn "Test Contact 2" ; + vcard:hasEmail ; + vcard:hasUID "https://orcid.org/0000-0002-9095-9202" + ] ; + . + """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["contact"]) == 2 + assert dataset["contact"][0]["name"] == "Test Contact 1" + assert dataset["contact"][0]["email"] == "contact1@example.org" + assert ( + dataset["contact"][0]["identifier"] + == "https://orcid.org/0000-0002-9095-9201" + ) + assert dataset["contact"][1]["name"] == "Test Contact 2" + assert dataset["contact"][1]["email"] == "contact2@example.org" + assert ( + dataset["contact"][1]["identifier"] + == "https://orcid.org/0000-0002-9095-9202" + ) + + def test_multiple_publishers(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix org: . + @prefix skos: . + @prefix foaf: . + @prefix vcard: . 
+ + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dct:publisher [ a org:Organization ; + skos:prefLabel "Test Publisher 1" ; + vcard:hasEmail ; + dct:identifier "https://orcid.org/0000-0002-9095-9201" ; + foaf:name "Test Publisher 1" ], + [ a org:Organization ; + skos:prefLabel "Test Publisher 2" ; + vcard:hasEmail ; + dct:identifier "https://orcid.org/0000-0002-9095-9202" ; + foaf:name "Test Publisher 2" ] ; + . + """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["publisher"]) == 2 + assert dataset["publisher"][0]["name"] == "Test Publisher 1" + assert dataset["publisher"][0]["email"] == "publisher1@example.org" + assert ( + dataset["publisher"][0]["identifier"] + == "https://orcid.org/0000-0002-9095-9201" + ) + assert dataset["publisher"][1]["name"] == "Test Publisher 2" + assert dataset["publisher"][1]["email"] == "publisher2@example.org" + assert ( + dataset["publisher"][1]["identifier"] + == "https://orcid.org/0000-0002-9095-9202" + ) + + def test_multiple_creators(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + @prefix org: . + @prefix skos: . + @prefix foaf: . + @prefix vcard: . + + + a dcat:Dataset ; + dct:title "Dataset 1" ; + dct:description "This is a dataset" ; + dct:creator [ a org:Organization ; + skos:prefLabel "Test Creator 1" ; + vcard:hasEmail ; + foaf:name "Test Creator 1" ], + [ a org:Organization ; + skos:prefLabel "Test Creator 2" ; + vcard:hasEmail ; + foaf:name "Test Creator 2" ] ; + . 
+ """ + + p = RDFParser() + + p.parse(data, _format="ttl") + datasets = [d for d in p.datasets()] + + dataset = datasets[0] + assert len(dataset["creator"]) == 2 + assert dataset["creator"][0]["name"] == "Test Creator 1" + assert dataset["creator"][0]["email"] == "creator1@example.org" + assert dataset["creator"][1]["name"] == "Test Creator 2" + assert dataset["creator"][1]["email"] == "creator2@example.org" + @pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..b29bf4fd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,63 @@ +[project] +name = "ckanext-dcat" +version = "2.0.0" +description = "Plugins for exposing and consuming DCAT metadata on CKAN" +authors = [ + {name = "Adrià Mercader", email = "amercadero@gmail.com"} +] +maintainers = [ + {name = "CKAN Tech Team and contributors", email = "tech-team@ckan.org"}, + {name = "Seitenbau Govdata"}, + {name = "Stefan Oderbolz"} +] +license = {text = "AGPL"} +classifiers = [ + "Intended Audience :: Developers", + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: GNU Affero General Public License v3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] +keywords = [ + "ckan", + "ckanext", + "DCAT", + "DCAT-AP", + "Schema.org", + "Linked data", + "RDF", + "Semantic data" + ] +dependencies = [] + +[project.urls] +Documentation = "https://docs.ckan.org/projects/ckanext-dcat" +Repository = "https://github.com/ckan/ckanext-dcat" +Issues = "https://github.com/ckan/ckanext-dcat/issues" +Changelog = "https://docs.ckan.org/projects/ckanext-dcat/en/latest/changelog/" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" +
+[project.entry-points."ckan.plugins"] +dcat_xml_harvester = "ckanext.dcat.harvesters:DCATXMLHarvester" +dcat_json_harvester = "ckanext.dcat.harvesters:DCATJSONHarvester" +dcat_rdf_harvester = "ckanext.dcat.harvesters:DCATRDFHarvester" +dcat_json_interface = "ckanext.dcat.plugins:DCATJSONInterface" +dcat = "ckanext.dcat.plugins:DCATPlugin" +structured_data = "ckanext.dcat.plugins:StructuredDataPlugin" +# Test plugins +test_rdf_harvester = "ckanext.dcat.tests.harvester.test_harvester:TestRDFHarvester" +test_rdf_null_harvester = "ckanext.dcat.tests.harvester.test_harvester:TestRDFNullHarvester" +test_rdf_exception_harvester = "ckanext.dcat.tests.harvester.test_harvester:TestRDFExceptionHarvester" + +[project.entry-points."ckan.rdf.profiles"] +euro_dcat_ap = "ckanext.dcat.profiles:EuropeanDCATAPProfile" +euro_dcat_ap_2 = "ckanext.dcat.profiles:EuropeanDCATAP2Profile" +euro_dcat_ap_3 = "ckanext.dcat.profiles:EuropeanDCATAP3Profile" +euro_dcat_ap_scheming = "ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile" +schemaorg = "ckanext.dcat.profiles:SchemaOrgProfile" diff --git a/setup.cfg b/setup.cfg index f730b50a..320a1950 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,13 @@ +[options] +packages = find: +namespace_packages = ckanext +install_requires = +include_package_data = True + +[options.entry_points] +babel.extractors = + ckan = ckan.lib.extract:extract_ckan + [extract_messages] keywords = translate isPlural add_comments = TRANSLATORS: diff --git a/setup.py b/setup.py index 9a600e06..9074f835 100644 --- a/setup.py +++ b/setup.py @@ -1,55 +1,6 @@ -from setuptools import setup, find_packages - -version = '2.0.0' +from setuptools import setup setup( - name='ckanext-dcat', - version=version, - description="Plugins for exposing and consuming DCAT metadata on CKAN", - long_description='''\ - ''', - classifiers=[], - keywords='', - author='Open Knowledge Foundation', - author_email='info@ckan.org', - url='https://github.com/okfn/ckanext-dcat', - license='AGPL', - 
packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), - namespace_packages=['ckanext'], - include_package_data=True, - zip_safe=False, - install_requires=[ - # -*- Extra requirements: -*- - ], - entry_points=''' - - [ckan.plugins] - dcat_xml_harvester=ckanext.dcat.harvesters:DCATXMLHarvester - dcat_json_harvester=ckanext.dcat.harvesters:DCATJSONHarvester - - dcat_rdf_harvester=ckanext.dcat.harvesters:DCATRDFHarvester - - dcat_json_interface=ckanext.dcat.plugins:DCATJSONInterface - - dcat=ckanext.dcat.plugins:DCATPlugin - - structured_data=ckanext.dcat.plugins:StructuredDataPlugin - - # Test plugins - test_rdf_harvester=ckanext.dcat.tests.harvester.test_harvester:TestRDFHarvester - test_rdf_null_harvester=ckanext.dcat.tests.harvester.test_harvester:TestRDFNullHarvester - test_rdf_exception_harvester=ckanext.dcat.tests.harvester.test_harvester:TestRDFExceptionHarvester - - [ckan.rdf.profiles] - euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile - euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile - euro_dcat_ap_3=ckanext.dcat.profiles:EuropeanDCATAP3Profile - euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile - schemaorg=ckanext.dcat.profiles:SchemaOrgProfile - - [babel.extractors] - ckan = ckan.lib.extract:extract_ckan - ''', message_extractors={ 'ckanext': [ ('**.py', 'python', None),