Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for multiple agents when parsing #317

Merged
merged 3 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,9 +421,10 @@ def _insert_or_update_temporal(self, dataset_dict, key, value):
else:
dataset_dict["extras"].append({"key": key, "value": value})

def _agent_details(self, subject, predicate):
def _agents_details(self, subject, predicate):
"""
Returns a dict with details about a dct:publisher or dct:creator entity, a foaf:Agent
Returns a list of dicts with details about a foaf:Agent property, e.g.
dct:publisher or dct:creator entity.

Both subject and predicate must be rdflib URIRef or BNode objects

Expand All @@ -441,32 +442,37 @@ def _agent_details(self, subject, predicate):
an empty string if they could not be found.
"""

agent_details = {}

agents = []
for agent in self.g.objects(subject, predicate):
agent_details = {}
agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else ""
agent_details["name"] = self._object_value(agent, FOAF.name)
agent_details["email"] = self._object_value(agent, FOAF.mbox)
if not agent_details["email"]:
agent_details["email"] = self._without_mailto(
self._object_value(agent, VCARD.hasEmail)
)
agent_details["url"] = self._object_value(agent, FOAF.homepage)
agent_details["type"] = self._object_value(agent, DCT.type)
agent_details['identifier'] = self._object_value(agent, DCT.identifier)
agents.append(agent_details)

return agent_details
return agents

def _contact_details(self, subject, predicate):
"""
Returns a dict with details about a vcard expression
Returns a list of dicts with details about vcard expressions

Both subject and predicate must be rdflib URIRef or BNode objects

Returns keys for uri, name and email with the values set to
an empty string if they could not be found
"""

contact = {}

contacts = []
for agent in self.g.objects(subject, predicate):

contact = {}
contact["uri"] = str(agent) if isinstance(agent, URIRef) else ""

contact["name"] = self._get_vcard_property_value(
Expand All @@ -478,8 +484,9 @@ def _contact_details(self, subject, predicate):
)

contact["identifier"] = self._get_vcard_property_value(agent, VCARD.hasUID)
contacts.append(contact)

return contact
return contacts

def _parse_geodata(self, spatial, datatype, cur_value):
"""
Expand Down Expand Up @@ -1148,10 +1155,13 @@ def _extract_catalog_dict(self, catalog_ref):
if val:
out.append({"key": key, "value": val})

publishers = self._agents_details(catalog_ref, DCT.publisher)
if publishers:
publisher = publishers[0]
out.append(
{
"key": "source_catalog_publisher",
"value": json.dumps(self._agent_details(catalog_ref, DCT.publisher)),
"value": json.dumps(publisher),
}
)
return out
Expand Down
62 changes: 36 additions & 26 deletions ckanext/dcat/profiles/euro_dcat_ap_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,33 +108,43 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
dataset_dict["extras"].append({"key": key, "value": json.dumps(values)})

# Contact details
contact = self._contact_details(dataset_ref, DCAT.contactPoint)
if not contact:
# adms:contactPoint was supported on the first version of DCAT-AP
contact = self._contact_details(dataset_ref, ADMS.contactPoint)

if contact:
for key in ("uri", "name", "email", "identifier"):
if contact.get(key):
dataset_dict["extras"].append(
{"key": "contact_{0}".format(key), "value": contact.get(key)}
)

# Publisher
publisher = self._agent_details(dataset_ref, DCT.publisher)
for key in ("uri", "name", "email", "url", "type", "identifier"):
if publisher.get(key):
dataset_dict["extras"].append(
{"key": "publisher_{0}".format(key), "value": publisher.get(key)}
)
if self._schema_field("contact"):
# This is a scheming field, will be handled in a separate profile
pass
else:
contact = self._contact_details(dataset_ref, DCAT.contactPoint)
if not contact:
# adms:contactPoint was supported on the first version of DCAT-AP
contact = self._contact_details(dataset_ref, ADMS.contactPoint)
if contact:
contact = contact[0]
for key in ("uri", "name", "email", "identifier"):
if contact.get(key):
dataset_dict["extras"].append(
{
"key": "contact_{0}".format(key),
"value": contact.get(key)
}
)

# Creator
creator = self._agent_details(dataset_ref, DCT.creator)
for key in ("uri", "name", "email", "url", "type", "identifier"):
if creator.get(key):
dataset_dict["extras"].append(
{"key": "creator_{0}".format(key), "value": creator.get(key)}
)
# Publishers and creators
for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]:
agent_key, predicate = item
if self._schema_field(agent_key):
# This is a scheming field, will be handled in a separate profile
pass
else:
agents = self._agents_details(dataset_ref, predicate)
if agents:
agent = agents[0]
for key in ("uri", "name", "email", "url", "type", "identifier"):
if agent.get(key):
dataset_dict["extras"].append(
{
"key": f"{agent_key}_{key}",
"value": agent.get(key)
}
)

# Temporal
start, end = self._time_interval(dataset_ref, DCT.temporal)
Expand Down
122 changes: 73 additions & 49 deletions ckanext/dcat/profiles/euro_dcat_ap_scheming.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _parse_list_value(data_dict, field_name):
check_name = new_fields_mapping.get(field_name, field_name)
for extra in dataset_dict.get("extras", []):
if extra["key"].startswith(f"{check_name}_"):
subfield = extra["key"][extra["key"].index("_") + 1:]
subfield = extra["key"][extra["key"].index("_") + 1 :]
if subfield in [
f["field_name"] for f in schema_field["repeating_subfields"]
]:
Expand All @@ -100,6 +100,18 @@ def _parse_list_value(data_dict, field_name):
dataset_dict[field_name] = [new_dict]
dataset_dict["extras"] = new_extras

# Contact details
contacts = self._contact_details(dataset_ref, DCAT.contactPoint)
if contacts:
dataset_dict["contact"] = contacts

# Publishers and creators
for item in [("publisher", DCT.publisher), ("creator", DCT.creator)]:
key, predicate = item
agents = self._agents_details(dataset_ref, predicate)
if agents:
dataset_dict[key] = agents

# Repeating subfields: resources
for schema_field in self._dataset_schema["resource_fields"]:
if "repeating_subfields" in schema_field:
Expand All @@ -124,7 +136,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
Add triples to the graph from new repeating subfields
"""
contact = dataset_dict.get("contact")
if isinstance(contact, list) and len(contact) and self._not_empty_dict(contact[0]):
if (
isinstance(contact, list)
and len(contact)
and self._not_empty_dict(contact[0])
):
for item in contact:
contact_uri = item.get("uri")
if contact_uri:
Expand All @@ -150,11 +166,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
contact_details,
VCARD.hasUID,
"identifier",
_type=URIRefOrLiteral
_type=URIRefOrLiteral,
)

self._add_agent(dataset_ref, dataset_dict, "publisher", DCT.publisher)
self._add_agent(dataset_ref, dataset_dict, "creator", DCT.creator)
self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher)
self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator)

temporal = dataset_dict.get("temporal_coverage")
if (
Expand All @@ -172,7 +188,11 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
self.g.add((dataset_ref, DCT.temporal, temporal_ref))

spatial = dataset_dict.get("spatial_coverage")
if isinstance(spatial, list) and len(spatial) and self._not_empty_dict(spatial[0]):
if (
isinstance(spatial, list)
and len(spatial)
and self._not_empty_dict(spatial[0])
):
for item in spatial:
if item.get("uri"):
spatial_ref = CleanedURIRef(item["uri"])
Expand Down Expand Up @@ -205,55 +225,59 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
except ValueError:
pass

def _add_agent(self, dataset_ref, dataset_dict, agent_key, rdf_predicate):
def _add_agents(
self, dataset_ref, dataset_dict, agent_key, rdf_predicate, first_only=False
):
"""
Adds an agent (publisher or creator) to the RDF graph.
Adds one or more agents (e.g. publisher or creator) to the RDF graph.

:param dataset_ref: The RDF reference of the dataset
:param dataset_dict: The dataset dictionary containing agent information
:param agent_key: 'publisher' or 'creator' to specify the agent
:param rdf_predicate: The RDF predicate (DCT.publisher or DCT.creator)
:param agent_key: field name in the CKAN dict (e.g. "publisher", "creator", etc.)
:param rdf_predicate: The RDF predicate (DCT.publisher, DCT.creator, etc)
:param first_only: Add only the first item found (used for 0..1 properties)
"""
agent = dataset_dict.get(agent_key)
if (
isinstance(agent, list)
and len(agent)
and self._not_empty_dict(agent[0])
):
agent = agent[0]
agent_uri = agent.get("uri")
if agent_uri:
agent_ref = CleanedURIRef(agent_uri)
else:
agent_ref = BNode()

self.g.add((agent_ref, RDF.type, FOAF.Agent))
self.g.add((dataset_ref, rdf_predicate, agent_ref))

self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name")
self._add_triple_from_dict(agent, agent_ref, FOAF.homepage, "url", _type=URIRef)
self._add_triple_from_dict(
agent,
agent_ref,
DCT.type,
"type",
_type=URIRefOrLiteral,
)
self._add_triple_from_dict(
agent,
agent_ref,
VCARD.hasEmail,
"email",
_type=URIRef,
value_modifier=self._add_mailto,
)
self._add_triple_from_dict(
agent,
agent_ref,
DCT.identifier,
"identifier",
_type=URIRefOrLiteral
)
if isinstance(agent, list) and len(agent) and self._not_empty_dict(agent[0]):
agents = [agent[0]] if first_only else agent

for agent in agents:

agent_uri = agent.get("uri")
if agent_uri:
agent_ref = CleanedURIRef(agent_uri)
else:
agent_ref = BNode()

self.g.add((agent_ref, RDF.type, FOAF.Agent))
self.g.add((dataset_ref, rdf_predicate, agent_ref))

self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name")
self._add_triple_from_dict(
agent, agent_ref, FOAF.homepage, "url", _type=URIRef
)
self._add_triple_from_dict(
agent,
agent_ref,
DCT.type,
"type",
_type=URIRefOrLiteral,
)
self._add_triple_from_dict(
agent,
agent_ref,
VCARD.hasEmail,
"email",
_type=URIRef,
value_modifier=self._add_mailto,
)
self._add_triple_from_dict(
agent,
agent_ref,
DCT.identifier,
"identifier",
_type=URIRefOrLiteral,
)

@staticmethod
def _not_empty_dict(data_dict):
Expand Down
6 changes: 4 additions & 2 deletions ckanext/dcat/tests/profiles/base/test_base_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ def test_publisher_foaf(self):

p = RDFProfile(g)

publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher)
publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0]

assert publisher['uri'] == 'http://orgs.vocab.org/some-org'
assert publisher['name'] == 'Publishing Organization for dataset 1'
Expand Down Expand Up @@ -688,7 +688,7 @@ def test_publisher_ref(self):

p = RDFProfile(g)

publisher = p._agent_details(URIRef('http://example.org'), DCT.publisher)
publisher = p._agents_details(URIRef('http://example.org'), DCT.publisher)[0]

assert publisher['uri'] == 'http://orgs.vocab.org/some-org'

Expand Down Expand Up @@ -721,6 +721,8 @@ def test_contact_details(self):

contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint)

contact = contact[0]

assert contact['name'] == 'Point of Contact'
# mailto gets removed for storage and is added again on output
assert contact['email'] == '[email protected]'
Expand Down
Loading