From e95f490e2a91e038313e60eb8f7e644a7ca13a87 Mon Sep 17 00:00:00 2001
From: Pieter Gijsbers
Date: Wed, 26 Jun 2024 10:48:31 +0200
Subject: [PATCH] Separate MLDCAT-AP endpoints (#160)
* Give MLDCAT-AP endpoint a separate tag
* Extract out Feature
* Finalize the first iteration of the endpoint separation
There are now endpoints for Dataset+Distribution, Feature, Quality
and DataService.
* Serialize status and visibility as URLs
---
src/routers/mldcat_ap/dataset.py | 190 +++++++++++++++++++++++++++---
src/routers/openml/qualities.py | 2 +-
src/schemas/datasets/mldcat_ap.py | 97 +++------------
3 files changed, 196 insertions(+), 93 deletions(-)
diff --git a/src/routers/mldcat_ap/dataset.py b/src/routers/mldcat_ap/dataset.py
index 771edb8..ea43c74 100644
--- a/src/routers/mldcat_ap/dataset.py
+++ b/src/routers/mldcat_ap/dataset.py
@@ -1,27 +1,189 @@
+"""Router for MLDCAT-AP endpoints: https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/#examples
+
+Proof of concept and knowingly inefficient: the feature and quality endpoints load every
+feature or quality of a distribution; dedicated queries could fetch just the one requested.
+"""
+
from typing import Annotated
-from fastapi import APIRouter, Depends
-from schemas.datasets.mldcat_ap import JsonLDGraph, convert_to_mldcat_ap
+import config
+from database.users import User
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.datasets.mldcat_ap import (
+ DataService,
+ Dataset,
+ Distribution,
+ Feature,
+ JsonLDGraph,
+ JsonLDObjectReference,
+ JsonLDQualifiedLiteral,
+ MD5Checksum,
+ Quality,
+)
from sqlalchemy import Connection
-from routers.dependencies import expdb_connection, userdb_connection
-from routers.openml.datasets import get_dataset
+from routers.dependencies import expdb_connection, fetch_user, userdb_connection
+from routers.openml.datasets import get_dataset, get_dataset_features
+from routers.openml.qualities import get_qualities
-router = APIRouter(prefix="/mldcat_ap/datasets", tags=["datasets"])
+router = APIRouter(prefix="/mldcat_ap", tags=["MLDCAT-AP"])
+_configuration = config.load_configuration()  # NOTE: configuration is read once, at import time.
+_server_url = (  # URL prefix used below to build the `id_` URLs of the JSON-LD objects.
+ f"{_configuration['arff_base_url']}{_configuration['fastapi']['root_path']}{router.prefix}"
+)
@router.get(
- path="/{dataset_id}",
- description="Get meta-data for dataset with ID `dataset_id`.",
+ path="/distribution/{distribution_id}",
+ description="Get meta-data for distribution with ID `distribution_id`.",
)
-def get_mldcat_ap_dataset(
- dataset_id: int,
+def get_mldcat_ap_distribution(  # Returns a graph of Dataset, Distribution, DataService, checksum.
+ distribution_id: int,
+ user: Annotated[User | None, Depends(fetch_user)] = None,
user_db: Annotated[Connection, Depends(userdb_connection)] = None,
- expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
+ expdb: Annotated[Connection, Depends(expdb_connection)] = None,
) -> JsonLDGraph:
- openml_dataset = get_dataset(
- dataset_id=dataset_id,
+ oml_dataset = get_dataset(
+ dataset_id=distribution_id,
+ user=user,
user_db=user_db,
- expdb_db=expdb_db,
+ expdb_db=expdb,
+ )
+ openml_features = get_dataset_features(distribution_id, user, expdb)
+ features = [  # Built only so the Distribution can reference them by `id_`.
+ Feature(
+ id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
+ name=feature.name,
+ feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
+ )
+ for feature in openml_features
+ ]
+ oml_qualities = get_qualities(distribution_id, user, expdb)
+ qualities = [  # Built only so the Distribution can reference them by `id_`.
+ Quality(
+ id_=f"{_server_url}/quality/{quality.name}/{distribution_id}",
+ quality_type=f"{_server_url}/quality/{quality.name}",
+ value=str(quality.value),
+ )
+ for quality in oml_qualities
+ ]
+ checksum = MD5Checksum(
+ id_=f"{_server_url}/checksum/{distribution_id}",
+ value=oml_dataset.md5_checksum,
+ )
+ arff_service = DataService(
+ id_=f"{_server_url}/dataservice/1",
+ endpoint_url=_server_url,
+ title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
+ )
+ distribution = Distribution(
+ id_=f"{_server_url}/distribution/{distribution_id}",
+ access_url=[f"https://www.openml.org/d/{distribution_id}"],
+ has_feature=[JsonLDObjectReference[Feature].to(feature) for feature in features],
+ has_quality=[JsonLDObjectReference[Quality].to(quality) for quality in qualities],
+ default_target_attribute=next(iter(oml_dataset.default_target_attribute), None),
+ download_url=[oml_dataset.url],
+ format_=oml_dataset.format_,
+ checksum=JsonLDObjectReference[MD5Checksum].to(checksum),
+ access_service=[JsonLDObjectReference[DataService].to(arff_service)],
+ )
+ mldcat_dataset = Dataset(
+ id_=str(distribution_id),  # NOTE(review): bare number, unlike the URL ids above - confirm.
+ type_="Dataset",
+ collection_date=str(oml_dataset.upload_date),
+ description=[oml_dataset.description],
+ title=[oml_dataset.name],
+ distribution=[JsonLDObjectReference[Distribution].to(distribution)],
+ status=oml_dataset.status,
+ version_info=str(oml_dataset.version),
+ version_label=oml_dataset.version_label,
+ visibility=oml_dataset.visibility,
+ keyword=oml_dataset.tags,
+ issued=JsonLDQualifiedLiteral(
+ value=str(oml_dataset.upload_date),
+ type_="http://www.w3.org/2001/XMLSchema#dateTime",
+ ),
+ )
+ return JsonLDGraph(
+ context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+ graph=[  # Features and qualities are not embedded; the Distribution only references them.
+ mldcat_dataset,
+ distribution,
+ arff_service,
+ checksum,
+ ],
+ )
+
+
+@router.get(
+ path="/dataservice/{service_id}",
+ description="Get meta-data for a specific data service.",
+)
+def get_dataservice(service_id: int) -> JsonLDGraph:
+ if service_id != 1:
+ raise HTTPException(status_code=404, detail="Service not found.")
+ return JsonLDGraph(
+ context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+ graph=[
+ DataService(
+ id_=f"{_server_url}/dataservice/{service_id}",
+ endpoint_url=_server_url,
+ title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
+ ),
+ ],
+ )
+
+
+@router.get(
+ path="/quality/{quality_name}/{distribution_id}",
+ description="Get meta-data for a specific quality and distribution.",
+)
+def get_distribution_quality(
+ quality_name: str,
+ distribution_id: int,
+ user: Annotated[User | None, Depends(fetch_user)] = None,
+ expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> JsonLDGraph:
+ qualities = get_qualities(distribution_id, user, expdb)
+ quality = next(q for q in qualities if q.name == quality_name)
+ example_quality = Quality(
+ id_=f"{_server_url}/quality/{quality_name}/{distribution_id}",
+ quality_type=f"{_server_url}/quality/{quality_name}",
+ value=str(quality.value),
+ )
+
+ return JsonLDGraph(
+ context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+ graph=[
+ example_quality,
+ ],
+ )
+
+
+@router.get(
+ path="/feature/{distribution_id}/{feature_no}",
+ description="Get meta-data for the n-th feature of a distribution.",
+)
+def get_distribution_feature(
+ distribution_id: int,
+ feature_no: int,
+ user: Annotated[Connection, Depends(fetch_user)] = None,
+ expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> JsonLDGraph:
+ features = get_dataset_features(
+ dataset_id=distribution_id,
+ user=user,
+ expdb=expdb,
+ )
+ feature = features[feature_no]
+ mldcat_feature = Feature(
+ id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
+ name=feature.name,
+ feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
+ )
+ return JsonLDGraph(
+ context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+ graph=[
+ mldcat_feature,
+ ],
)
- return convert_to_mldcat_ap(openml_dataset)
diff --git a/src/routers/openml/qualities.py b/src/routers/openml/qualities.py
index 4231d5a..c498a54 100644
--- a/src/routers/openml/qualities.py
+++ b/src/routers/openml/qualities.py
@@ -29,7 +29,7 @@ def list_qualities(
@router.get("/qualities/{dataset_id}")
def get_qualities(
dataset_id: int,
- user: Annotated[User, Depends(fetch_user)],
+ user: Annotated[User | None, Depends(fetch_user)],
expdb: Annotated[Connection, Depends(expdb_connection)],
) -> list[Quality]:
dataset = get_dataset(dataset_id, expdb)
diff --git a/src/schemas/datasets/mldcat_ap.py b/src/schemas/datasets/mldcat_ap.py
index 0d9836c..cfbe1b7 100644
--- a/src/schemas/datasets/mldcat_ap.py
+++ b/src/schemas/datasets/mldcat_ap.py
@@ -12,9 +12,9 @@
from enum import StrEnum
from typing import Generic, Literal, TypeVar
-from pydantic import BaseModel, Field, HttpUrl
+from pydantic import BaseModel, Field, HttpUrl, field_serializer, model_serializer
-from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Visibility
+from schemas.datasets.openml import DatasetStatus, Visibility
class JsonLDQualifiedLiteral(BaseModel):
@@ -54,6 +54,10 @@ def to(cls, json_ld_object: T) -> JsonLDObjectReference[T]:
"""Create a reference to `json_ld_object`"""
return cls(id_=json_ld_object.id_)
+ @model_serializer
+ def ser_model(self) -> str:  # The whole reference serializes to just its `id_` string.
+ return self.id_
+
class AccessRights(StrEnum):
"""Recommend values for 'access rights' within DCAT-AP context"""
@@ -101,18 +105,11 @@ class Feature(JsonLDObject):
type_: Literal["Feature"] = Field(default="Feature", serialization_alias="@type")
name: str = Field(serialization_alias="Feature.name")
feature_type: str = Field(serialization_alias="Feature.type")
- description: JsonLiteral | None = Field(default=None, serialization_alias="Feature.description")
-
-
-class QualityType(JsonLDObject):
- type_: Literal["QualityType"] = Field(default="QualityType", serialization_alias="@type")
- name: str = Field(serialization_alias="QualityType.name")
- quality_id: str = Field(serialization_alias="QualityType.id")
class Quality(JsonLDObject):
type_: Literal["Quality"] = Field(default="Quality", serialization_alias="@type")
- quality_type: QualityType = Field(serialization_alias="Quality.type")
+ quality_type: str = Field(serialization_alias="Quality.type")
value: JsonLiteral = Field(serialization_alias="Quality.value")
@@ -218,7 +215,10 @@ class Dataset(JsonLDObject):
default_factory=list,
serialization_alias="Dataset.hasVersion",
)
- identifier: list[JsonLiteral] = Field(default_factory=list)
+ identifier: list[JsonLiteral] = Field(
+ default_factory=list,
+ serialization_alias="Dataset.identifier",
+ )
is_referenced_by: list[JsonLiteral] = Field(
default_factory=list,
serialization_alias="Dataset.isReferencedBy",
@@ -248,6 +248,14 @@ class Dataset(JsonLDObject):
)
visibility: Visibility | None = Field(default=None, serialization_alias="Dataset.visibility")
+ @field_serializer("status")
+ def serialize_status(self, v: DatasetStatus) -> str:
+ return f"https://openml.org/mldcatap/mldcat-ap/dataset-status#{v}"
+
+ @field_serializer("visibility")
+ def serialize_visibility(self, v: Visibility) -> str:
+ return f"https://openml.org/mldcatap/mldcat-ap/dataset-visibility#{v}"
+
class DataService(JsonLDObject):
type_: Literal["DataService"] = Field(default="DataService", serialization_alias="@type")
@@ -276,70 +284,3 @@ class JsonLDGraph(BaseModel):
)
model_config = {"populate_by_name": True, "extra": "forbid"}
-
-
-def convert_to_mldcat_ap(dataset: DatasetMetadata) -> JsonLDGraph:
- arff_service = DataService(
- id_="openml-arff-service",
- title=["OpenML ARFF server"],
- endpoint_url="https://www.openml.org/data/download",
- )
- example_feature = Feature(
- id_="example-petal-width",
- name="example_petal_width",
- feature_type="https://schema.org/Number",
- description="Feature information not loaded, this is an example.",
- )
-
- example_quality = Quality(
- id_="example-quality",
- quality_type=QualityType(
- id_="quality-type-example",
- name="number_of_features",
- quality_id="link_to_definition",
- ),
- value="150",
- )
- checksum = MD5Checksum(id_="checksum-id", value=dataset.md5_checksum)
- # contributor and creator N/A
- distribution = Distribution(
- id_="distribution-id",
- access_url=[f"https://www.openml.org/d/{dataset.id_}"],
- has_feature=[JsonLDObjectReference[Feature].to(example_feature)],
- has_quality=[JsonLDObjectReference[Quality].to(example_quality)],
- default_target_attribute=next(iter(dataset.default_target_attribute), None),
- download_url=[dataset.url],
- format_=dataset.format_,
- checksum=JsonLDObjectReference[MD5Checksum].to(checksum),
- access_service=[JsonLDObjectReference[DataService].to(arff_service)],
- )
-
- mldcat_dataset = Dataset(
- id_=str(dataset.id_),
- type_="Dataset",
- collection_date=str(dataset.upload_date),
- description=[dataset.description],
- title=[dataset.name],
- distribution=[JsonLDObjectReference[Distribution].to(distribution)],
- status=dataset.status,
- version_info=str(dataset.version),
- version_label=dataset.version_label,
- visibility=dataset.visibility,
- keyword=dataset.tags,
- issued=JsonLDQualifiedLiteral(
- value=str(dataset.upload_date),
- type_="http://www.w3.org/2001/XMLSchema#dateTime",
- ),
- )
-
- return JsonLDGraph(
- context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context/mldcat-ap.jsonld",
- graph=[
- arff_service,
- distribution,
- mldcat_dataset,
- example_feature,
- example_quality,
- checksum,
- ],
- )