Skip to content

Commit

Permalink
Separate mldcat ap endpoints (#160)
Browse files Browse the repository at this point in the history
* Give MLDCAT-AP endpoint a separate tag

* Extract out Feature

* Finalize the first iteration of separating the endpoints

There are now endpoints for Dataset+Distribution, Feature, Quality
and DataService.

* Serialize status and visibility as URLs
  • Loading branch information
PGijsbers authored Jun 26, 2024
1 parent d17c9cf commit e95f490
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 93 deletions.
190 changes: 176 additions & 14 deletions src/routers/mldcat_ap/dataset.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,189 @@
"""Router for MLDCAT-AP endpoints: https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/#examples
Incredibly inefficient, but it's just a proof of concept.
Specific queries could be written to fetch e.g., a single feature or quality.
"""

from typing import Annotated

from fastapi import APIRouter, Depends
from schemas.datasets.mldcat_ap import JsonLDGraph, convert_to_mldcat_ap
import config
from database.users import User
from fastapi import APIRouter, Depends, HTTPException
from schemas.datasets.mldcat_ap import (
DataService,
Dataset,
Distribution,
Feature,
JsonLDGraph,
JsonLDObjectReference,
JsonLDQualifiedLiteral,
MD5Checksum,
Quality,
)
from sqlalchemy import Connection

from routers.dependencies import expdb_connection, userdb_connection
from routers.openml.datasets import get_dataset
from routers.dependencies import expdb_connection, fetch_user, userdb_connection
from routers.openml.datasets import get_dataset, get_dataset_features
from routers.openml.qualities import get_qualities

# Router that groups every MLDCAT-AP endpoint under the "/mldcat_ap" prefix.
router = APIRouter(prefix="/mldcat_ap", tags=["MLDCAT-AP"])
_configuration = config.load_configuration()
# Base URL used to build the identifiers of the JSON-LD objects served by this router:
# the configured `arff_base_url`, followed by the FastAPI root path and the router prefix.
_server_url = (
    f"{_configuration['arff_base_url']}{_configuration['fastapi']['root_path']}{router.prefix}"
)


@router.get(
    path="/distribution/{distribution_id}",
    description="Get meta-data for distribution with ID `distribution_id`.",
)
def get_mldcat_ap_distribution(
    distribution_id: int,
    user: Annotated[User | None, Depends(fetch_user)] = None,
    user_db: Annotated[Connection, Depends(userdb_connection)] = None,
    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
) -> JsonLDGraph:
    """Return the MLDCAT-AP JSON-LD graph for the dataset with ID `distribution_id`.

    The dataset, its features and its qualities are fetched through the regular
    OpenML endpoint functions and converted to MLDCAT-AP objects.

    The returned graph contains the Dataset, Distribution, DataService and
    MD5Checksum nodes. Features and qualities are only referenced from the
    distribution by their identifiers; their nodes are not part of the graph.
    """
    oml_dataset = get_dataset(
        dataset_id=distribution_id,
        user=user,
        user_db=user_db,
        expdb_db=expdb,
    )
    openml_features = get_dataset_features(distribution_id, user, expdb)
    features = [
        Feature(
            id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
            name=feature.name,
            feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
        )
        for feature in openml_features
    ]
    oml_qualities = get_qualities(distribution_id, user, expdb)
    qualities = [
        Quality(
            id_=f"{_server_url}/quality/{quality.name}/{distribution_id}",
            quality_type=f"{_server_url}/quality/{quality.name}",
            value=str(quality.value),
        )
        for quality in oml_qualities
    ]
    checksum = MD5Checksum(
        id_=f"{_server_url}/checksum/{distribution_id}",
        value=oml_dataset.md5_checksum,
    )
    # There is a single data service (ID 1): this REST API itself.
    arff_service = DataService(
        id_=f"{_server_url}/dataservice/1",
        endpoint_url=_server_url,
        title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
    )
    distribution = Distribution(
        id_=f"{_server_url}/distribution/{distribution_id}",
        access_url=[f"https://www.openml.org/d/{distribution_id}"],
        has_feature=[JsonLDObjectReference[Feature].to(feature) for feature in features],
        has_quality=[JsonLDObjectReference[Quality].to(quality) for quality in qualities],
        # Only the first default target attribute is used; None if there is none.
        default_target_attribute=next(iter(oml_dataset.default_target_attribute), None),
        download_url=[oml_dataset.url],
        format_=oml_dataset.format_,
        checksum=JsonLDObjectReference[MD5Checksum].to(checksum),
        access_service=[JsonLDObjectReference[DataService].to(arff_service)],
    )
    mldcat_dataset = Dataset(
        id_=str(distribution_id),
        type_="Dataset",
        collection_date=str(oml_dataset.upload_date),
        description=[oml_dataset.description],
        title=[oml_dataset.name],
        distribution=[JsonLDObjectReference[Distribution].to(distribution)],
        status=oml_dataset.status,
        version_info=str(oml_dataset.version),
        version_label=oml_dataset.version_label,
        visibility=oml_dataset.visibility,
        keyword=oml_dataset.tags,
        issued=JsonLDQualifiedLiteral(
            value=str(oml_dataset.upload_date),
            type_="http://www.w3.org/2001/XMLSchema#dateTime",
        ),
    )
    return JsonLDGraph(
        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
        graph=[
            mldcat_dataset,
            distribution,
            arff_service,
            checksum,
        ],
    )


@router.get(
    path="/dataservice/{service_id}",
    description="Get meta-data for a specific data service.",
)
def get_dataservice(service_id: int) -> JsonLDGraph:
    """Return the JSON-LD graph that describes the data service `service_id`.

    Only the service with ID 1 exists; any other ID results in an
    `HTTPException` with status code 404.
    """
    if service_id == 1:
        service = DataService(
            id_=f"{_server_url}/dataservice/{service_id}",
            endpoint_url=_server_url,
            title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
        )
        return JsonLDGraph(
            context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
            graph=[service],
        )
    raise HTTPException(status_code=404, detail="Service not found.")


@router.get(
    path="/quality/{quality_name}/{distribution_id}",
    description="Get meta-data for a specific quality and distribution.",
)
def get_distribution_quality(
    quality_name: str,
    distribution_id: int,
    user: Annotated[User | None, Depends(fetch_user)] = None,
    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
) -> JsonLDGraph:
    """Return the JSON-LD graph for quality `quality_name` of distribution `distribution_id`.

    All qualities of the distribution are fetched and the one whose name equals
    `quality_name` is selected.

    Raises:
        HTTPException: with status code 404 if the distribution has no quality
            named `quality_name`.
    """
    qualities = get_qualities(distribution_id, user, expdb)
    # Use a default so that an unknown name becomes a 404 rather than an
    # unhandled StopIteration.
    quality = next((q for q in qualities if q.name == quality_name), None)
    if quality is None:
        raise HTTPException(status_code=404, detail="Quality not found.")
    example_quality = Quality(
        id_=f"{_server_url}/quality/{quality_name}/{distribution_id}",
        quality_type=f"{_server_url}/quality/{quality_name}",
        value=str(quality.value),
    )

    return JsonLDGraph(
        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
        graph=[
            example_quality,
        ],
    )


@router.get(
    path="/feature/{distribution_id}/{feature_no}",
    description="Get meta-data for the n-th feature of a distribution.",
)
def get_distribution_feature(
    distribution_id: int,
    feature_no: int,
    user: Annotated[User | None, Depends(fetch_user)] = None,
    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
) -> JsonLDGraph:
    """Return the JSON-LD graph for feature number `feature_no` of a distribution.

    `feature_no` is the zero-based position in the list returned by
    `get_dataset_features`.
    NOTE(review): the identifier is built from `feature.index`; this presumably
    equals the list position — confirm against `get_dataset_features`.

    Raises:
        HTTPException: with status code 404 if `feature_no` is negative or not
            smaller than the number of features.
    """
    features = get_dataset_features(
        dataset_id=distribution_id,
        user=user,
        expdb=expdb,
    )
    # Reject out-of-range positions explicitly: an IndexError would surface as a
    # 500, and a negative position would silently select a feature from the end.
    if not 0 <= feature_no < len(features):
        raise HTTPException(status_code=404, detail="Feature not found.")
    feature = features[feature_no]
    mldcat_feature = Feature(
        id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
        name=feature.name,
        feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
    )
    return JsonLDGraph(
        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
        graph=[
            mldcat_feature,
        ],
    )
2 changes: 1 addition & 1 deletion src/routers/openml/qualities.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def list_qualities(
@router.get("/qualities/{dataset_id}")
def get_qualities(
dataset_id: int,
user: Annotated[User, Depends(fetch_user)],
user: Annotated[User | None, Depends(fetch_user)],
expdb: Annotated[Connection, Depends(expdb_connection)],
) -> list[Quality]:
dataset = get_dataset(dataset_id, expdb)
Expand Down
97 changes: 19 additions & 78 deletions src/schemas/datasets/mldcat_ap.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
from enum import StrEnum
from typing import Generic, Literal, TypeVar

from pydantic import BaseModel, Field, HttpUrl
from pydantic import BaseModel, Field, HttpUrl, field_serializer, model_serializer

from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Visibility
from schemas.datasets.openml import DatasetStatus, Visibility


class JsonLDQualifiedLiteral(BaseModel):
Expand Down Expand Up @@ -54,6 +54,10 @@ def to(cls, json_ld_object: T) -> JsonLDObjectReference[T]:
"""Create a reference to `json_ld_object`"""
return cls(id_=json_ld_object.id_)

@model_serializer
def ser_model(self) -> str:
    """Serialize this model as the plain string stored in its `id_` attribute."""
    target_id = self.id_
    return target_id


class AccessRights(StrEnum):
"""Recommend values for 'access rights' within DCAT-AP context"""
Expand Down Expand Up @@ -101,18 +105,11 @@ class Feature(JsonLDObject):
type_: Literal["Feature"] = Field(default="Feature", serialization_alias="@type")
name: str = Field(serialization_alias="Feature.name")
feature_type: str = Field(serialization_alias="Feature.type")
description: JsonLiteral | None = Field(default=None, serialization_alias="Feature.description")


class QualityType(JsonLDObject):
    """JSON-LD object of type "QualityType", with a name and a quality id."""

    # Fixed type discriminator, serialized under the JSON-LD "@type" key.
    type_: Literal["QualityType"] = Field(default="QualityType", serialization_alias="@type")
    # Serialized as "QualityType.name".
    name: str = Field(serialization_alias="QualityType.name")
    # Serialized as "QualityType.id".
    quality_id: str = Field(serialization_alias="QualityType.id")


class Quality(JsonLDObject):
    """JSON-LD object of type "Quality": a quality type together with its value."""

    # Fixed type discriminator, serialized under the JSON-LD "@type" key.
    type_: Literal["Quality"] = Field(default="Quality", serialization_alias="@type")
    # String identifying the kind of quality; serialized as "Quality.type".
    quality_type: str = Field(serialization_alias="Quality.type")
    # The quality's value; serialized as "Quality.value".
    value: JsonLiteral = Field(serialization_alias="Quality.value")


Expand Down Expand Up @@ -218,7 +215,10 @@ class Dataset(JsonLDObject):
default_factory=list,
serialization_alias="Dataset.hasVersion",
)
identifier: list[JsonLiteral] = Field(default_factory=list)
identifier: list[JsonLiteral] = Field(
default_factory=list,
serialization_alias="Dataset.identifier",
)
is_referenced_by: list[JsonLiteral] = Field(
default_factory=list,
serialization_alias="Dataset.isReferencedBy",
Expand Down Expand Up @@ -248,6 +248,14 @@ class Dataset(JsonLDObject):
)
visibility: Visibility | None = Field(default=None, serialization_alias="Dataset.visibility")

@field_serializer("status")
def serialize_status(self, v: DatasetStatus | None) -> str | None:
    """Serialize the dataset status as a URL in the OpenML MLDCAT-AP namespace.

    Returns None when no status is set, so that a missing value is not turned
    into a URL ending in "#None".
    NOTE(review): the declaration of the `status` field is not visible here; the
    None guard assumes it may be optional, like `visibility` — confirm.
    """
    if v is None:
        return None
    return f"https://openml.org/mldcatap/mldcat-ap/dataset-status#{v}"

@field_serializer("visibility")
def serialize_visibility(self, v: Visibility | None) -> str | None:
    """Serialize the dataset visibility as a URL in the OpenML MLDCAT-AP namespace.

    The `visibility` field is optional and defaults to None. None is passed
    through unchanged so that an unset visibility is not serialized as a URL
    ending in "#None".
    """
    if v is None:
        return None
    return f"https://openml.org/mldcatap/mldcat-ap/dataset-visibility#{v}"


class DataService(JsonLDObject):
type_: Literal["DataService"] = Field(default="DataService", serialization_alias="@type")
Expand Down Expand Up @@ -276,70 +284,3 @@ class JsonLDGraph(BaseModel):
)

model_config = {"populate_by_name": True, "extra": "forbid"}


def convert_to_mldcat_ap(dataset: DatasetMetadata) -> JsonLDGraph:
    """Build an MLDCAT-AP JSON-LD graph from OpenML dataset metadata.

    The feature and quality nodes are hard-coded examples, not data of `dataset`.
    The graph holds, in this order: the data service, the distribution, the
    dataset, the example feature, the example quality and the checksum.

    NOTE(review): the effective annotation of `Quality.quality_type` in this file
    is `str`; confirm that passing a `QualityType` instance here still validates.
    """
    service = DataService(
        id_="openml-arff-service",
        title=["OpenML ARFF server"],
        endpoint_url="https://www.openml.org/data/download",
    )
    # Placeholder feature: real feature information is not loaded here.
    feature_node = Feature(
        id_="example-petal-width",
        name="example_petal_width",
        feature_type="https://schema.org/Number",
        description="Feature information not loaded, this is an example.",
    )
    # Placeholder quality, built from a placeholder quality type.
    placeholder_type = QualityType(
        id_="quality-type-example",
        name="number_of_features",
        quality_id="link_to_definition",
    )
    quality_node = Quality(
        id_="example-quality",
        quality_type=placeholder_type,
        value="150",
    )
    checksum_node = MD5Checksum(id_="checksum-id", value=dataset.md5_checksum)

    # Contributor and creator are not available.
    distribution_fields = {
        "id_": "distribution-id",
        "access_url": [f"https://www.openml.org/d/{dataset.id_}"],
        "has_feature": [JsonLDObjectReference[Feature].to(feature_node)],
        "has_quality": [JsonLDObjectReference[Quality].to(quality_node)],
        # Only the first default target attribute is used; None if there is none.
        "default_target_attribute": next(iter(dataset.default_target_attribute), None),
        "download_url": [dataset.url],
        "format_": dataset.format_,
        "checksum": JsonLDObjectReference[MD5Checksum].to(checksum_node),
        "access_service": [JsonLDObjectReference[DataService].to(service)],
    }
    distribution_node = Distribution(**distribution_fields)

    dataset_node = Dataset(
        id_=str(dataset.id_),
        type_="Dataset",
        collection_date=str(dataset.upload_date),
        description=[dataset.description],
        title=[dataset.name],
        distribution=[JsonLDObjectReference[Distribution].to(distribution_node)],
        status=dataset.status,
        version_info=str(dataset.version),
        version_label=dataset.version_label,
        visibility=dataset.visibility,
        keyword=dataset.tags,
        issued=JsonLDQualifiedLiteral(
            value=str(dataset.upload_date),
            type_="http://www.w3.org/2001/XMLSchema#dateTime",
        ),
    )

    nodes = [
        service,
        distribution_node,
        dataset_node,
        feature_node,
        quality_node,
        checksum_node,
    ]
    return JsonLDGraph(
        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context/mldcat-ap.jsonld",
        graph=nodes,
    )

0 comments on commit e95f490

Please sign in to comment.