From e95f490e2a91e038313e60eb8f7e644a7ca13a87 Mon Sep 17 00:00:00 2001
From: Pieter Gijsbers
Date: Wed, 26 Jun 2024 10:48:31 +0200
Subject: [PATCH] Separate mldcat ap endpoints (#160)

* Give MLDCAT-AP endpoint a separate tag

* Extract out Feature

* Finalize first iteration separation of endpoints

There are now endpoints for Dataset+Distribution, Feature, Quality and DataService.

* Serialize status and visibility as url

* Respond with 404 for unknown qualities and features

The quality and feature endpoints previously failed with an unhandled
StopIteration/IndexError. Unset status/visibility values are no longer
serialized as a "...#None" url.
---
 src/routers/mldcat_ap/dataset.py  | 200 +++++++++++++++++++++++++++---
 src/routers/openml/qualities.py   |   2 +-
 src/schemas/datasets/mldcat_ap.py |  97 +++------------
 3 files changed, 206 insertions(+), 93 deletions(-)

diff --git a/src/routers/mldcat_ap/dataset.py b/src/routers/mldcat_ap/dataset.py
index 771edb8..ea43c74 100644
--- a/src/routers/mldcat_ap/dataset.py
+++ b/src/routers/mldcat_ap/dataset.py
@@ -1,27 +1,199 @@
+"""Router for MLDCAT-AP endpoints: https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/#examples
+
+Incredibly inefficient, but it's just a proof of concept.
+Specific queries could be written to fetch e.g., a single feature or quality.
+"""
+
 from typing import Annotated
 
-from fastapi import APIRouter, Depends
-from schemas.datasets.mldcat_ap import JsonLDGraph, convert_to_mldcat_ap
+import config
+from database.users import User
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.datasets.mldcat_ap import (
+    DataService,
+    Dataset,
+    Distribution,
+    Feature,
+    JsonLDGraph,
+    JsonLDObjectReference,
+    JsonLDQualifiedLiteral,
+    MD5Checksum,
+    Quality,
+)
 from sqlalchemy import Connection
 
-from routers.dependencies import expdb_connection, userdb_connection
-from routers.openml.datasets import get_dataset
+from routers.dependencies import expdb_connection, fetch_user, userdb_connection
+from routers.openml.datasets import get_dataset, get_dataset_features
+from routers.openml.qualities import get_qualities
 
-router = APIRouter(prefix="/mldcat_ap/datasets", tags=["datasets"])
+router = APIRouter(prefix="/mldcat_ap", tags=["MLDCAT-AP"])
+_configuration = config.load_configuration()
+_server_url = (
+    f"{_configuration['arff_base_url']}{_configuration['fastapi']['root_path']}{router.prefix}"
+)
 
 
 @router.get(
-    path="/{dataset_id}",
-    description="Get meta-data for dataset with ID `dataset_id`.",
+    path="/distribution/{distribution_id}",
+    description="Get meta-data for distribution with ID `distribution_id`.",
 )
-def get_mldcat_ap_dataset(
-    dataset_id: int,
+def get_mldcat_ap_distribution(
+    distribution_id: int,
+    user: Annotated[User | None, Depends(fetch_user)] = None,
     user_db: Annotated[Connection, Depends(userdb_connection)] = None,
-    expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
+    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
 ) -> JsonLDGraph:
-    openml_dataset = get_dataset(
-        dataset_id=dataset_id,
+    """Describe a dataset as MLDCAT-AP Dataset, Distribution, DataService and checksum nodes."""
+    oml_dataset = get_dataset(
+        dataset_id=distribution_id,
+        user=user,
         user_db=user_db,
-        expdb_db=expdb_db,
+        expdb_db=expdb,
+    )
+    openml_features = get_dataset_features(distribution_id, user, expdb)
+    features = [
+        Feature(
+            id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
+            name=feature.name,
+            feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
+        )
+        for feature in openml_features
+    ]
+    oml_qualities = get_qualities(distribution_id, user, expdb)
+    qualities = [
+        Quality(
+            id_=f"{_server_url}/quality/{quality.name}/{distribution_id}",
+            quality_type=f"{_server_url}/quality/{quality.name}",
+            value=str(quality.value),
+        )
+        for quality in oml_qualities
+    ]
+    checksum = MD5Checksum(
+        id_=f"{_server_url}/checksum/{distribution_id}",
+        value=oml_dataset.md5_checksum,
+    )
+    arff_service = DataService(
+        id_=f"{_server_url}/dataservice/1",
+        endpoint_url=_server_url,
+        title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
+    )
+    distribution = Distribution(
+        id_=f"{_server_url}/distribution/{distribution_id}",
+        access_url=[f"https://www.openml.org/d/{distribution_id}"],
+        has_feature=[JsonLDObjectReference[Feature].to(feature) for feature in features],
+        has_quality=[JsonLDObjectReference[Quality].to(quality) for quality in qualities],
+        default_target_attribute=next(iter(oml_dataset.default_target_attribute), None),
+        download_url=[oml_dataset.url],
+        format_=oml_dataset.format_,
+        checksum=JsonLDObjectReference[MD5Checksum].to(checksum),
+        access_service=[JsonLDObjectReference[DataService].to(arff_service)],
+    )
+    mldcat_dataset = Dataset(
+        id_=str(distribution_id),
+        type_="Dataset",
+        collection_date=str(oml_dataset.upload_date),
+        description=[oml_dataset.description],
+        title=[oml_dataset.name],
+        distribution=[JsonLDObjectReference[Distribution].to(distribution)],
+        status=oml_dataset.status,
+        version_info=str(oml_dataset.version),
+        version_label=oml_dataset.version_label,
+        visibility=oml_dataset.visibility,
+        keyword=oml_dataset.tags,
+        issued=JsonLDQualifiedLiteral(
+            value=str(oml_dataset.upload_date),
+            type_="http://www.w3.org/2001/XMLSchema#dateTime",
+        ),
+    )
+    return JsonLDGraph(
+        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+        graph=[
+            mldcat_dataset,
+            distribution,
+            arff_service,
+            checksum,
+        ],
+    )
+
+
+@router.get(
+    path="/dataservice/{service_id}",
+    description="Get meta-data for a specific data service.",
+)
+def get_dataservice(service_id: int) -> JsonLDGraph:
+    """Describe the single known data service (ID 1); any other ID yields a 404."""
+    if service_id != 1:
+        raise HTTPException(status_code=404, detail="Service not found.")
+    return JsonLDGraph(
+        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+        graph=[
+            DataService(
+                id_=f"{_server_url}/dataservice/{service_id}",
+                endpoint_url=_server_url,
+                title=["REST API for sharing OpenML metadata in MLDCAT-AP format."],
+            ),
+        ],
+    )
+
+
+@router.get(
+    path="/quality/{quality_name}/{distribution_id}",
+    description="Get meta-data for a specific quality and distribution.",
+)
+def get_distribution_quality(
+    quality_name: str,
+    distribution_id: int,
+    user: Annotated[User | None, Depends(fetch_user)] = None,
+    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> JsonLDGraph:
+    """Describe one quality of a distribution; responds 404 if it has no such quality."""
+    qualities = get_qualities(distribution_id, user, expdb)
+    # Unknown quality names are a client error: answer 404 instead of crashing.
+    quality = next((q for q in qualities if q.name == quality_name), None)
+    if quality is None:
+        raise HTTPException(status_code=404, detail="Quality not found.")
+    example_quality = Quality(
+        id_=f"{_server_url}/quality/{quality_name}/{distribution_id}",
+        quality_type=f"{_server_url}/quality/{quality_name}",
+        value=str(quality.value),
+    )
+
+    return JsonLDGraph(
+        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+        graph=[
+            example_quality,
+        ],
+    )
+
+
+@router.get(
+    path="/feature/{distribution_id}/{feature_no}",
+    description="Get meta-data for the n-th feature of a distribution.",
+)
+def get_distribution_feature(
+    distribution_id: int,
+    feature_no: int,
+    user: Annotated[User | None, Depends(fetch_user)] = None,
+    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> JsonLDGraph:
+    """Describe one feature of a distribution; responds 404 if it has no such feature."""
+    features = get_dataset_features(
+        dataset_id=distribution_id,
+        user=user,
+        expdb=expdb,
+    )
+    # Match on the feature's own index (the value used in its `id_` url), not list position.
+    feature = next((f for f in features if f.index == feature_no), None)
+    if feature is None:
+        raise HTTPException(status_code=404, detail="Feature not found.")
+    mldcat_feature = Feature(
+        id_=f"{_server_url}/feature/{distribution_id}/{feature.index}",
+        name=feature.name,
+        feature_type=f"{_server_url}/schema/feature-type#{feature.data_type}",
+    )
+    return JsonLDGraph(
+        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context.jsonld",
+        graph=[
+            mldcat_feature,
+        ],
     )
-    return convert_to_mldcat_ap(openml_dataset)
diff --git a/src/routers/openml/qualities.py b/src/routers/openml/qualities.py
index 4231d5a..c498a54 100644
--- a/src/routers/openml/qualities.py
+++ b/src/routers/openml/qualities.py
@@ -29,7 +29,7 @@ def list_qualities(
 @router.get("/qualities/{dataset_id}")
 def get_qualities(
     dataset_id: int,
-    user: Annotated[User, Depends(fetch_user)],
+    user: Annotated[User | None, Depends(fetch_user)],
     expdb: Annotated[Connection, Depends(expdb_connection)],
 ) -> list[Quality]:
     dataset = get_dataset(dataset_id, expdb)
diff --git a/src/schemas/datasets/mldcat_ap.py b/src/schemas/datasets/mldcat_ap.py
index 0d9836c..cfbe1b7 100644
--- a/src/schemas/datasets/mldcat_ap.py
+++ b/src/schemas/datasets/mldcat_ap.py
@@ -12,9 +12,9 @@
 from enum import StrEnum
 from typing import Generic, Literal, TypeVar
 
-from pydantic import BaseModel, Field, HttpUrl
+from pydantic import BaseModel, Field, HttpUrl, field_serializer, model_serializer
 
-from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Visibility
+from schemas.datasets.openml import DatasetStatus, Visibility
 
 
 class JsonLDQualifiedLiteral(BaseModel):
@@ -54,6 +54,10 @@ def to(cls, json_ld_object: T) -> JsonLDObjectReference[T]:
         """Create a reference to `json_ld_object`"""
         return cls(id_=json_ld_object.id_)
 
+    @model_serializer
+    def ser_model(self) -> str:
+        return self.id_
+
 
 class AccessRights(StrEnum):
     """Recommend values for 'access rights' within DCAT-AP context"""
@@ -101,18 +105,11 @@ class Feature(JsonLDObject):
     type_: Literal["Feature"] = Field(default="Feature", serialization_alias="@type")
     name: str = Field(serialization_alias="Feature.name")
     feature_type: str = Field(serialization_alias="Feature.type")
-    description: JsonLiteral | None = Field(default=None, serialization_alias="Feature.description")
-
-
-class QualityType(JsonLDObject):
-    type_: Literal["QualityType"] = Field(default="QualityType", serialization_alias="@type")
-    name: str = Field(serialization_alias="QualityType.name")
-    quality_id: str = Field(serialization_alias="QualityType.id")
 
 
 class Quality(JsonLDObject):
     type_: Literal["Quality"] = Field(default="Quality", serialization_alias="@type")
-    quality_type: QualityType = Field(serialization_alias="Quality.type")
+    quality_type: str = Field(serialization_alias="Quality.type")
     value: JsonLiteral = Field(serialization_alias="Quality.value")
 
 
@@ -218,7 +215,10 @@ class Dataset(JsonLDObject):
         default_factory=list,
         serialization_alias="Dataset.hasVersion",
     )
-    identifier: list[JsonLiteral] = Field(default_factory=list)
+    identifier: list[JsonLiteral] = Field(
+        default_factory=list,
+        serialization_alias="Dataset.identifier",
+    )
     is_referenced_by: list[JsonLiteral] = Field(
         default_factory=list,
         serialization_alias="Dataset.isReferencedBy",
@@ -248,6 +248,14 @@ class Dataset(JsonLDObject):
     )
     visibility: Visibility | None = Field(default=None, serialization_alias="Dataset.visibility")
 
+    @field_serializer("status", when_used="unless-none")
+    def serialize_status(self, v: DatasetStatus) -> str:
+        return f"https://openml.org/mldcatap/mldcat-ap/dataset-status#{v}"
+
+    @field_serializer("visibility", when_used="unless-none")
+    def serialize_visibility(self, v: Visibility) -> str:
+        return f"https://openml.org/mldcatap/mldcat-ap/dataset-visibility#{v}"
+
 
 class DataService(JsonLDObject):
     type_: Literal["DataService"] = Field(default="DataService", serialization_alias="@type")
@@ -276,70 +284,3 @@ class JsonLDGraph(BaseModel):
     )
 
     model_config = {"populate_by_name": True, "extra": "forbid"}
-
-
-def convert_to_mldcat_ap(dataset: DatasetMetadata) -> JsonLDGraph:
-    arff_service = DataService(
-        id_="openml-arff-service",
-        title=["OpenML ARFF server"],
-        endpoint_url="https://www.openml.org/data/download",
-    )
-    example_feature = Feature(
-        id_="example-petal-width",
-        name="example_petal_width",
-        feature_type="https://schema.org/Number",
-        description="Feature information not loaded, this is an example.",
-    )
-
-    example_quality = Quality(
-        id_="example-quality",
-        quality_type=QualityType(
-            id_="quality-type-example",
-            name="number_of_features",
-            quality_id="link_to_definition",
-        ),
-        value="150",
-    )
-    checksum = MD5Checksum(id_="checksum-id", value=dataset.md5_checksum)
-    # contributor and creator N/A
-    distribution = Distribution(
-        id_="distribution-id",
-        access_url=[f"https://www.openml.org/d/{dataset.id_}"],
-        has_feature=[JsonLDObjectReference[Feature].to(example_feature)],
-        has_quality=[JsonLDObjectReference[Quality].to(example_quality)],
-        default_target_attribute=next(iter(dataset.default_target_attribute), None),
-        download_url=[dataset.url],
-        format_=dataset.format_,
-        checksum=JsonLDObjectReference[MD5Checksum].to(checksum),
-        access_service=[JsonLDObjectReference[DataService].to(arff_service)],
-    )
-
-    mldcat_dataset = Dataset(
-        id_=str(dataset.id_),
-        type_="Dataset",
-        collection_date=str(dataset.upload_date),
-        description=[dataset.description],
-        title=[dataset.name],
-        distribution=[JsonLDObjectReference[Distribution].to(distribution)],
-        status=dataset.status,
-        version_info=str(dataset.version),
-        version_label=dataset.version_label,
-        visibility=dataset.visibility,
-        keyword=dataset.tags,
-        issued=JsonLDQualifiedLiteral(
-            value=str(dataset.upload_date),
-            type_="http://www.w3.org/2001/XMLSchema#dateTime",
-        ),
-    )
-
-    return JsonLDGraph(
-        context="https://semiceu.github.io/MLDCAT-AP/releases/1.0.0/context/mldcat-ap.jsonld",
-        graph=[
-            arff_service,
-            distribution,
-            mldcat_dataset,
-            example_feature,
-            example_quality,
-            checksum,
-        ],
-    )