diff --git a/docs/migration.md b/docs/migration.md
index f241cee..61eef4b 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -91,6 +91,45 @@ includes datasets which are private.
 
 The `limit` and `offset` parameters can now be used independently; you no longer need to provide both if you wish to set only one.
 
+## Studies
+
+### `GET /{id_or_alias}`
+
+Old-style "legacy" studies, which are based solely on tags, are no longer supported.
+
+??? info "Affected Legacy Studies"
+
+    Only 24 old studies were affected by this change, listed below.
+    There is currently no migration plan for these studies.
+
+    | id | name |
+    | --: | :-- |
+    | 1 | A large-scale comparison of classification algorit... |
+    | 2 | Fast Algorithm Selection using Learning Curves |
+    | 3 | Multi-Task Learning with a Natural Metric for Quan... |
+    | 5 | Local and Global Feature Selection on Multilabel T... |
+    | 7 | Massive machine learning experiments using mlr and... |
+    | 8 | Decision tree comparaison |
+    | 10 | Collaborative primer |
+    | 11 | Having a Blast: Meta-Learning and Heterogeneous En... |
+    | 12 | Subspace Clustering via Seeking Neighbors with Min... |
+    | 13 | Meta-QSAR: learning how to learn QSARs |
+    | 17 | Subgroup Discovery |
+    | 20 | Mythbusting data mining urban legends through larg... |
+    | 22 | Identifying critical paths in undergraduate progra... |
+    | 24 | OpenML R paper |
+    | 25 | Bernd Demo Study for Multiclass SVMs OML WS 2016 |
+    | 27 | Compare three different SVM versions of R package ... |
+    | 30 | OpenML Paper Study |
+    | 31 | Iris Data set Study |
+    | 32 | Data Streams and more |
+    | 34 | Massively Collaborative Machine Learning |
+    | 37 | Speeding up Algorithm Selection via Meta-learning ... |
+    | 38 | Performance of new ctree implementations on classi... |
+    | 41 | ASLib OpenML Scenario |
+    | 50 | Hyper-parameter tuning of Decision Trees |
+    | 51 | ensemble on diabetes |
+
 ## Others
 
 ### `GET /estimationprocedure/list`
diff --git a/src/core/formatting.py b/src/core/formatting.py
index b61795e..e298b3e 100644
--- a/src/core/formatting.py
+++ b/src/core/formatting.py
@@ -7,9 +7,9 @@
 
 
 def _str_to_bool(string: str) -> bool:
-    if string.casefold() in ["true", "1", "yes"]:
+    if string.casefold() in ["true", "1", "yes", "y"]:
         return True
-    if string.casefold() in ["false", "0", "no"]:
+    if string.casefold() in ["false", "0", "no", "n"]:
         return False
     msg = f"Could not parse {string=} as bool."
     raise ValueError(msg)
diff --git a/src/database/studies.py b/src/database/studies.py
new file mode 100644
index 0000000..047795c
--- /dev/null
+++ b/src/database/studies.py
@@ -0,0 +1,68 @@
+from typing import cast
+
+from schemas.study import StudyType
+from sqlalchemy import Connection, Row, text
+
+
+def get_study_by_id(study_id: int, connection: Connection) -> Row:
+    return connection.execute(
+        text(
+            """
+            SELECT *, main_entity_type as type_
+            FROM study
+            WHERE id = :study_id
+            """,
+        ),
+        parameters={"study_id": study_id},
+    ).fetchone()
+
+
+def get_study_by_alias(alias: str, connection: Connection) -> Row:
+    return connection.execute(
+        text(
+            """
+            SELECT *, main_entity_type as type_
+            FROM study
+            WHERE alias = :study_id
+            """,
+        ),
+        parameters={"study_id": alias},
+    ).fetchone()
+
+
+def get_study_data(study: Row, expdb: Connection) -> list[Row]:
+    if study.type_ == StudyType.TASK:
+        return cast(
+            list[Row],
+            expdb.execute(
+                text(
+                    """
+                    SELECT ts.task_id as task_id, ti.value as data_id
+                    FROM task_study as ts LEFT JOIN task_inputs ti ON ts.task_id = ti.task_id
+                    WHERE ts.study_id = :study_id AND ti.input = 'source_data'
+                    """,
+                ),
+                parameters={"study_id": study.id},
+            ).fetchall(),
+        )
+    return cast(
+        list[Row],
+        expdb.execute(
+            text(
+                """
+                SELECT
+                    rs.run_id as run_id,
+                    run.task_id as task_id,
+                    run.setup as setup_id,
+                    ti.value as data_id,
+                    setup.implementation_id as flow_id
+                FROM run_study as rs
+                JOIN run ON run.rid = rs.run_id
+                JOIN algorithm_setup as setup ON setup.sid = run.setup
+                JOIN task_inputs as ti ON ti.task_id = run.task_id
+                WHERE rs.study_id = :study_id AND ti.input = 'source_data'
+                """,
+            ),
+            parameters={"study_id": study.id},
+        ).fetchall(),
+    )
diff --git a/src/main.py b/src/main.py
index 074b512..8228749 100644
--- a/src/main.py
+++ b/src/main.py
@@ -8,6 +8,7 @@
 from routers.openml.evaluations import router as evaluationmeasures_router
 from routers.openml.flows import router as flows_router
 from routers.openml.qualities import router as qualities_router
+from routers.openml.study import router as study_router
 from routers.openml.tasks import router as task_router
 from routers.openml.tasktype import router as ttype_router
 
@@ -49,6 +50,7 @@ def create_api() -> FastAPI:
     app.include_router(estimationprocedure_router)
     app.include_router(task_router)
     app.include_router(flows_router)
+    app.include_router(study_router)
 
     return app
 
diff --git a/src/routers/openml/study.py b/src/routers/openml/study.py
new file mode 100644
index 0000000..f730965
--- /dev/null
+++ b/src/routers/openml/study.py
@@ -0,0 +1,62 @@
+import http.client
+from typing import Annotated
+
+from core.formatting import _str_to_bool
+from database.studies import get_study_by_alias, get_study_by_id, get_study_data
+from database.users import User, UserGroup
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.core import Visibility
+from schemas.study import Study, StudyType
+from sqlalchemy import Connection, Row
+
+from routers.dependencies import expdb_connection, fetch_user
+
+router = APIRouter(prefix="/studies", tags=["studies"])
+
+
+def _get_study_raise_otherwise(id_or_alias: int | str, user: User | None, expdb: Connection) -> Row:
+    if isinstance(id_or_alias, int) or id_or_alias.isdigit():
+        study = get_study_by_id(int(id_or_alias), expdb)
+    else:
+        study = get_study_by_alias(id_or_alias, expdb)
+
+    if study is None:
+        raise HTTPException(status_code=http.client.NOT_FOUND, detail="Study not found.")
+    if study.visibility == Visibility.PRIVATE:
+        if user is None:
+            raise HTTPException(status_code=http.client.UNAUTHORIZED, detail="Study is private.")
+        if study.creator != user.user_id and UserGroup.ADMIN not in user.groups:
+            raise HTTPException(status_code=http.client.FORBIDDEN, detail="Study is private.")
+    if _str_to_bool(study.legacy):
+        raise HTTPException(
+            status_code=http.client.GONE,
+            detail="Legacy studies are no longer supported",
+        )
+
+    return study
+
+
+@router.get("/{alias_or_id}")
+def get_study(
+    alias_or_id: int | str,
+    user: Annotated[User | None, Depends(fetch_user)] = None,
+    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> Study:
+    study = _get_study_raise_otherwise(alias_or_id, user, expdb)
+    study_data = get_study_data(study, expdb)
+    return Study(
+        id_=study.id,
+        name=study.name,
+        alias=study.alias,
+        main_entity_type=study.type_,
+        description=study.description,
+        visibility=study.visibility,
+        status=study.status,
+        creation_date=study.creation_date,
+        creator=study.creator,
+        data_ids=[row.data_id for row in study_data],
+        task_ids=[row.task_id for row in study_data],
+        run_ids=[row.run_id for row in study_data] if study.type_ == StudyType.RUN else [],
+        flow_ids=[row.flow_id for row in study_data] if study.type_ == StudyType.RUN else [],
+        setup_ids=[row.setup_id for row in study_data] if study.type_ == StudyType.RUN else [],
+    )
diff --git a/src/schemas/core.py b/src/schemas/core.py
new file mode 100644
index 0000000..77287de
--- /dev/null
+++ b/src/schemas/core.py
@@ -0,0 +1,6 @@
+from enum import StrEnum, auto
+
+
+class Visibility(StrEnum):
+    PUBLIC = auto()
+    PRIVATE = auto()
diff --git a/src/schemas/study.py b/src/schemas/study.py
new file mode 100644
index 0000000..a313ba5
--- /dev/null
+++ b/src/schemas/study.py
@@ -0,0 +1,34 @@
+from datetime import datetime
+from enum import StrEnum, auto
+
+from pydantic import BaseModel, Field
+
+from schemas.core import Visibility
+
+
+class StudyType(StrEnum):
+    RUN = auto()
+    TASK = auto()
+
+
+class StudyStatus(StrEnum):
+    ACTIVE = auto()
+    DEACTIVATED = auto()
+    IN_PREPARATION = auto()
+
+
+class Study(BaseModel):
+    id_: int = Field(serialization_alias="id")
+    name: str
+    alias: str | None
+    main_entity_type: StudyType
+    description: str
+    visibility: Visibility
+    status: StudyStatus
+    creation_date: datetime
+    creator: int
+    task_ids: list[int]
+    run_ids: list[int]
+    data_ids: list[int]
+    setup_ids: list[int]
+    flow_ids: list[int]
diff --git a/tests/routers/openml/migration/studies_migration_test.py b/tests/routers/openml/migration/studies_migration_test.py
new file mode 100644
index 0000000..c820b1b
--- /dev/null
+++ b/tests/routers/openml/migration/studies_migration_test.py
@@ -0,0 +1,36 @@
+import deepdiff
+import httpx
+import pytest
+from core.conversions import nested_num_to_str, nested_remove_nones
+from starlette.testclient import TestClient
+
+
+@pytest.mark.php()
+def test_get_study_equal(py_api: TestClient, php_api: httpx.Client) -> None:
+    new = py_api.get("/studies/1")
+    old = php_api.get("/study/1")
+    assert new.status_code == old.status_code
+
+    new = new.json()
+    # New implementation is typed
+    new = nested_num_to_str(new)
+    # New implementation has same fields even if empty
+    new = nested_remove_nones(new)
+    new["tasks"] = {"task_id": new.pop("task_ids")}
+    new["data"] = {"data_id": new.pop("data_ids")}
+    if runs := new.pop("run_ids", None):
+        new["runs"] = {"run_id": runs}
+    if flows := new.pop("flow_ids", None):
+        new["flows"] = {"flow_id": flows}
+    if setups := new.pop("setup_ids", None):
+        new["setup"] = {"setup_id": setups}
+
+    # New implementation is not nested
+    new = {"study": new}
+    difference = deepdiff.diff.DeepDiff(
+        new,
+        old.json(),
+        ignore_order=True,
+        ignore_numeric_type_changes=True,
+    )
+    assert not difference
diff --git a/tests/routers/openml/study_test.py b/tests/routers/openml/study_test.py
new file mode 100644
index 0000000..96ffba0
--- /dev/null
+++ b/tests/routers/openml/study_test.py
@@ -0,0 +1,449 @@
+from starlette.testclient import TestClient
+
+
+def test_get_task_study_by_id(py_api: TestClient) -> None:
+    response = py_api.get("/studies/1")
+    assert response.status_code == 200
+    expected = {
+        "id": 1,
+        "alias": "OpenML100",
+        "main_entity_type": "task",
+        "name": "OpenML100",
+        "description": "OpenML100 equivalent on capa",
+        "visibility": "public",
+        "status": "active",
+        "creation_date": "2019-02-25T17:15:01",
+        "creator": 1,
+        "data_ids": [
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+            11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+            21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+            31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+            41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+            51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+            61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+            71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+            81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+            91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+        ],
+        "task_ids": [
+            1, 7, 13, 19, 25, 31, 37, 43, 49, 55,
+            61, 67, 73, 79, 85, 91, 97, 103, 109, 115,
+            121, 127, 133, 139, 145, 151, 157, 163, 169, 175,
+            181, 187, 193, 199, 205, 211, 217, 223, 229, 235,
+            241, 247, 253, 259, 265, 271, 277, 283, 289, 295,
+            301, 307, 313, 319, 325, 331, 337, 343, 349, 355,
+            361, 367, 373, 379, 385, 391, 397, 403, 409, 415,
+            421, 427, 433, 439, 445, 451, 457, 463, 469, 475,
+            481, 487, 493, 499, 505, 511, 517, 523, 529, 535,
+            541, 547, 553, 559, 565, 571, 577, 583, 589, 595,
+        ],
+        "run_ids": [],
+        "flow_ids": [],
+        "setup_ids": [],
+    }
+    assert response.json() == expected
+
+
+def test_get_task_study_by_alias(py_api: TestClient) -> None:
+    response = py_api.get("/studies/OpenML100")
+    assert response.status_code == 200
+    expected = {
+        "id": 1,
+        "alias": "OpenML100",
+        "main_entity_type": "task",
+        "name": "OpenML100",
+        "description": "OpenML100 equivalent on capa",
+        "visibility": "public",
+        "status": "active",
+        "creation_date": "2019-02-25T17:15:01",
+        "creator": 1,
+        "data_ids": [
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+            11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+            21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+            31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+            41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+            51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+            61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+            71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+            81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+            91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+        ],
+        "task_ids": [
+            1, 7, 13, 19, 25, 31, 37, 43, 49, 55,
+            61, 67, 73, 79, 85, 91, 97, 103, 109, 115,
+            121, 127, 133, 139, 145, 151, 157, 163, 169, 175,
+            181, 187, 193, 199, 205, 211, 217, 223, 229, 235,
+            241, 247, 253, 259, 265, 271, 277, 283, 289, 295,
+            301, 307, 313, 319, 325, 331, 337, 343, 349, 355,
+            361, 367, 373, 379, 385, 391, 397, 403, 409, 415,
+            421, 427, 433, 439, 445, 451, 457, 463, 469, 475,
+            481, 487, 493, 499, 505, 511, 517, 523, 529, 535,
+            541, 547, 553, 559, 565, 571, 577, 583, 589, 595,
+        ],
+        "run_ids": [],
+        "flow_ids": [],
+        "setup_ids": [],
+    }
+    assert response.json() == expected
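
For reference, a minimal client-side sketch (not part of the diff above) of how the new endpoint behaves. The base URL and the study ids/aliases used are assumptions for illustration; the status codes and field names follow the router and schema code in the diff:

# Illustration only: the base URL and the study ids used here are assumptions;
# substitute ids/aliases that exist in the database you are querying.
import httpx

BASE = "http://localhost:8000"

# GET /studies/{id_or_alias} accepts either a numeric id or an alias.
study = httpx.get(f"{BASE}/studies/OpenML100").json()
print(study["main_entity_type"])              # "task" or "run"
print(len(study["task_ids"]), len(study["data_ids"]))
# run_ids, flow_ids and setup_ids are only populated for run-type studies.

# Unknown studies return 404; private studies return 401 (anonymous) or 403
# (authenticated but neither creator nor admin); legacy tag-based studies return 410.
gone = httpx.get(f"{BASE}/studies/2")         # hypothetical legacy study id
assert gone.status_code == 410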
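
The tests above only cover the happy path for a task-type study. A sketch of how the error branches of `_get_study_raise_otherwise` could be exercised with the same `py_api` fixture; the study ids are placeholders and assume the test database contains no study 999999 and does contain a study flagged as legacy:

from starlette.testclient import TestClient


def test_get_study_unknown_returns_404(py_api: TestClient) -> None:
    # Placeholder id assumed absent from the test database.
    response = py_api.get("/studies/999999")
    assert response.status_code == 404
    assert response.json()["detail"] == "Study not found."


def test_get_study_legacy_returns_410(py_api: TestClient) -> None:
    # Placeholder id: requires a study whose legacy flag parses as true.
    response = py_api.get("/studies/3")
    assert response.status_code == 410
    assert response.json()["detail"] == "Legacy studies are no longer supported"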