def _user_can_see_dataset(dataset: dict[str, Any], user: User | None) -> bool:
    """Return whether `user` may view `dataset`.

    A dataset whose "visibility" is "public" is visible to every caller,
    including unauthenticated ones (``user is None``).  Any other dataset is
    visible only to its uploader or to a user in the admin group.

    Args:
        dataset: Dataset record; the keys "visibility" and "uploader" are read.
        user: The requesting user, or None if no user is associated with the request.

    Returns:
        True if the caller may see the dataset, False otherwise.
    """
    if dataset["visibility"] == "public":
        return True
    if user is None:
        # Without a user, only public datasets are visible.
        return False
    return dataset["uploader"] == user.user_id or UserGroup.ADMIN in user.groups


@router.get("/qualities/{dataset_id}")
def get_qualities(
    dataset_id: int,
    user: Annotated[User | None, Depends(fetch_user)],
    expdb: Annotated[Connection, Depends(expdb_connection)],
) -> list[Quality]:
    """Return the qualities stored for the dataset with id `dataset_id`."""
    dataset = get_dataset(dataset_id, expdb)
    # A missing dataset and a dataset the caller may not see produce the same
    # "Unknown dataset" response.
    if not dataset or not _user_can_see_dataset(dataset, user):
        raise HTTPException(
            status_code=http.client.PRECONDITION_FAILED,
            detail={"code": DatasetError.NO_DATA_FILE, "message": "Unknown dataset"},
        )
    rows = expdb.execute(
        text(
            """
            SELECT `quality`,`value`
            FROM data_quality
            WHERE `data`=:dataset_id
            """,
        ),
        parameters={"dataset_id": dataset_id},
    )
    # The PHP API provided (sometimes) helpful error messages, which are not
    # reproduced here yet:
    # if not qualities:
    #    check if dataset exists: error 360
    #    check if user has access: error 361
    #    check if there is a data processed entry and forward the error: 364
    #    if nothing in process table: 363
    #    otherwise: error 362
    return [Quality(name=row.quality, value=row.value) for row in rows]
# One quality of a dataset, as returned by the qualities endpoint.
# Documented with comments rather than a docstring so that the generated
# schema for this model is unchanged.
class Quality(BaseModel):
    # Name of the quality, e.g. "NumberOfInstances".
    name: str
    # Value of the quality; may be None.
    value: float | None
"CfsSubsetEval_kNN1NKappa", "value": 0.6191022730108037}, + {"name": "ClassEntropy", "value": 1.189833856204398}, + {"name": "DecisionStumpAUC", "value": 0.8652735384332186}, + {"name": "DecisionStumpErrRate", "value": 0.22828507795100222}, + {"name": "DecisionStumpKappa", "value": 0.4503332218612649}, + {"name": "Dimensionality", "value": 0.043429844097995544}, + {"name": "EquivalentNumberOfAtts", "value": 26.839183802676523}, + {"name": "J48.00001.AUC", "value": 0.9391585368767195}, + {"name": "J48.00001.ErrRate", "value": 0.10356347438752785}, + {"name": "J48.00001.Kappa", "value": 0.7043302166347443}, + {"name": "J48.0001.AUC", "value": 0.9391585368767195}, + {"name": "J48.0001.ErrRate", "value": 0.10356347438752785}, + {"name": "J48.0001.Kappa", "value": 0.7043302166347443}, + {"name": "J48.001.AUC", "value": 0.9391585368767195}, + {"name": "J48.001.ErrRate", "value": 0.10356347438752785}, + {"name": "J48.001.Kappa", "value": 0.7043302166347443}, + {"name": "MajorityClassPercentage", "value": 76.16926503340757}, + {"name": "MajorityClassSize", "value": 684.0}, + {"name": "MaxAttributeEntropy", "value": 1.8215224482924186}, + {"name": "MaxKurtosisOfNumericAtts", "value": 13.215477213878724}, + {"name": "MaxMeansOfNumericAtts", "value": 1263.0946547884187}, + {"name": "MaxMutualInformation", "value": 0.40908953764451}, + {"name": "MaxNominalAttDistinctValues", "value": 7.0}, + {"name": "MaxSkewnessOfNumericAtts", "value": 3.7616019689156888}, + {"name": "MaxStdDevOfNumericAtts", "value": 1871.3991072665933}, + {"name": "MeanAttributeEntropy", "value": 0.2515351603742048}, + {"name": "MeanKurtosisOfNumericAtts", "value": 4.6480244352098286}, + {"name": "MeanMeansOfNumericAtts", "value": 348.50426818856715}, + {"name": "MeanMutualInformation", "value": 0.044331968697414056}, + {"name": "MeanNoiseToSignalRatio", "value": 4.673900071775454}, + {"name": "MeanNominalAttDistinctValues", "value": 1.6363636363636362}, + {"name": "MeanSkewnessOfNumericAtts", "value": 
def test_get_quality(api_client: TestClient) -> None:
    """The qualities of dataset 1 are returned as name/value records in a fixed order."""
    response = api_client.get("/v1/datasets/qualities/1")
    assert response.status_code == http.client.OK
    # (quality name, value) pairs in the order the endpoint is expected to return them.
    quality_values = [
        ("AutoCorrelation", 0.6064659977703456),
        ("CfsSubsetEval_DecisionStumpAUC", 0.9067742570970945),
        ("CfsSubsetEval_DecisionStumpErrRate", 0.13251670378619154),
        ("CfsSubsetEval_DecisionStumpKappa", 0.6191022730108037),
        ("CfsSubsetEval_NaiveBayesAUC", 0.9067742570970945),
        ("CfsSubsetEval_NaiveBayesErrRate", 0.13251670378619154),
        ("CfsSubsetEval_NaiveBayesKappa", 0.6191022730108037),
        ("CfsSubsetEval_kNN1NAUC", 0.9067742570970945),
        ("CfsSubsetEval_kNN1NErrRate", 0.13251670378619154),
        ("CfsSubsetEval_kNN1NKappa", 0.6191022730108037),
        ("ClassEntropy", 1.189833856204398),
        ("DecisionStumpAUC", 0.8652735384332186),
        ("DecisionStumpErrRate", 0.22828507795100222),
        ("DecisionStumpKappa", 0.4503332218612649),
        ("Dimensionality", 0.043429844097995544),
        ("EquivalentNumberOfAtts", 26.839183802676523),
        ("J48.00001.AUC", 0.9391585368767195),
        ("J48.00001.ErrRate", 0.10356347438752785),
        ("J48.00001.Kappa", 0.7043302166347443),
        ("J48.0001.AUC", 0.9391585368767195),
        ("J48.0001.ErrRate", 0.10356347438752785),
        ("J48.0001.Kappa", 0.7043302166347443),
        ("J48.001.AUC", 0.9391585368767195),
        ("J48.001.ErrRate", 0.10356347438752785),
        ("J48.001.Kappa", 0.7043302166347443),
        ("MajorityClassPercentage", 76.16926503340757),
        ("MajorityClassSize", 684.0),
        ("MaxAttributeEntropy", 1.8215224482924186),
        ("MaxKurtosisOfNumericAtts", 13.215477213878724),
        ("MaxMeansOfNumericAtts", 1263.0946547884187),
        ("MaxMutualInformation", 0.40908953764451),
        ("MaxNominalAttDistinctValues", 7.0),
        ("MaxSkewnessOfNumericAtts", 3.7616019689156888),
        ("MaxStdDevOfNumericAtts", 1871.3991072665933),
        ("MeanAttributeEntropy", 0.2515351603742048),
        ("MeanKurtosisOfNumericAtts", 4.6480244352098286),
        ("MeanMeansOfNumericAtts", 348.50426818856715),
        ("MeanMutualInformation", 0.044331968697414056),
        ("MeanNoiseToSignalRatio", 4.673900071775454),
        ("MeanNominalAttDistinctValues", 1.6363636363636362),
        ("MeanSkewnessOfNumericAtts", 2.0269825910719437),
        ("MeanStdDevOfNumericAtts", 405.17326983791025),
        ("MinAttributeEntropy", -0.0),
        ("MinKurtosisOfNumericAtts", -0.9723842038435437),
        ("MinMeansOfNumericAtts", 1.1985489977728285),
        ("MinMutualInformation", 0.0),
        ("MinNominalAttDistinctValues", 0.0),
        ("MinSkewnessOfNumericAtts", 0.07299048442083138),
        ("MinStdDevOfNumericAtts", 0.871208280971892),
        ("MinorityClassPercentage", 0.8908685968819599),
        ("MinorityClassSize", 8.0),
        ("NaiveBayesAUC", 0.9315907109421729),
        ("NaiveBayesErrRate", 0.24610244988864144),
        ("NaiveBayesKappa", 0.5569590016631507),
        ("NumberOfBinaryFeatures", 4.0),
        ("NumberOfClasses", 5.0),
        ("NumberOfFeatures", 39.0),
        ("NumberOfInstances", 898.0),
        ("NumberOfInstancesWithMissingValues", 898.0),
        ("NumberOfMissingValues", 22175.0),
        ("NumberOfNumericFeatures", 6.0),
        ("NumberOfSymbolicFeatures", 33.0),
        ("PercentageOfBinaryFeatures", 10.256410256410255),
        ("PercentageOfInstancesWithMissingValues", 100.0),
        ("PercentageOfMissingValues", 63.317343384158534),
        ("PercentageOfNumericFeatures", 15.384615384615385),
        ("PercentageOfSymbolicFeatures", 84.61538461538461),
        ("Quartile1AttributeEntropy", 0.0),
        ("Quartile1KurtosisOfNumericAtts", -0.40305022089010156),
        ("Quartile1MeansOfNumericAtts", 3.025695155902005),
        ("Quartile1MutualInformation", 0.0),
        ("Quartile1SkewnessOfNumericAtts", 0.967384603629726),
        ("Quartile1StdDevOfNumericAtts", 10.505435772171138),
        ("Quartile2AttributeEntropy", 0.0),
        ("Quartile2KurtosisOfNumericAtts", 1.6372437439142264),
        ("Quartile2MeansOfNumericAtts", 21.222160356347437),
        ("Quartile2MutualInformation", 0.0),
        ("Quartile2SkewnessOfNumericAtts", 1.6547313364025702),
        ("Quartile2StdDevOfNumericAtts", 69.85338529046133),
        ("Quartile3AttributeEntropy", 0.2385631077559124),
        ("Quartile3KurtosisOfNumericAtts", 12.741748058445403),
        ("Quartile3MeansOfNumericAtts", 901.2636692650334),
        ("Quartile3MutualInformation", 0.0206465881071925),
        ("Quartile3SkewnessOfNumericAtts", 3.7546438249219056),
        ("Quartile3StdDevOfNumericAtts", 771.8590427889504),
        ("REPTreeDepth1AUC", 0.962680369298288),
        ("REPTreeDepth1ErrRate", 0.08463251670378619),
        ("REPTreeDepth1Kappa", 0.768583383630482),
        ("REPTreeDepth2AUC", 0.962680369298288),
        ("REPTreeDepth2ErrRate", 0.08463251670378619),
        ("REPTreeDepth2Kappa", 0.768583383630482),
        ("REPTreeDepth3AUC", 0.962680369298288),
        ("REPTreeDepth3ErrRate", 0.08463251670378619),
        ("REPTreeDepth3Kappa", 0.768583383630482),
        ("RandomTreeDepth1AUC", 0.9296999989655875),
        ("RandomTreeDepth1ErrRate", 0.0801781737193764),
        ("RandomTreeDepth1Kappa", 0.7953250436852635),
        ("RandomTreeDepth2AUC", 0.9296999989655875),
        ("RandomTreeDepth2ErrRate", 0.0801781737193764),
        ("RandomTreeDepth2Kappa", 0.7953250436852635),
        ("RandomTreeDepth3AUC", 0.9296999989655875),
        ("RandomTreeDepth3ErrRate", 0.0801781737193764),
        ("RandomTreeDepth3Kappa", 0.7953250436852635),
        ("StdvNominalAttDistinctValues", 1.5576059718800395),
        ("kNN1NAUC", 0.8721948540771287),
        ("kNN1NErrRate", 0.06347438752783964),
        ("kNN1NKappa", 0.8261102938928316),
    ]
    expected = [{"name": name, "value": value} for name, value in quality_values]
    assert response.json() == expected
@pytest.mark.php()
@pytest.mark.parametrize(
    "data_id",
    # Sorted so the parametrisation order does not depend on set iteration order.
    sorted(set(range(1, 132)) - {55, 56, 59, 116, 130}),
)
def test_get_quality_identical(data_id: int, api_client: TestClient) -> None:
    """The Python endpoint returns the same qualities as the PHP API for `data_id`."""
    php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
    python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
    assert python_response.status_code == php_response.status_code

    expected = [
        {
            "name": quality["name"],
            # An empty list in the PHP response corresponds to None here;
            # every other value is compared as a float.
            "value": None if quality["value"] == [] else float(quality["value"]),
        }
        for quality in php_response.json()["data_qualities"]["quality"]
    ]
    assert python_response.json() == expected


@pytest.mark.php()
@pytest.mark.parametrize(
    "data_id",
    [55, 56, 59, 116, 130, 132],
)
def test_get_quality_identical_error(data_id: int, api_client: TestClient) -> None:
    """For ids that yield an error, status code and message match the PHP API."""
    if data_id in {55, 56, 59}:
        pytest.skip("Detailed error for code 364 (failed processing) not yet supported.")
    if data_id == 116:
        pytest.skip("Detailed error for code 362 (no qualities) not yet supported.")
    php_response = httpx.get(f"http://server-api-php-api-1:80/api/v1/json/data/qualities/{data_id}")
    python_response = api_client.get(f"/v1/datasets/qualities/{data_id}")
    assert python_response.status_code == php_response.status_code
    # The "dataset unknown" error currently has a separate code in PHP depending on
    # where it occurs (e.g., get dataset->113 get quality->361), so only the
    # message is compared, not the code.
    assert python_response.json()["detail"]["message"] == php_response.json()["error"]["message"]