diff --git a/src/core/conversions.py b/src/core/conversions.py
new file mode 100644
index 0000000..70ca9cc
--- /dev/null
+++ b/src/core/conversions.py
@@ -0,0 +1,76 @@
+from typing import Any
+
+
+def _str_to_num(string: str) -> int | float | str:
+    """Tries to convert the string to integer, otherwise float, otherwise returns the input.
+
+    `str.isdecimal` is used instead of `str.isdigit`, because the latter also accepts
+    characters such as "²" for which `int` raises a `ValueError`.
+    """
+    if string.isdecimal():
+        return int(string)
+    try:
+        return float(string)
+    except ValueError:
+        return string
+
+
+def nested_str_to_num(obj: Any) -> Any:
+    """Recursively tries to convert all strings in the object to numbers.
+    For dictionaries, only the values will be converted."""
+    if isinstance(obj, dict):
+        return {key: nested_str_to_num(val) for key, val in obj.items()}
+    if isinstance(obj, list):
+        return [nested_str_to_num(val) for val in obj]
+    if isinstance(obj, str):
+        return _str_to_num(obj)
+    return obj
+
+
+def nested_num_to_str(obj: Any) -> Any:
+    """Recursively tries to convert all numbers in the object to strings.
+    For dictionaries, only the values will be converted."""
+    if isinstance(obj, dict):
+        return {key: nested_num_to_str(val) for key, val in obj.items()}
+    if isinstance(obj, list):
+        return [nested_num_to_str(val) for val in obj]
+    if isinstance(obj, (int, float)):
+        return str(obj)
+    return obj
+
+
+def nested_int_to_str(obj: Any) -> Any:
+    """Recursively converts all integers in the object to strings.
+    For dictionaries, only the values will be converted.
+    Booleans are converted as well, because `bool` is a subclass of `int`."""
+    if isinstance(obj, dict):
+        return {key: nested_int_to_str(val) for key, val in obj.items()}
+    if isinstance(obj, list):
+        return [nested_int_to_str(val) for val in obj]
+    if isinstance(obj, int):
+        return str(obj)
+    return obj
+
+
+def nested_remove_nones(obj: Any) -> Any:
+    """Recursively removes all `None` dictionary values and list elements from the object.
+    Dictionaries and lists that become empty as a result are kept."""
+    if isinstance(obj, dict):
+        # A recursive call only returns `None` for a `None` input, so testing `val` is
+        # sufficient and avoids processing every nested value twice.
+        return {key: nested_remove_nones(val) for key, val in obj.items() if val is not None}
+    if isinstance(obj, list):
+        return [nested_remove_nones(val) for val in obj if val is not None]
+    return obj
+
+
+def nested_remove_single_element_list(obj: Any) -> Any:
+    """Recursively replaces each list that has exactly one element by that (converted) element.
+    For dictionaries, only the values will be converted."""
+    if isinstance(obj, dict):
+        return {key: nested_remove_single_element_list(val) for key, val in obj.items()}
+    if isinstance(obj, list):
+        if len(obj) == 1:
+            return nested_remove_single_element_list(obj[0])
+        return [nested_remove_single_element_list(val) for val in obj]
+    return obj
diff --git a/src/database/flows.py b/src/database/flows.py
new file mode 100644
index 0000000..c369210
--- /dev/null
+++ b/src/database/flows.py
@@ -0,0 +1,60 @@
+from typing import Any
+
+from sqlalchemy import Connection, CursorResult, text
+
+
+def get_flow_subflows(flow_id: int, expdb: Connection) -> CursorResult[Any]:
+    """Select the `child_id` and `identifier` of every component whose parent is `flow_id`."""
+    return expdb.execute(
+        text(
+            """
+            SELECT child as child_id, identifier
+            FROM implementation_component
+            WHERE parent = :flow_id
+            """,
+        ),
+        parameters={"flow_id": flow_id},
+    )
+
+
+def get_flow_tags(flow_id: int, expdb: Connection) -> list[str]:
+    """Return the tags stored for `flow_id` as a list of strings."""
+    tag_rows = expdb.execute(
+        text(
+            """
+            SELECT tag
+            FROM implementation_tag
+            WHERE id = :flow_id
+            """,
+        ),
+        parameters={"flow_id": flow_id},
+    )
+    return [tag.tag for tag in tag_rows]
+
+
+def get_flow_parameters(flow_id: int, expdb: Connection) -> CursorResult[Any]:
+    """Select the `input` rows of `flow_id`, adding `default_value` and `data_type` aliases."""
+    return expdb.execute(
+        text(
+            """
+            SELECT *, defaultValue as default_value, dataType as data_type
+            FROM input
+            WHERE implementation_id = :flow_id
+            """,
+        ),
+        parameters={"flow_id": flow_id},
+    )
+
+
+def get_flow(flow_id: int, expdb: Connection) -> CursorResult[Any]:
+    """Select the `implementation` row with id `flow_id`, adding an `upload_date` alias."""
+    return expdb.execute(
+        text(
+            """
+            SELECT *, uploadDate as upload_date
+            FROM implementation
+            WHERE id = :flow_id
+            """,
+        ),
+        parameters={"flow_id": flow_id},
+    )
diff --git a/src/main.py b/src/main.py
index cc5e635..074b512 100644
--- a/src/main.py
+++ b/src/main.py
@@ -6,6 +6,7 @@
 from routers.openml.datasets import router as datasets_router
 from routers.openml.estimation_procedure import router as estimationprocedure_router
 from routers.openml.evaluations import router as evaluationmeasures_router
+from routers.openml.flows import router as flows_router
 from routers.openml.qualities import router as qualities_router
 from routers.openml.tasks import router as task_router
 from routers.openml.tasktype import router as ttype_router
@@ -47,6 +48,7 @@ def create_api() -> FastAPI:
     app.include_router(evaluationmeasures_router)
     app.include_router(estimationprocedure_router)
     app.include_router(task_router)
+    app.include_router(flows_router)
 
     return app
 
diff --git a/src/routers/openml/flows.py b/src/routers/openml/flows.py
new file mode 100644
index 0000000..89b6e8b
--- /dev/null
+++ b/src/routers/openml/flows.py
@@ -0,0 +1,65 @@
+import http.client
+from typing import Annotated
+
+from core.conversions import _str_to_num
+from database.flows import get_flow as db_get_flow
+from database.flows import get_flow_parameters, get_flow_subflows, get_flow_tags
+from fastapi import APIRouter, Depends, HTTPException
+from schemas.flows import Flow, Parameter
+from sqlalchemy import Connection
+
+from routers.dependencies import expdb_connection
+
+router = APIRouter(prefix="/flows", tags=["flows"])
+
+
+@router.get("/{flow_id}")
+def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection)] = None) -> Flow:
+    """Return the flow `flow_id` with its parameters, tags and (recursively) its subflows.
+
+    Responds with status 404 ("Flow not found") if no such flow exists.
+    """
+    flow_rows = db_get_flow(flow_id, expdb)
+    if not (flow := next(flow_rows, None)):
+        raise HTTPException(status_code=http.client.NOT_FOUND, detail="Flow not found")
+
+    parameter_rows = get_flow_parameters(flow_id, expdb)
+    parameters = [
+        Parameter(
+            name=parameter.name,
+            # PHP sets the default value to [], not sure where that comes from.
+            # In the modern interface, `None` is used instead for now, but I think it might
+            # make more sense to omit it if there is none.
+            default_value=_str_to_num(parameter.default_value) if parameter.default_value else None,
+            data_type=parameter.data_type,
+            description=parameter.description,
+        )
+        for parameter in parameter_rows
+    ]
+
+    tags = get_flow_tags(flow_id, expdb)
+
+    subflow_rows = get_flow_subflows(flow_id, expdb)
+    subflows = [
+        {
+            "identifier": subflow.identifier,
+            "flow": get_flow(flow_id=subflow.child_id, expdb=expdb),
+        }
+        for subflow in subflow_rows
+    ]
+
+    return Flow(
+        id_=flow.id,
+        uploader=flow.uploader,
+        name=flow.name,
+        class_name=flow.class_name,
+        version=flow.version,
+        external_version=flow.external_version,
+        description=flow.description,
+        upload_date=flow.upload_date,
+        language=flow.language,
+        dependencies=flow.dependencies,
+        parameter=parameters,
+        subflows=subflows,
+        tag=tags,
+    )
diff --git a/src/schemas/flows.py b/src/schemas/flows.py
new file mode 100644
index 0000000..01e3c83
--- /dev/null
+++ b/src/schemas/flows.py
@@ -0,0 +1,32 @@
+from datetime import datetime
+from typing import Any, Self
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+# A single parameter of a flow; `default_value` may be a number, a string or `None`.
+class Parameter(BaseModel):
+    name: str
+    default_value: Any
+    data_type: str
+    description: str
+
+
+# Response model of the flow endpoint; `id_` is serialized as "id".
+class Flow(BaseModel):
+    id_: int = Field(serialization_alias="id")
+    uploader: int | None
+    name: str = Field(max_length=1024)
+    class_name: str | None = Field(max_length=256)
+    version: int
+    external_version: str = Field(max_length=128)
+    description: str | None
+    upload_date: datetime
+    language: str | None = Field(max_length=128)
+    dependencies: str | None
+    parameter: list[Parameter]
+    # NOTE(review): the router passes {"identifier", "flow"} dicts here; confirm this validates.
+    subflows: list[Self]
+    tag: list[str]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
diff --git a/tests/routers/__init__.py b/tests/routers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/routers/openml/__init__.py b/tests/routers/openml/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/routers/openml/flows_test.py b/tests/routers/openml/flows_test.py
new file mode
100644
index 0000000..b957060
--- /dev/null
+++ b/tests/routers/openml/flows_test.py
@@ -0,0 +1,302 @@
+import deepdiff.diff
+from starlette.testclient import TestClient
+
+
+def test_get_flow_no_subflow(py_api: TestClient) -> None:  # flow 1: expected without subflows
+    response = py_api.get("/flows/1")
+    assert response.status_code == 200
+    expected = {
+        "id": 1,
+        "uploader": 16,
+        "name": "weka.ZeroR",
+        "class_name": "weka.classifiers.rules.ZeroR",
+        "version": 1,
+        "external_version": "Weka_3.9.0_12024",
+        "description": "Weka implementation of ZeroR",
+        "upload_date": "2017-03-24T14:26:38",
+        "language": "English",
+        "dependencies": "Weka_3.9.0",
+        "parameter": [
+            {
+                "name": "-do-not-check-capabilities",
+                "data_type": "flag",
+                "default_value": None,
+                "description": "If set, classifier capabilities are not checked before classifier is built\n\t(use with caution).",  # noqa: E501
+            },
+            {
+                "name": "batch-size",
+                "data_type": "option",
+                "default_value": None,
+                "description": "The desired batch size for batch prediction (default 100).",
+            },
+            {
+                "name": "num-decimal-places",
+                "data_type": "option",
+                "default_value": None,
+                "description": "The number of decimal places for the output of numbers in the model (default 2).",  # noqa: E501
+            },
+            {
+                "name": "output-debug-info",
+                "data_type": "flag",
+                "default_value": None,
+                "description": "If set, classifier is run in debug mode and\n\tmay output additional info to the console",  # noqa: E501
+            },
+        ],
+        "subflows": [],
+        "tag": ["OpenmlWeka", "weka"],
+    }
+    difference = deepdiff.diff.DeepDiff(response.json(), expected, ignore_order=True)
+    assert not difference  # an empty diff means the response equals `expected`
+
+
+def test_get_flow_with_subflow(py_api: TestClient) -> None:  # flow 3: expected to embed flow 4
+    response = py_api.get("/flows/3")
+    assert response.status_code == 200
+    expected = {
+        "id": 3,
+        "uploader": 16,
+        "name": "weka.JRip",
+        "class_name": "weka.classifiers.rules.JRip",
+        "version": 1,
+        "external_version": "Weka_3.9.0_10153",
+        "description": (
+            "William W. Cohen: Fast Effective Rule Induction. "
+            "In: Twelfth International Conference on Machine Learning, 115-123, 1995."
+        ),
+        "upload_date": "2017-03-24T14:26:40",
+        "language": "English",
+        "dependencies": "Weka_3.9.0",
+        "parameter": [
+            {
+                "name": "-do-not-check-capabilities",
+                "data_type": "flag",
+                "default_value": None,
+                "description": (
+                    "If set, classifier capabilities are not checked before classifier is built\n\t"
+                    "(use with caution)."
+                ),
+            },
+            {
+                "name": "D",
+                "data_type": "flag",
+                "default_value": None,
+                "description": "Set whether turn on the\n\tdebug mode (Default: false)",
+            },
+            {
+                "name": "E",
+                "data_type": "flag",
+                "default_value": None,
+                "description": (
+                    "Whether NOT check the error rate>=0.5\n\t"
+                    "in stopping criteria \t(default: check)"
+                ),
+            },
+            {
+                "name": "F",
+                "data_type": "option",
+                "default_value": 3,
+                "description": (
+                    "Set number of folds for REP\n\tOne fold is used as pruning set.\n\t(default 3)"
+                ),
+            },
+            {
+                "name": "N",
+                "data_type": "option",
+                "default_value": 2.0,
+                "description": (
+                    "Set the minimal weights of instances\n\twithin a split.\n\t(default 2.0)"
+                ),
+            },
+            {
+                "name": "O",
+                "data_type": "option",
+                "default_value": 2,
+                "description": "Set the number of runs of\n\toptimizations. (Default: 2)",
+            },
+            {
+                "name": "P",
+                "data_type": "flag",
+                "default_value": None,
+                "description": "Whether NOT use pruning\n\t(default: use pruning)",
+            },
+            {
+                "name": "S",
+                "data_type": "option",
+                "default_value": 1,
+                "description": "The seed of randomization\n\t(Default: 1)",
+            },
+            {
+                "name": "batch-size",
+                "data_type": "option",
+                "default_value": None,
+                "description": "The desired batch size for batch prediction (default 100).",
+            },
+            {
+                "name": "num-decimal-places",
+                "data_type": "option",
+                "default_value": None,
+                "description": (
+                    "The number of decimal places for the output of numbers in "
+                    "the model (default 2)."
+                ),
+            },
+            {
+                "name": "output-debug-info",
+                "data_type": "flag",
+                "default_value": None,
+                "description": (
+                    "If set, classifier is run in debug mode and\n\t"
+                    "may output additional info to the console"
+                ),
+            },
+        ],
+        "subflows": [
+            {
+                "identifier": None,
+                "flow": {
+                    "id": 4,
+                    "uploader": 16,
+                    "name": "weka.J48",
+                    "class_name": "weka.classifiers.trees.J48",
+                    "version": 1,
+                    "external_version": "Weka_3.9.0_11194",
+                    "description": (
+                        "Ross Quinlan (1993). C4.5: Programs for Machine Learning. "
+                        "Morgan Kaufmann Publishers, San Mateo, CA."
+                    ),
+                    "upload_date": "2017-03-24T14:26:40",
+                    "language": "English",
+                    "dependencies": "Weka_3.9.0",
+                    "parameter": [
+                        {
+                            "name": "-do-not-check-capabilities",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": (
+                                "If set, classifier capabilities are not checked"
+                                " before classifier is built\n\t(use with caution)."
+                            ),
+                        },
+                        {
+                            "name": "-doNotMakeSplitPointActualValue",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Do not make split point actual value.",
+                        },
+                        {
+                            "name": "A",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Laplace smoothing for predicted probabilities.",
+                        },
+                        {
+                            "name": "B",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Use binary splits only.",
+                        },
+                        {
+                            "name": "C",
+                            "data_type": "option",
+                            "default_value": 0.25,
+                            "description": (
+                                "Set confidence threshold for pruning.\n\t(default 0.25)"
+                            ),
+                        },
+                        {
+                            "name": "J",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": (
+                                "Do not use MDL correction for info" " gain on numeric attributes."
+                            ),
+                        },
+                        {
+                            "name": "L",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Do not clean up after the tree has been built.",
+                        },
+                        {
+                            "name": "M",
+                            "data_type": "option",
+                            "default_value": 2,
+                            "description": (
+                                "Set minimum number of instances per leaf.\n\t(default 2)"
+                            ),
+                        },
+                        {
+                            "name": "N",
+                            "data_type": "option",
+                            "default_value": None,
+                            "description": (
+                                "Set number of folds for reduced error\n\t"
+                                "pruning. One fold is used as pruning set.\n\t(default 3)"
+                            ),
+                        },
+                        {
+                            "name": "O",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Do not collapse tree.",
+                        },
+                        {
+                            "name": "Q",
+                            "data_type": "option",
+                            "default_value": None,
+                            "description": "Seed for random data shuffling (default 1).",
+                        },
+                        {
+                            "name": "R",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Use reduced error pruning.",
+                        },
+                        {
+                            "name": "S",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Do not perform subtree raising.",
+                        },
+                        {
+                            "name": "U",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": "Use unpruned tree.",
+                        },
+                        {
+                            "name": "batch-size",
+                            "data_type": "option",
+                            "default_value": None,
+                            "description": (
+                                "The desired batch size for batch prediction (default 100)."
+                            ),
+                        },
+                        {
+                            "name": "num-decimal-places",
+                            "data_type": "option",
+                            "default_value": None,
+                            "description": (
+                                "The number of decimal places for the output of numbers"
+                                " in the model (default 2)."
+                            ),
+                        },
+                        {
+                            "name": "output-debug-info",
+                            "data_type": "flag",
+                            "default_value": None,
+                            "description": (
+                                "If set, classifier is run in debug mode and\n\t"
+                                "may output additional info to the console"
+                            ),
+                        },
+                    ],
+                    "tag": ["OpenmlWeka", "weka"],
+                    "subflows": [],
+                },
+            },
+        ],
+        "tag": ["OpenmlWeka", "weka"],
+    }
+    difference = deepdiff.diff.DeepDiff(response.json(), expected, ignore_order=True)
+    assert not difference  # an empty diff means the response equals `expected`
diff --git a/tests/routers/openml/migration/__init__.py b/tests/routers/openml/migration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/routers/openml/migration/flows_migration_test.py b/tests/routers/openml/migration/flows_migration_test.py
new file mode 100644
index 0000000..e1cb5e8
--- /dev/null
+++ b/tests/routers/openml/migration/flows_migration_test.py
@@ -0,0 +1,53 @@
+from typing import Any
+
+import deepdiff
+import httpx
+import pytest
+from core.conversions import (
+    nested_remove_single_element_list,
+    nested_str_to_num,
+)
+from starlette.testclient import TestClient
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+    "flow_id",
+    range(1, 16),  # flow ids 1 through 15
+)
+def test_get_flow_equal(flow_id: int, py_api: TestClient, php_api: httpx.Client) -> None:
+    response = py_api.get(f"/flows/{flow_id}")
+    assert response.status_code == 200
+
+    new = response.json()
+
+    # PHP sets parameter default value to [], None is more appropriate, omission is considered
+    # Similar for the default "identifier" of subflows.
+    # Subflow field (old: component) is omitted if empty
+    def convert_flow_naming_and_defaults(flow: dict[str, Any]) -> dict[str, Any]:
+        for parameter in flow["parameter"]:
+            if parameter["default_value"] is None:
+                parameter["default_value"] = []
+        for subflow in flow["subflows"]:
+            subflow["flow"] = convert_flow_naming_and_defaults(subflow["flow"])
+            if subflow["identifier"] is None:
+                subflow["identifier"] = []
+        flow["component"] = flow.pop("subflows")  # rename to the old field name
+        if flow["component"] == []:
+            flow.pop("component")
+        return flow  # the input dictionary itself, modified in place
+
+    new = convert_flow_naming_and_defaults(new)
+    new = nested_remove_single_element_list(new)
+
+    expected = php_api.get(f"/flow/{flow_id}").json()["flow"]
+    # The reason we don't transform "new" to str is that it becomes harder to ignore numeric type
+    # differences (e.g., '1.0' vs '1')
+    expected = nested_str_to_num(expected)
+    difference = deepdiff.diff.DeepDiff(
+        expected,
+        new,
+        ignore_order=True,
+        ignore_numeric_type_changes=True,
+    )
+    assert not difference
diff --git a/tests/routers/openml/migration/tasks_migration_test.py b/tests/routers/openml/migration/tasks_migration_test.py
index 91fe593..e5a02a9 100644
--- a/tests/routers/openml/migration/tasks_migration_test.py
+++ b/tests/routers/openml/migration/tasks_migration_test.py
@@ -1,43 +1,14 @@
-from typing import Any
-
 import deepdiff
 import httpx
 import pytest
+from core.conversions import (
+    nested_int_to_str,
+    nested_remove_nones,
+    nested_remove_single_element_list,
+)
 from starlette.testclient import TestClient
 
 
-def nested_remove_nones(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return {
-            key: nested_remove_nones(val)
-            for key, val in obj.items()
-            if val is not None and nested_remove_nones(val) is not None
-        }
-    if isinstance(obj, list):
-        return [nested_remove_nones(val) for val in obj if nested_remove_nones(val) is not None]
-    return obj
-
-
-def nested_int_to_str(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return {key: nested_int_to_str(val) for key, val in obj.items()}
-    if isinstance(obj, list):
-        return [nested_int_to_str(val) for val in obj]
-    if isinstance(obj, int):
-        return str(obj)
-    return obj
-
-
-def nested_remove_single_element_list(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return {key: nested_remove_single_element_list(val) for key, val in obj.items()}
-    if isinstance(obj, list):
-        if len(obj) == 1:
-            return nested_remove_single_element_list(obj[0])
-        return [nested_remove_single_element_list(val) for val in obj]
-    return obj
-
-
 @pytest.mark.php()
 @pytest.mark.parametrize(
     "task_id",