From 2653757f68b375e66833e68ca5f2fa1bf1c077d9 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 19:10:44 +0300 Subject: [PATCH 01/34] add DatasetModelsFitnessScaler --- .../model_fitness_scalers/__init__.py | 0 .../dataset_models_fitness_scaler.py | 38 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 meta_automl/data_preparation/model_fitness_scalers/__init__.py create mode 100644 meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py diff --git a/meta_automl/data_preparation/model_fitness_scalers/__init__.py b/meta_automl/data_preparation/model_fitness_scalers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py new file mode 100644 index 00000000..45d8ad8b --- /dev/null +++ b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py @@ -0,0 +1,38 @@ +from copy import copy, deepcopy +from typing import Any, Dict, Sequence + +from sklearn.preprocessing import MinMaxScaler + +from meta_automl.data_preparation.dataset import DatasetIDType +from meta_automl.data_preparation.evaluated_model import EvaluatedModel + + +class DatasetModelsFitnessScaler: + def __init__(self, scaler_class=MinMaxScaler): + self.scaler_class = scaler_class + self.scalers: Dict[DatasetIDType, Any] = {} + + def fit(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]): + for dataset_id, dataset_models in zip(dataset_ids, models): + scaler = self.scaler_class() + self.scalers[dataset_id] = scaler + fitness_values_array = [model.fitness.values for model in dataset_models] + scaler.fit(fitness_values_array) + + def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]): + new_models = [[copy(model) for model in dataset_models] for dataset_models in models] + for dataset_id, dataset_models in zip(dataset_ids, new_models): + scaler = self.scalers[dataset_id] + fitness_values_array = [model.fitness.values for model in dataset_models] + fitness_values_array = scaler.transform(fitness_values_array) + for model, fitness_values in zip(dataset_models, fitness_values_array): + fitness = copy(model.fitness) + fitness.values = fitness_values + model.fitness = fitness + return new_models + + def fit_transform(self, + dataset_ids: Sequence[DatasetIDType], + models: Sequence[Sequence[EvaluatedModel]]) -> Sequence[Sequence[EvaluatedModel]]: + self.fit(dataset_ids, models) + return self.transform(dataset_ids, models) From 87ab25d4820654950841c1fb4ba802124fa08440 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 26 Mar 2024 14:39:57 +0300 Subject: [PATCH 02/34] simplify imports --- meta_automl/data_preparation/model_fitness_scalers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/meta_automl/data_preparation/model_fitness_scalers/__init__.py b/meta_automl/data_preparation/model_fitness_scalers/__init__.py index e69de29b..6698276c 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/__init__.py +++ b/meta_automl/data_preparation/model_fitness_scalers/__init__.py @@ -0,0 +1 @@ +from .dataset_models_fitness_scaler import DatasetModelsFitnessScaler \ No newline at end of file From 6d3a30dd7463189462da22acb5f18737f4eba871 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 19:51:01 +0300 Subject: [PATCH 03/34] typing & other fixes --- 
.../model_fitness_scalers/dataset_models_fitness_scaler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py index 45d8ad8b..5cd56dbc 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py +++ b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py @@ -1,7 +1,8 @@ -from copy import copy, deepcopy +from copy import copy from typing import Any, Dict, Sequence from sklearn.preprocessing import MinMaxScaler +from typing_extensions import Self from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.data_preparation.evaluated_model import EvaluatedModel @@ -12,12 +13,13 @@ def __init__(self, scaler_class=MinMaxScaler): self.scaler_class = scaler_class self.scalers: Dict[DatasetIDType, Any] = {} - def fit(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]): + def fit(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]) -> Self: for dataset_id, dataset_models in zip(dataset_ids, models): scaler = self.scaler_class() self.scalers[dataset_id] = scaler fitness_values_array = [model.fitness.values for model in dataset_models] scaler.fit(fitness_values_array) + return self def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]): new_models = [[copy(model) for model in dataset_models] for dataset_models in models] From ad1e1fa6b478b3628250b36ce0666307f5c437a8 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 21:28:43 +0300 Subject: [PATCH 04/34] update DatasetModelsFitnessScaler to support different dataset types --- .../dataset_models_fitness_scaler.py | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py index 5cd56dbc..48400ca4 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py +++ b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py @@ -5,6 +5,7 @@ from typing_extensions import Self from meta_automl.data_preparation.dataset import DatasetIDType +from meta_automl.data_preparation.dataset.dataset_base import DatasetType_co from meta_automl.data_preparation.evaluated_model import EvaluatedModel @@ -13,18 +14,20 @@ def __init__(self, scaler_class=MinMaxScaler): self.scaler_class = scaler_class self.scalers: Dict[DatasetIDType, Any] = {} - def fit(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]) -> Self: - for dataset_id, dataset_models in zip(dataset_ids, models): + def fit(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]) -> Self: + dataset_representations = map(repr, datasets) + for dataset_repr, dataset_models in zip(dataset_representations, models): scaler = self.scaler_class() - self.scalers[dataset_id] = scaler + self.scalers[dataset_repr] = scaler fitness_values_array = [model.fitness.values for model in dataset_models] scaler.fit(fitness_values_array) return self - def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Sequence[EvaluatedModel]]): + def transform(self, models: Sequence[Sequence[EvaluatedModel]], 
datasets: Sequence[DatasetType_co]): new_models = [[copy(model) for model in dataset_models] for dataset_models in models] - for dataset_id, dataset_models in zip(dataset_ids, new_models): - scaler = self.scalers[dataset_id] + dataset_representations = map(repr, datasets) + for dataset_repr, dataset_models in zip(dataset_representations, new_models): + scaler = self.scalers[dataset_repr] fitness_values_array = [model.fitness.values for model in dataset_models] fitness_values_array = scaler.transform(fitness_values_array) for model, fitness_values in zip(dataset_models, fitness_values_array): @@ -34,7 +37,7 @@ def transform(self, dataset_ids: Sequence[DatasetIDType], models: Sequence[Seque return new_models def fit_transform(self, - dataset_ids: Sequence[DatasetIDType], - models: Sequence[Sequence[EvaluatedModel]]) -> Sequence[Sequence[EvaluatedModel]]: - self.fit(dataset_ids, models) - return self.transform(dataset_ids, models) + models: Sequence[Sequence[EvaluatedModel]], + datasets: Sequence[DatasetType_co]) -> Sequence[Sequence[EvaluatedModel]]: + self.fit(models, datasets) + return self.transform(models, datasets) From 176ce1fa6a961b39400d7fdd6fe8a2d356bd5f07 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 14 Dec 2023 12:12:45 +0300 Subject: [PATCH 05/34] fix typing --- .../data_preparation/model_fitness_scalers/__init__.py | 2 +- .../dataset_models_fitness_scaler.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/meta_automl/data_preparation/model_fitness_scalers/__init__.py b/meta_automl/data_preparation/model_fitness_scalers/__init__.py index 6698276c..544991f1 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/__init__.py +++ b/meta_automl/data_preparation/model_fitness_scalers/__init__.py @@ -1 +1 @@ -from .dataset_models_fitness_scaler import DatasetModelsFitnessScaler \ No newline at end of file +from .dataset_models_fitness_scaler import DatasetModelsFitnessScaler, ScalerType \ No newline at end of file diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py index 48400ca4..170ca957 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py +++ b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py @@ -1,18 +1,20 @@ from copy import copy -from typing import Any, Dict, Sequence +from typing import Dict, Sequence, Type +from sklearn.base import OneToOneFeatureMixin, TransformerMixin from sklearn.preprocessing import MinMaxScaler from typing_extensions import Self -from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.data_preparation.dataset.dataset_base import DatasetType_co from meta_automl.data_preparation.evaluated_model import EvaluatedModel +ScalerType = Type[OneToOneFeatureMixin, TransformerMixin] + class DatasetModelsFitnessScaler: - def __init__(self, scaler_class=MinMaxScaler): + def __init__(self, scaler_class: Type[ScalerType] = MinMaxScaler): self.scaler_class = scaler_class - self.scalers: Dict[DatasetIDType, Any] = {} + self.scalers: Dict[str, ScalerType] = {} def fit(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]) -> Self: dataset_representations = map(repr, datasets) From 7aa5db81ff8a631fd4cc32edfcc1cc79a8ad12bd Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 14 Dec 2023 15:36:51 +0300 Subject: [PATCH 06/34] fix typing [2] --- 
.../model_fitness_scalers/dataset_models_fitness_scaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py index 170ca957..23377028 100644 --- a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py +++ b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py @@ -1,5 +1,5 @@ from copy import copy -from typing import Dict, Sequence, Type +from typing import Dict, Sequence, Type, TypeVar from sklearn.base import OneToOneFeatureMixin, TransformerMixin from sklearn.preprocessing import MinMaxScaler @@ -8,7 +8,7 @@ from meta_automl.data_preparation.dataset.dataset_base import DatasetType_co from meta_automl.data_preparation.evaluated_model import EvaluatedModel -ScalerType = Type[OneToOneFeatureMixin, TransformerMixin] +ScalerType = TypeVar('ScalerType', OneToOneFeatureMixin, TransformerMixin) class DatasetModelsFitnessScaler: From b6605ac0d67eef1444f8cf135cebea8f7173e410 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 22 Oct 2023 12:21:38 +0300 Subject: [PATCH 07/34] add MetaLearningApproach --- meta_automl/approaches/__init__.py | 1 + .../approaches/meta_learning_approach.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 meta_automl/approaches/__init__.py create mode 100644 meta_automl/approaches/meta_learning_approach.py diff --git a/meta_automl/approaches/__init__.py b/meta_automl/approaches/__init__.py new file mode 100644 index 00000000..a7c6bef0 --- /dev/null +++ b/meta_automl/approaches/__init__.py @@ -0,0 +1 @@ +from .meta_learning_approach import MetaLearningApproach diff --git a/meta_automl/approaches/meta_learning_approach.py b/meta_automl/approaches/meta_learning_approach.py new file mode 100644 index 00000000..1e6aef0e --- /dev/null +++ b/meta_automl/approaches/meta_learning_approach.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass + + +class MetaLearningApproach(ABC): + @dataclass + class Parameters: + any_param: None + + @dataclass + class Data: + pass + + @dataclass + class Components: + pass + + @abstractmethod + def predict(self, *args, **kwargs): + raise NotImplementedError() From b72719f5052ccebab5db979755054b88475b7095 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 15:49:49 +0300 Subject: [PATCH 08/34] add fedot_history_loader.py --- gamlet/components/models_loaders/fedot_history_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gamlet/components/models_loaders/fedot_history_loader.py b/gamlet/components/models_loaders/fedot_history_loader.py index a7bbfd18..c65fc079 100644 --- a/gamlet/components/models_loaders/fedot_history_loader.py +++ b/gamlet/components/models_loaders/fedot_history_loader.py @@ -28,7 +28,7 @@ def extract_best_models_from_history( best_individuals.insert(0, individual) best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_individuals = best_individuals[:n_best_models_to_load] + best_individuals = best_individuals[:n_best_models_to_load - 1] node_params_repo = DefaultOperationParamsRepository() for individual in best_individuals: From 0d814e0ceb2facf965ef425db960b1a194a8e33b Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 15:53:49 +0300 Subject: [PATCH 09/34] add KNNSimilarityModelAdvice --- 
.../approaches/knn_similarity_model_advice.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 meta_automl/approaches/knn_similarity_model_advice.py diff --git a/meta_automl/approaches/knn_similarity_model_advice.py b/meta_automl/approaches/knn_similarity_model_advice.py new file mode 100644 index 00000000..c4b499d2 --- /dev/null +++ b/meta_automl/approaches/knn_similarity_model_advice.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Callable, List, Optional, Sequence + +from golem.core.optimisers.opt_history_objects.opt_history import OptHistory +from sklearn.preprocessing import MinMaxScaler + +from meta_automl.approaches import MetaLearningApproach +from meta_automl.data_preparation.dataset import DatasetIDType, OpenMLDataset, TabularData +from meta_automl.data_preparation.evaluated_model import EvaluatedModel +from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor +from meta_automl.data_preparation.meta_features_extractors.dataset_meta_features import DatasetMetaFeatures +from meta_automl.data_preparation.model_fitness_scalers import DatasetModelsFitnessScaler, ScalerType +from meta_automl.data_preparation.models_loaders.fedot_history_loader import FedotHistoryLoader +from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor +from meta_automl.meta_algorithm.model_advisors import DiverseModelAdvisor + + +class KNNSimilarityModelAdvice(MetaLearningApproach): + def __init__(self, n_best_dataset_models_to_memorize: int, + mf_extractor_params: dict, assessor_params: dict, advisor_params: dict): + self.parameters = self.Parameters( + n_best_dataset_models_to_memorize=n_best_dataset_models_to_memorize, + mf_extractor_params=mf_extractor_params, + assessor_params=assessor_params, + advisor_params=advisor_params, + ) + self.data = self.Data() + + mf_extractor = PymfeExtractor(extractor_params=mf_extractor_params) + datasets_similarity_assessor = KNeighborsSimilarityAssessor(**assessor_params) + + models_loader = FedotHistoryLoader() + model_advisor = DiverseModelAdvisor(**advisor_params) + + self.components = self.Components( + models_loader=models_loader, + models_fitness_scaler=DatasetModelsFitnessScaler(MinMaxScaler), + mf_extractor=mf_extractor, + mf_scaler=MinMaxScaler(), + datasets_similarity_assessor=datasets_similarity_assessor, + model_advisor=model_advisor, + ) + + @dataclass + class Parameters: + n_best_dataset_models_to_memorize: int + mf_extractor_params: dict = field(default_factory=dict) + assessor_params: dict = field(default_factory=dict) + advisor_params: dict = field(default_factory=dict) + + @dataclass + class Data: + meta_features: DatasetMetaFeatures = None + datasets: List[OpenMLDataset] = None + datasets_data: List[OpenMLDataset] = None + dataset_ids: List[DatasetIDType] = None + best_models: List[List[EvaluatedModel]] = None + + @dataclass + class Components: + models_loader: FedotHistoryLoader + models_fitness_scaler: DatasetModelsFitnessScaler + mf_extractor: PymfeExtractor + mf_scaler: ScalerType + datasets_similarity_assessor: KNeighborsSimilarityAssessor + model_advisor: DiverseModelAdvisor + + def fit(self, + datasets_data: Sequence[TabularData], + histories: Sequence[Sequence[OptHistory]], + evaluate_model_func: Optional[Sequence[Callable]] = None): + data = self.data + params = self.parameters + + data.datasets_data = list(datasets_data) + data.datasets = [d.dataset for d in datasets_data] + 
data.dataset_ids = [d.id for d in datasets_data] + + data.meta_features = self.extract_train_meta_features(data.datasets_data) + self.fit_datasets_similarity_assessor(data.meta_features, data.dataset_ids) + + data.best_models = self.load_models(data.datasets, histories, params.n_best_dataset_models_to_memorize, + evaluate_model_func) + self.fit_model_advisor(data.dataset_ids, data.best_models) + + return self + + def load_models( + self, datasets: Sequence[OpenMLDataset], + histories: Sequence[Sequence[OptHistory]], + n_best_dataset_models_to_load: int, + evaluate_model_func: Optional[Sequence[Callable]] = None) -> Sequence[Sequence[EvaluatedModel]]: + models = self.components.models_loader.load(datasets, histories, n_best_dataset_models_to_load, + evaluate_model_func) + models = self.components.models_fitness_scaler.fit_transform(models, datasets) + return models + + def extract_train_meta_features(self, datasets_data: List[TabularData]) -> DatasetMetaFeatures: + components = self.components + + meta_features = components.mf_extractor.extract( + datasets_data, fill_input_nans=True) + + meta_features.fillna(0, inplace=True) + + meta_features[meta_features.columns] = components.mf_scaler.fit_transform(meta_features) + + return meta_features + + def fit_datasets_similarity_assessor(self, meta_features: DatasetMetaFeatures, dataset_ids: List[DatasetIDType] + ) -> KNeighborsSimilarityAssessor: + return self.components.datasets_similarity_assessor.fit(meta_features, dataset_ids) + + def fit_model_advisor(self, dataset_ids: List[DatasetIDType], best_models: Sequence[Sequence[EvaluatedModel]] + ) -> DiverseModelAdvisor: + return self.components.model_advisor.fit(dataset_ids, best_models) + + def predict(self, datasets_data: Sequence[TabularData]) -> List[List[EvaluatedModel]]: + mf_extractor = self.components.mf_extractor + mf_scaler = self.components.mf_scaler + assessor = self.components.datasets_similarity_assessor + advisor = self.components.model_advisor + + meta_features = mf_extractor.extract(datasets_data, fill_input_nans=True) + + meta_features.fillna(0, inplace=True) + + meta_features[meta_features.columns] = mf_scaler.transform(meta_features) + + similar_dataset_ids = assessor.predict(meta_features) + + return advisor.predict(similar_dataset_ids) From 1cf3a106b58d015c4ef503c3d07bda5f6c38b064 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 18:13:39 +0300 Subject: [PATCH 10/34] minor fixes --- .../approaches/knn_similarity_model_advice.py | 23 ++++++------------- .../approaches/meta_learning_approach.py | 2 +- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/meta_automl/approaches/knn_similarity_model_advice.py b/meta_automl/approaches/knn_similarity_model_advice.py index c4b499d2..6ea5fe37 100644 --- a/meta_automl/approaches/knn_similarity_model_advice.py +++ b/meta_automl/approaches/knn_similarity_model_advice.py @@ -26,22 +26,15 @@ def __init__(self, n_best_dataset_models_to_memorize: int, assessor_params=assessor_params, advisor_params=advisor_params, ) - self.data = self.Data() - - mf_extractor = PymfeExtractor(extractor_params=mf_extractor_params) - datasets_similarity_assessor = KNeighborsSimilarityAssessor(**assessor_params) - - models_loader = FedotHistoryLoader() - model_advisor = DiverseModelAdvisor(**advisor_params) - self.components = self.Components( - models_loader=models_loader, + models_loader=FedotHistoryLoader(), models_fitness_scaler=DatasetModelsFitnessScaler(MinMaxScaler), - mf_extractor=mf_extractor, + 
mf_extractor=PymfeExtractor(**mf_extractor_params), mf_scaler=MinMaxScaler(), - datasets_similarity_assessor=datasets_similarity_assessor, - model_advisor=model_advisor, + datasets_similarity_assessor=KNeighborsSimilarityAssessor(**assessor_params), + model_advisor=DiverseModelAdvisor(**advisor_params), ) + self.data = self.Data() @dataclass class Parameters: @@ -124,11 +117,9 @@ def predict(self, datasets_data: Sequence[TabularData]) -> List[List[EvaluatedMo advisor = self.components.model_advisor meta_features = mf_extractor.extract(datasets_data, fill_input_nans=True) - meta_features.fillna(0, inplace=True) - meta_features[meta_features.columns] = mf_scaler.transform(meta_features) - similar_dataset_ids = assessor.predict(meta_features) + models = advisor.predict(similar_dataset_ids) - return advisor.predict(similar_dataset_ids) + return models diff --git a/meta_automl/approaches/meta_learning_approach.py b/meta_automl/approaches/meta_learning_approach.py index 1e6aef0e..56535768 100644 --- a/meta_automl/approaches/meta_learning_approach.py +++ b/meta_automl/approaches/meta_learning_approach.py @@ -5,7 +5,7 @@ class MetaLearningApproach(ABC): @dataclass class Parameters: - any_param: None + pass @dataclass class Data: From c938452454d1ffb06f8e50f87ce9c005e32bf548 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 16:38:47 +0300 Subject: [PATCH 11/34] create Dockerfile abd .dockerignore --- .dockerignore | 13 +++++++++++++ Dockerfile | 30 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..66731471 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +# Config & info files +.pep8speaks.yml +Dockerfile +LICENSE +README.md + +# Unnecessary files +examples +notebooks +test + +# User data +data/cache diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..7958082a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +# Download base image ubuntu 20.04 +FROM ubuntu:20.04 + +# For apt to be noninteractive +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +# Preseed tzdata, update package index, upgrade packages and install needed software +RUN truncate -s0 /tmp/preseed.cfg; \ + echo "tzdata tzdata/Areas select Europe" >> /tmp/preseed.cfg; \ + echo "tzdata tzdata/Zones/Europe select Berlin" >> /tmp/preseed.cfg; \ + debconf-set-selections /tmp/preseed.cfg && \ + rm -f /etc/timezone /etc/localtime && \ + apt-get update && \ + apt-get install -y nano && \ + apt-get install -y mc && \ + apt-get install -y python3.9 python3-pip && \ + apt-get install -y git && \ + rm -rf /var/lib/apt/lists/* + +# Set the workdir +ENV WORKDIR /home/meta-automl-research +WORKDIR $WORKDIR +COPY . 
$WORKDIR + +RUN pip3 install pip && \ + pip install wheel && \ + pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt + +ENV PYTHONPATH $WORKDIR From 046725988f9973bd083a4a2a577b72f3f96d97f3 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Jul 2023 14:06:26 +0300 Subject: [PATCH 12/34] create the experiment script & config --- experiments/fedot_warm_start/config.yaml | 26 ++ experiments/fedot_warm_start/run.py | 360 +++++++++++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 experiments/fedot_warm_start/config.yaml create mode 100644 experiments/fedot_warm_start/run.py diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml new file mode 100644 index 00000000..bcab1083 --- /dev/null +++ b/experiments/fedot_warm_start/config.yaml @@ -0,0 +1,26 @@ +--- +seed: 42 +#data_settings: +n_datasets: null # null for all available datasets +test_size: 0.25 +train_timeout: 15 +test_timeout: 15 +#meta_learning_params: +n_best_dataset_models_to_memorize: 10 +n_closest_datasets_to_propose: 5 +minimal_distance_between_advised_models: 1 +n_best_models_to_advise: 5 +mf_extractor_params: + groups: general +#evaluation_params: +collect_metrics: + - f1 + - roc_auc + - accuracy + - neg_log_loss + - precision +common_fedot_params: + problem: classification + n_jobs: -1 + show_progress: false +baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py new file mode 100644 index 00000000..956ecf30 --- /dev/null +++ b/experiments/fedot_warm_start/run.py @@ -0,0 +1,360 @@ +import functools +import json +import logging +import timeit +from pathlib import Path + +import yaml + +from datetime import datetime +from itertools import chain +from typing import Dict, List, Tuple, Sequence, Any + +import numpy as np +import openml +import pandas as pd + +from fedot.api.main import Fedot +from fedot.core.data.data import InputData +from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate +from fedot.core.pipelines.adapters import PipelineAdapter +from fedot.core.pipelines.pipeline import Pipeline +from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository +from fedot.core.validation.split import tabular_cv_generator +from golem.core.log import Log +from golem.core.optimisers.fitness import SingleObjFitness +from golem.core.optimisers.opt_history_objects.opt_history import OptHistory +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm + + +from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split +from meta_automl.data_preparation.file_system import get_cache_dir +from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor +from meta_automl.data_preparation.model import Model +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor +from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor + + +CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml') + + +with open(CONFIG_PATH, 'r') as config_file: + config = yaml.load(config_file, yaml.Loader) + +# Load constants +SEED = config['seed'] +N_DATASETS = config['n_datasets'] 
+TEST_SIZE = config['test_size'] +TRAIN_TIMEOUT = config['train_timeout'] +TEST_TIMEOUT = config['test_timeout'] +N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize'] +N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose'] +MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models'] +N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise'] +MF_EXTRACTOR_PARAMS = config['mf_extractor_params'] +COLLECT_METRICS = config['collect_metrics'] +COMMON_FEDOT_PARAMS = config['common_fedot_params'] +BASELINE_MODEL = config['baseline_model'] + +# Postprocess constants +COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) +COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' +COMMON_FEDOT_PARAMS['seed'] = SEED + + +def setup_logging(save_dir: Path): + """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. """ + log_file = save_dir.joinpath('log.txt') + Log(log_file=log_file) + logging.basicConfig( + filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, + ) + + +def get_current_formatted_date() -> (datetime, str, str): + """ Returns current date in the following formats: + + 1. datetime + 2. str: ISO + 3. str: ISO compatible with Windows file system path (with "." instead of ":") """ + time_now = datetime.now() + time_now_iso = time_now.isoformat(timespec="minutes") + time_now_for_path = time_now_iso.replace(":", ".") + return time_now, time_now_iso, time_now_for_path + + +def get_save_dir(time_now_for_path) -> Path: + save_dir = get_cache_dir(). \ + joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') + save_dir.mkdir(parents=True) + return save_dir + + +def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]: + """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" + + dataset_ids = openml.study.get_suite(99).data + if N_DATASETS is not None: + dataset_ids = pd.Series(dataset_ids) + dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) + + df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) + df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1] + df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0] + + datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)} + return df_datasets_train, df_datasets_test, datasets + + +def evaluate_pipeline(pipeline: Pipeline, + input_data: InputData, + metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, + metric_names: Sequence[str] = COLLECT_METRICS) -> Dict[str, float]: + """Gets quality metrics for the fitted pipeline. 
+ The function is based on `Fedot.get_metrics()` + + Returns: + the values of quality metrics + """ + data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold) + + objective = MetricsObjective(metrics) + obj_eval = PipelineObjectiveEvaluate(objective=objective, + data_producer=data_producer, + eval_n_jobs=-1) + + metric_values = obj_eval.evaluate(pipeline).values + metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)} + + return metric_values + + +def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \ + -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor): + dataset_ids = list(best_models_per_dataset_id.keys()) + # Meta Features + extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS) + meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True) + meta_features_train = meta_features_train.fillna(0) + # Datasets similarity + data_similarity_assessor = KNeighborsBasedSimilarityAssessor( + n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE)) + data_similarity_assessor.fit(meta_features_train, dataset_ids) + # Model advisor + model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, + minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) + model_advisor.fit(best_models_per_dataset_id) + return extractor, model_advisor + + +def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): + x = data.x + y = data.y + if len(y.shape) == 1: + y = y.reshape(-1, 1) + return x, y + + +def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None) \ + -> (Fedot, Dict[str, Any]): + """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset. + Returns Fedot instance & properties of the run along with the evaluated metrics. 
""" + x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array')) + + time_start = timeit.default_timer() + fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) + fedot.fit(x, y) + automl_time = timeit.default_timer() - time_start + + metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data) + pipeline = fedot.current_pipeline + run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, + automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, + history_obj=fedot.history, **metrics) + return fedot, run_results + + +def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., + automl_timeout_min=0., **metrics) -> Dict[str, Any]: + run_results = dict(dataset_id=dataset.id_, + dataset_name=dataset.name, + run_label=run_label, + model_obj=pipeline, + model_str=pipeline.descriptive_id, + history_obj=history_obj, + automl_time_sec=automl_time_sec, + automl_timeout_min=automl_timeout_min, + task_type='classification', + **metrics) + return run_results + + +def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]: + if history.individuals: + best_individuals = sorted(chain(*history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset) + best_models.append(model) + else: + pipeline = PipelineAdapter().restore(history.tuning_result) + best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)] + + return best_models + + +def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path): + """ Save the hyperparameters of the experiment """ + params_file_path = save_dir.joinpath('parameters.json') + with open(params_file_path, 'w') as params_file: + json.dump(params_dict, params_file, indent=2) + + +def save_evaluation(evaluation_properties: Dict[str, Any], run_date: datetime, experiment_date: datetime, + save_dir: Path): + histories_dir = save_dir.joinpath('histories') + models_dir = save_dir.joinpath('models') + eval_results_path = save_dir.joinpath('evaluation_results.csv') + + histories_dir.mkdir(exist_ok=True) + models_dir.mkdir(exist_ok=True) + + try: + evaluation_properties['experiment_date'] = experiment_date + evaluation_properties['run_date'] = run_date + dataset_id = evaluation_properties['dataset_id'] + run_label = evaluation_properties['run_label'] + # define saving paths + model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') + history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_history.json') + # replace objects with export paths for csv + evaluation_properties['model_path'] = str(model_path) + evaluation_properties.pop('model_obj').save(model_path) + evaluation_properties['history_path'] = str(history_path) + history_obj = evaluation_properties.pop('history_obj') + if history_obj is not None: + history_obj.save(evaluation_properties['history_path']) + + df_evaluation_properties = pd.DataFrame([evaluation_properties]) + + if eval_results_path.exists(): + df_results = pd.read_csv(eval_results_path) + df_results = pd.concat([df_results, df_evaluation_properties]) + else: + df_results = 
df_evaluation_properties + df_results.to_csv(eval_results_path, index=False) + + except Exception: + logging.exception(f'Saving results "{evaluation_properties}"') + + +def main(): + experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() + save_dir = get_save_dir(experiment_date_for_path) + setup_logging(save_dir) + progress_file_path = save_dir.joinpath('progress.txt') + + df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets() + + dataset_ids = list(datasets_dict.keys()) + dataset_ids_train = df_datasets_train.index.to_list() + dataset_ids_test = df_datasets_test.index.to_list() + + dataset_names_train = df_datasets_train['dataset_name'].to_list() + dataset_names_test = df_datasets_test['dataset_name'].to_list() + + datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items())) + + experiment_params_dict = dict( + experiment_start_date_iso=experiment_date_iso, + input_config=config, + dataset_ids=dataset_ids, + dataset_ids_train=dataset_ids_train, + dataset_names_train=dataset_names_train, + dataset_ids_test=dataset_ids_test, + dataset_names_test=dataset_names_test, + baseline_pipeline=BASELINE_MODEL, + ) + save_experiment_params(experiment_params_dict, save_dir) + + best_models_per_dataset = {} + with open(progress_file_path, 'a') as progress_file: + for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): + try: + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT + run_date = datetime.now() + fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') + save_evaluation(run_results, run_date, experiment_date, save_dir) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + + # Filter out unique individuals with the best fitness + history = fedot.history + best_models = extract_best_models_from_history(dataset, history) + best_models_per_dataset[dataset_id] = best_models + except Exception: + logging.exception(f'Train dataset "{dataset_id}"') + + mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset) + + with open(progress_file_path, 'a') as progress_file: + for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): + try: + # Run meta AutoML + # 1 + time_start = timeit.default_timer() + meta_features = mf_extractor.extract([dataset], + fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + meta_learning_time_sec = timeit.default_timer() - time_start + initial_assumptions = model_advisor.predict(meta_features)[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + run_date = datetime.now() + fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec + save_evaluation(fedot_meta_results, run_date, experiment_date, save_dir) + + # Fit & evaluate simple baseline + baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() + run_date = datetime.now() + baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data) + baseline_res = get_result_data_row(dataset=dataset, 
run_label=f'simple baseline {BASELINE_MODEL}', + pipeline=baseline_pipeline, + **baseline_metrics) + save_evaluation(baseline_res, run_date, experiment_date, save_dir) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + run_date = datetime.now() + assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data) + assumption_res = get_result_data_row(dataset=dataset, + run_label=f'MetaFEDOT - initial assumption {i}', + pipeline=pipeline, **assumption_metrics) + save_evaluation(assumption_res, run_date, experiment_date, save_dir) + except Exception: + logging.exception(f'Test dataset "{dataset_id}"') + + +if __name__ == "__main__": + try: + main() + except Exception as e: + logging.exception('Main level caught an error.') + raise From cac2ba324d86938e8c2f4fb1af22168c1b86be66 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 27 Jul 2023 19:11:18 +0300 Subject: [PATCH 13/34] adapt to #39 --- experiments/fedot_warm_start/run.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 956ecf30..1957053b 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -156,10 +156,8 @@ def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, S def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): - x = data.x - y = data.y - if len(y.shape) == 1: - y = y.reshape(-1, 1) + x = data.x.to_numpy() + y = data.y.to_numpy() return x, y @@ -167,7 +165,7 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as -> (Fedot, Dict[str, Any]): """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset. Returns Fedot instance & properties of the run along with the evaluated metrics. 
""" - x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array')) + x, y = transform_data_for_fedot(dataset.get_data()) time_start = timeit.default_timer() fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) From 1e1b08cb57d52d59a61b6ce5943a88e1ebbc627b Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 28 Jul 2023 12:10:53 +0300 Subject: [PATCH 14/34] add config for debugging --- .../fedot_warm_start/config_debug.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 experiments/fedot_warm_start/config_debug.yaml diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml new file mode 100644 index 00000000..226cbe38 --- /dev/null +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -0,0 +1,26 @@ +--- +seed: 42 +#data_settings: +n_datasets: 3 # null for all available datasets +test_size: 0.33 +train_timeout: 0.01 +test_timeout: 0.01 +#meta_learning_params: +n_best_dataset_models_to_memorize: 10 +n_closest_datasets_to_propose: 5 +minimal_distance_between_advised_models: 1 +n_best_models_to_advise: 5 +mf_extractor_params: + groups: general +#evaluation_params: +collect_metrics: + - f1 + - roc_auc + - accuracy + - neg_log_loss + - precision +common_fedot_params: + problem: classification + n_jobs: -1 + show_progress: false +baseline_model: 'xgboost' From ff6852abfacd7d7e3811534dbc795df634faabce Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 12 Oct 2023 16:58:53 +0300 Subject: [PATCH 15/34] remove data leak --- experiments/fedot_warm_start/run.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 1957053b..8fd37d62 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -267,7 +267,7 @@ def main(): df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets() dataset_ids = list(datasets_dict.keys()) - dataset_ids_train = df_datasets_train.index.to_list() + dataset_ids_test = df_datasets_train.index.to_list() dataset_ids_test = df_datasets_test.index.to_list() dataset_names_train = df_datasets_train['dataset_name'].to_list() @@ -279,7 +279,7 @@ def main(): experiment_start_date_iso=experiment_date_iso, input_config=config, dataset_ids=dataset_ids, - dataset_ids_train=dataset_ids_train, + dataset_ids_train=dataset_ids_test, dataset_names_train=dataset_names_train, dataset_ids_test=dataset_ids_test, dataset_names_test=dataset_names_test, @@ -291,7 +291,7 @@ def main(): with open(progress_file_path, 'a') as progress_file: for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): try: - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT run_date = datetime.now() fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') save_evaluation(run_results, run_date, experiment_date, save_dir) @@ -307,7 +307,8 @@ def main(): except Exception: logging.exception(f'Train dataset "{dataset_id}"') - mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset) + best_models_per_dataset_test = {dataset_id: best_models_per_dataset[dataset_id] for dataset_id in dataset_ids_test} + mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset_test) with open(progress_file_path, 'a') as progress_file: for 
dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): From cf1190a9662b6a14f81031f3a44dbc85008f13b1 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 12 Oct 2023 20:06:22 +0300 Subject: [PATCH 16/34] persist train/test datasets split --- experiments/fedot_warm_start/run.py | 10 ++- .../train_test_datasets_split.csv | 73 +++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 experiments/fedot_warm_start/train_test_datasets_split.csv diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 8fd37d62..f08553fa 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -59,6 +59,7 @@ COLLECT_METRICS = config['collect_metrics'] COMMON_FEDOT_PARAMS = config['common_fedot_params'] BASELINE_MODEL = config['baseline_model'] +UPDATE_TRAIN_TEST_DATASETS_SPLIT = config.get('update_train_test_datasets_split') # Postprocess constants COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) @@ -106,7 +107,14 @@ def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDatase dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) - df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) + split_path = Path(__file__).parent / 'train_test_datasets_split.csv' + + if UPDATE_TRAIN_TEST_DATASETS_SPLIT: + df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) + df_split_datasets.to_csv(split_path) + else: + df_split_datasets = pd.read_csv(split_path, index_col=0) + df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1] df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0] diff --git a/experiments/fedot_warm_start/train_test_datasets_split.csv b/experiments/fedot_warm_start/train_test_datasets_split.csv new file mode 100644 index 00000000..884f8167 --- /dev/null +++ b/experiments/fedot_warm_start/train_test_datasets_split.csv @@ -0,0 +1,73 @@ +dataset_id,dataset_name,category,is_train,NumberOfInstances,NumberOfFeatures,NumberOfClasses +1063,kc2,small_small_binary,1,small,small,binary +40927,CIFAR_10,big_big_big,1,big,big,big +1480,ilpd,small_small_binary,1,small,small,binary +54,vehicle,small_small_small,1,small,small,small +40978,Internet-Advertisements,big_big_binary,1,big,big,binary +1464,blood-transfusion-service-center,small_small_binary,1,small,small,binary +300,isolet,big_big_big,1,big,big,big +18,mfeat-morphological,small_small_big,1,small,small,big +23381,dresses-sales,small_small_binary,1,small,small,binary +46,splice,big_big_small,1,big,big,small +1461,bank-marketing,big_small_binary,1,big,small,binary +40966,MiceProtein,small_big_small,1,small,big,small +40983,wilt,big_small_binary,1,big,small,binary +469,analcatdata_dmft,small_small_small,1,small,small,small +1053,jm1,big_small_binary,1,big,small,binary +40499,texture,big_big_big,1,big,big,big +40701,churn,big_small_binary,1,big,small,binary +12,mfeat-factors,small_big_big,1,small,big,big +1486,nomao,big_big_binary,1,big,big,binary +40982,steel-plates-fault,small_small_small,1,small,small,small +1050,pc3,small_big_binary,1,small,big,binary +307,vowel,small_small_big,1,small,small,big +1475,first-order-theorem-proving,big_big_small,1,big,big,small +1049,pc4,small_big_binary,1,small,big,binary +23517,numerai28.6,big_small_binary,1,big,small,binary +1468,cnae-9,small_big_big,1,small,big,big 
+40984,segment,big_small_small,1,big,small,small +151,electricity,big_small_binary,1,big,small,binary +29,credit-approval,small_small_binary,1,small,small,binary +188,eucalyptus,small_small_small,1,small,small,small +40668,connect-4,big_big_small,1,big,big,small +1478,har,big_big_small,1,big,big,small +22,mfeat-zernike,small_big_big,1,small,big,big +1067,kc1,small_small_binary,1,small,small,binary +1487,ozone-level-8hr,big_big_binary,1,big,big,binary +6332,cylinder-bands,small_big_binary,1,small,big,binary +1497,wall-robot-navigation,big_small_small,1,big,small,small +1590,adult,big_small_binary,1,big,small,binary +16,mfeat-karhunen,small_big_big,1,small,big,big +1068,pc1,small_small_binary,1,small,small,binary +3,kr-vs-kp,big_big_binary,1,big,big,binary +28,optdigits,big_big_big,1,big,big,big +40996,Fashion-MNIST,big_big_big,1,big,big,big +1462,banknote-authentication,small_small_binary,1,small,small,binary +458,analcatdata_authorship,small_big_small,1,small,big,small +6,letter,big_small_big,1,big,small,big +40670,dna,big_big_small,1,big,big,small +1510,wdbc,small_big_binary,1,small,big,binary +40975,car,small_small_small,1,small,small,small +4134,Bioresponse,big_big_binary,1,big,big,binary +37,diabetes,small_small_binary,1,small,small,binary +44,spambase,big_big_binary,1,big,big,binary +15,breast-w,small_small_binary,1,small,small,binary +1501,semeion,small_big_big,1,small,big,big +40994,climate-model-simulation-crashes,small_small_binary,0,small,small,binary +4538,GesturePhaseSegmentationProcessed,big_big_small,0,big,big,small +14,mfeat-fourier,small_big_big,0,small,big,big +1485,madelon,big_big_binary,0,big,big,binary +11,balance-scale,small_small_small,0,small,small,small +23,cmc,small_small_small,0,small,small,small +554,mnist_784,big_big_big,0,big,big,big +4534,PhishingWebsites,big_big_binary,0,big,big,binary +38,sick,big_small_binary,0,big,small,binary +1494,qsar-biodeg,small_big_binary,0,small,big,binary +50,tic-tac-toe,small_small_binary,0,small,small,binary +40979,mfeat-pixel,small_big_big,0,small,big,big +1489,phoneme,big_small_binary,0,big,small,binary +31,credit-g,small_small_binary,0,small,small,binary +32,pendigits,big_small_big,0,big,small,big +41027,jungle_chess_2pcs_raw_endgame_complete,big_small_small,0,big,small,small +182,satimage,big_big_small,0,big,big,small +40923,Devnagari-Script,big_big_big,0,big,big,big From 50379f8b0e92b9d1a63d14a9ce5b0fee73f0b79e Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 12 Oct 2023 20:07:11 +0300 Subject: [PATCH 17/34] add final choices to the best models --- experiments/fedot_warm_start/run.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index f08553fa..bc181902 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -208,11 +208,18 @@ def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) best_individuals = sorted(chain(*history.individuals), key=lambda ind: ind.fitness, reverse=True) + for individual in history.final_choices: + if individual not in best_individuals: + best_individuals.insert(0, individual) + + best_individuals = best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE] + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) best_models = [] - for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + for individual in best_individuals: pipeline = PipelineAdapter().restore(individual.graph) - 
model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset) + fitness = individual.fitness or SingleObjFitness() + model = Model(pipeline, fitness, history.objective.metric_names[0], dataset) best_models.append(model) else: pipeline = PipelineAdapter().restore(history.tuning_result) From 4f3d0d8634abeb4e87f8cef00f582326b6c2ccc2 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 22 Oct 2023 12:21:38 +0300 Subject: [PATCH 18/34] fix pipeline evaluation, compute fitness on test data; fix knn experiment; add data split --- experiments/__init__.py | 0 experiments/fedot_warm_start/__init__.py | 0 experiments/fedot_warm_start/config.yaml | 8 +- .../fedot_warm_start/config_debug.yaml | 8 +- experiments/fedot_warm_start/run.py | 509 ++++++++++-------- 5 files changed, 283 insertions(+), 242 deletions(-) create mode 100644 experiments/__init__.py create mode 100644 experiments/fedot_warm_start/__init__.py diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/fedot_warm_start/__init__.py b/experiments/fedot_warm_start/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml index bcab1083..5023e74b 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/config.yaml @@ -7,11 +7,13 @@ train_timeout: 15 test_timeout: 15 #meta_learning_params: n_best_dataset_models_to_memorize: 10 -n_closest_datasets_to_propose: 5 -minimal_distance_between_advised_models: 1 -n_best_models_to_advise: 5 mf_extractor_params: groups: general +assessor_params: + n_neighbors: 5 +advisor_params: + minimal_distance: 1 + n_best_to_advise: 5 #evaluation_params: collect_metrics: - f1 diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml index 226cbe38..11d0d26a 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -7,11 +7,13 @@ train_timeout: 0.01 test_timeout: 0.01 #meta_learning_params: n_best_dataset_models_to_memorize: 10 -n_closest_datasets_to_propose: 5 -minimal_distance_between_advised_models: 1 -n_best_models_to_advise: 5 mf_extractor_params: groups: general +assessor_params: + n_neighbors: 2 +advisor_params: + minimal_distance: 1 + n_best_to_advise: 5 #evaluation_params: collect_metrics: - f1 diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index bc181902..887b7a82 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,47 +1,39 @@ -import functools +from __future__ import annotations + import json import logging +import os +import pickle +import shutil import timeit -from pathlib import Path - -import yaml - from datetime import datetime -from itertools import chain -from typing import Dict, List, Tuple, Sequence, Any +from functools import partial, wraps +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -import numpy as np import openml import pandas as pd - +import yaml from fedot.api.main import Fedot -from fedot.core.data.data import InputData +from fedot.core.data.data import array_to_input_data from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate -from fedot.core.pipelines.adapters import PipelineAdapter from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import 
PipelineBuilder -from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository -from fedot.core.validation.split import tabular_cv_generator +from fedot.core.repository.quality_metrics_repository import MetricsRepository, QualityMetricsEnum from golem.core.log import Log -from golem.core.optimisers.fitness import SingleObjFitness -from golem.core.optimisers.opt_history_objects.opt_history import OptHistory -from sklearn.model_selection import StratifiedKFold +from golem.core.optimisers.fitness import Fitness +from pecapiku import CacheDict +from sklearn.model_selection import train_test_split from tqdm import tqdm +from typing_extensions import Literal - -from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase -from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.approaches.knn_similarity_model_advice import KNNSimilarityModelAdvice +from meta_automl.data_preparation.dataset import DatasetIDType, OpenMLDataset, TabularData from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split from meta_automl.data_preparation.file_system import get_cache_dir -from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from meta_automl.data_preparation.model import Model -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor -from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor - CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml') - with open(CONFIG_PATH, 'r') as config_file: config = yaml.load(config_file, yaml.Loader) @@ -52,19 +44,23 @@ TRAIN_TIMEOUT = config['train_timeout'] TEST_TIMEOUT = config['test_timeout'] N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize'] -N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose'] -MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models'] -N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise'] +ASSESSOR_PARAMS = config['assessor_params'] +ADVISOR_PARAMS = config['advisor_params'] MF_EXTRACTOR_PARAMS = config['mf_extractor_params'] COLLECT_METRICS = config['collect_metrics'] -COMMON_FEDOT_PARAMS = config['common_fedot_params'] +FEDOT_PARAMS = config['fedot_params'] +DATA_TEST_SIZE = config['data_test_size'] +DATA_SPLIT_SEED = config['data_split_seed'] BASELINE_MODEL = config['baseline_model'] +# Optional values +TMPDIR = config.get('tmpdir') +SAVE_DIR_PREFIX = config.get('save_dir_prefix') + UPDATE_TRAIN_TEST_DATASETS_SPLIT = config.get('update_train_test_datasets_split') # Postprocess constants COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' -COMMON_FEDOT_PARAMS['seed'] = SEED def setup_logging(save_dir: Path): @@ -80,7 +76,7 @@ def setup_logging(save_dir: Path): ) -def get_current_formatted_date() -> (datetime, str, str): +def get_current_formatted_date() -> Tuple[datetime, str, str]: """ Returns current date in the following formats: 1. datetime @@ -95,137 +91,100 @@ def get_current_formatted_date() -> (datetime, str, str): def get_save_dir(time_now_for_path) -> Path: save_dir = get_cache_dir(). 
\ joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') + if save_dir.exists(): + shutil.rmtree(save_dir) save_dir.mkdir(parents=True) - return save_dir + return save_dir -def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]: - """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" +def get_dataset_ids() -> List[DatasetIDType]: dataset_ids = openml.study.get_suite(99).data if N_DATASETS is not None: dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) + return list(dataset_ids) + +def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False) \ + -> Tuple[pd.DataFrame, pd.DataFrame]: split_path = Path(__file__).parent / 'train_test_datasets_split.csv' + if n_datasets is not None: + dataset_ids = pd.Series(dataset_ids) + dataset_ids = dataset_ids.sample(n=n_datasets, random_state=SEED) - if UPDATE_TRAIN_TEST_DATASETS_SPLIT: + if n_datasets is not None or update_train_test_split: df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) - df_split_datasets.to_csv(split_path) else: df_split_datasets = pd.read_csv(split_path, index_col=0) - df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1] - df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0] + datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1].index.to_list() + datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0].index.to_list() + + if update_train_test_split: + df_split_datasets.to_csv(split_path) - datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)} - return df_datasets_train, df_datasets_test, datasets + return datasets_train, datasets_test def evaluate_pipeline(pipeline: Pipeline, - input_data: InputData, + train_data: TabularData, + test_data: TabularData, metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, - metric_names: Sequence[str] = COLLECT_METRICS) -> Dict[str, float]: + metric_names: Sequence[str] = COLLECT_METRICS, + mode: Literal['fitness', 'float'] = 'float' + ) -> Union[Dict[str, float], Tuple[Fitness, Sequence[str]]]: """Gets quality metrics for the fitted pipeline. 
The function is based on `Fedot.get_metrics()` Returns: the values of quality metrics """ - data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold) + train_data = array_to_input_data(train_data.x, train_data.y) + test_data = array_to_input_data(test_data.x, test_data.y) + + def data_producer(): + yield train_data, test_data objective = MetricsObjective(metrics) obj_eval = PipelineObjectiveEvaluate(objective=objective, data_producer=data_producer, eval_n_jobs=-1) - metric_values = obj_eval.evaluate(pipeline).values - metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)} - - return metric_values - - -def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \ - -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor): - dataset_ids = list(best_models_per_dataset_id.keys()) - # Meta Features - extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS) - meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True) - meta_features_train = meta_features_train.fillna(0) - # Datasets similarity - data_similarity_assessor = KNeighborsBasedSimilarityAssessor( - n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE)) - data_similarity_assessor.fit(meta_features_train, dataset_ids) - # Model advisor - model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, - minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) - model_advisor.fit(best_models_per_dataset_id) - return extractor, model_advisor - - -def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): - x = data.x.to_numpy() - y = data.y.to_numpy() - return x, y - - -def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None) \ - -> (Fedot, Dict[str, Any]): - """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset. - Returns Fedot instance & properties of the run along with the evaluated metrics. 
""" - x, y = transform_data_for_fedot(dataset.get_data()) - - time_start = timeit.default_timer() - fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) - fedot.fit(x, y) - automl_time = timeit.default_timer() - time_start - - metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data) - pipeline = fedot.current_pipeline - run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, - automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, - history_obj=fedot.history, **metrics) - return fedot, run_results - - -def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., - automl_timeout_min=0., **metrics) -> Dict[str, Any]: - run_results = dict(dataset_id=dataset.id_, - dataset_name=dataset.name, - run_label=run_label, - model_obj=pipeline, - model_str=pipeline.descriptive_id, - history_obj=history_obj, - automl_time_sec=automl_time_sec, - automl_timeout_min=automl_timeout_min, - task_type='classification', - **metrics) - return run_results - - -def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]: - if history.individuals: - best_individuals = sorted(chain(*history.individuals), - key=lambda ind: ind.fitness, - reverse=True) - for individual in history.final_choices: - if individual not in best_individuals: - best_individuals.insert(0, individual) - - best_individuals = best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE] - - best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_models = [] - for individual in best_individuals: - pipeline = PipelineAdapter().restore(individual.graph) - fitness = individual.fitness or SingleObjFitness() - model = Model(pipeline, fitness, history.objective.metric_names[0], dataset) - best_models.append(model) - else: - pipeline = PipelineAdapter().restore(history.tuning_result) - best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)] + fitness = obj_eval.evaluate(pipeline) + if mode == 'float': + metric_values = fitness.values + metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)} + return metric_values + if mode == 'fitness': + return fitness, metric_names + + +def timed(func, resolution: Literal['sec', 'min'] = 'min'): + @wraps(func) + def wrapper(*args, **kwargs): + time_start = timeit.default_timer() + result = func(*args, **kwargs) + time_delta = timeit.default_timer() - time_start + if resolution == 'min': + time_delta /= 60 + return result, time_delta + + return wrapper - return best_models + +def fit_evaluate_automl(fit_func, evaluate_func) -> (Fedot, Dict[str, Any]): + """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset.. """ + result, fit_time = timed(fit_func)() + metrics = evaluate_func(result) + return result, metrics, fit_time + + +def fit_evaluate_pipeline(pipeline, fit_func, evaluate_func) -> (Fedot, Dict[str, Any]): + """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset.. 
""" + _, fit_time = timed(fit_func)() + metrics = evaluate_func(pipeline) + return pipeline, metrics, fit_time def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path): @@ -235,32 +194,36 @@ def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path): json.dump(params_dict, params_file, indent=2) -def save_evaluation(evaluation_properties: Dict[str, Any], run_date: datetime, experiment_date: datetime, - save_dir: Path): - histories_dir = save_dir.joinpath('histories') - models_dir = save_dir.joinpath('models') - eval_results_path = save_dir.joinpath('evaluation_results.csv') +def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): + run_results: Dict[str, Any] = dict(dataset_id=dataset.id, + dataset_name=dataset.name, + model_obj=pipeline, + model_str=pipeline.descriptive_id, + task_type='classification', + **kwargs) + try: + histories_dir = save_dir.joinpath('histories') + models_dir = save_dir.joinpath('models') + eval_results_path = save_dir.joinpath('evaluation_results.csv') - histories_dir.mkdir(exist_ok=True) - models_dir.mkdir(exist_ok=True) + histories_dir.mkdir(exist_ok=True) + models_dir.mkdir(exist_ok=True) - try: - evaluation_properties['experiment_date'] = experiment_date - evaluation_properties['run_date'] = run_date - dataset_id = evaluation_properties['dataset_id'] - run_label = evaluation_properties['run_label'] + dataset_id = run_results['dataset_id'] + run_label = run_results['run_label'] # define saving paths model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_history.json') # replace objects with export paths for csv - evaluation_properties['model_path'] = str(model_path) - evaluation_properties.pop('model_obj').save(model_path) - evaluation_properties['history_path'] = str(history_path) - history_obj = evaluation_properties.pop('history_obj') - if history_obj is not None: - history_obj.save(evaluation_properties['history_path']) + run_results['model_path'] = str(model_path) + run_results.pop('model_obj').save(model_path) + run_results['history_path'] = str(history_path) + if 'history_obj' in run_results: + history_obj = run_results.pop('history_obj') + if history_obj is not None: + history_obj.save(run_results['history_path']) - df_evaluation_properties = pd.DataFrame([evaluation_properties]) + df_evaluation_properties = pd.DataFrame([run_results]) if eval_results_path.exists(): df_results = pd.read_csv(eval_results_path) @@ -269,106 +232,180 @@ def save_evaluation(evaluation_properties: Dict[str, Any], run_date: datetime, e df_results = df_evaluation_properties df_results.to_csv(eval_results_path, index=False) - except Exception: - logging.exception(f'Saving results "{evaluation_properties}"') + except Exception as e: + logging.exception(f'Saving results "{run_results}"') + if __debug__: + raise e + + +def run_fedot(train_data: TabularData, test_data: TabularData, timeout: float, + run_label: str, experiment_date: datetime, save_dir: Path, fedot_evaluations_cache: CacheDict, + initial_assumption: Optional[Sequence[Pipeline]] = None, meta_learning_time_sec: float = 0.): + fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **FEDOT_PARAMS) + fit_func = partial(fedot.fit, features=train_data.x, target=train_data.y) + evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) + run_date = datetime.now() + cache_key = f'{run_label}_{train_data.id}' + with fedot_evaluations_cache as cache_dict: + cached_run = 
cache_dict[cache_key] + if cached_run: + fedot = cached_run['fedot'] + pipeline = cached_run['pipeline'] + metrics = cached_run['metrics'] + fit_time = cached_run['fit_time'] + else: + pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) + cached_run = dict( + fedot=fedot, + pipeline=pipeline, + metrics=metrics, + fit_time=fit_time, + ) + cache_dict[cache_key] = cached_run + save_evaluation(dataset=train_data.dataset, + run_label=run_label, + pipeline=pipeline, + meta_learning_time_sec=meta_learning_time_sec, + automl_time_min=fit_time, + automl_timeout_min=fedot.params.timeout, + history_obj=fedot.history, + run_data=run_date, + experiment_date=experiment_date, + save_dir=save_dir, + **metrics) + return fedot + + +def run_pipeline(train_data: TabularData, test_data: TabularData, pipeline: Pipeline, + run_label: str, experiment_date: datetime, save_dir: Path): + train_data_for_fedot = array_to_input_data(train_data.x, train_data.y) + fit_func = partial(pipeline.fit, train_data_for_fedot) + evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) + run_date = datetime.now() + pipeline, metrics, fit_time = fit_evaluate_pipeline(pipeline=pipeline, fit_func=fit_func, + evaluate_func=evaluate_func) + save_evaluation(dataset=train_data.dataset, + run_label=run_label, + pipeline=pipeline, + automl_time_min=0, + pipeline_fit_time=fit_time, + automl_timeout_min=0, + meta_learning_time_sec=0, + run_data=run_date, + experiment_date=experiment_date, + save_dir=save_dir, + **metrics) + return pipeline def main(): experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() save_dir = get_save_dir(experiment_date_for_path) setup_logging(save_dir) - progress_file_path = save_dir.joinpath('progress.txt') - - df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets() - - dataset_ids = list(datasets_dict.keys()) - dataset_ids_test = df_datasets_train.index.to_list() - dataset_ids_test = df_datasets_test.index.to_list() - - dataset_names_train = df_datasets_train['dataset_name'].to_list() - dataset_names_test = df_datasets_test['dataset_name'].to_list() - - datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items())) + if TMPDIR: + os.environ.putenv('TMPDIR', TMPDIR) + meta_learner_path = save_dir.joinpath('meta_learner.pkl') + + dataset_ids = get_dataset_ids() + dataset_ids_train, dataset_ids_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT) + + algorithm = KNNSimilarityModelAdvice( + N_BEST_DATASET_MODELS_TO_MEMORIZE, + MF_EXTRACTOR_PARAMS, + ASSESSOR_PARAMS, + ADVISOR_PARAMS + ) experiment_params_dict = dict( - experiment_start_date_iso=experiment_date_iso, - input_config=config, - dataset_ids=dataset_ids, - dataset_ids_train=dataset_ids_test, - dataset_names_train=dataset_names_train, - dataset_ids_test=dataset_ids_test, - dataset_names_test=dataset_names_test, - baseline_pipeline=BASELINE_MODEL, - ) + experiment_start_date_iso=experiment_date_iso, + input_config=config, + dataset_ids=dataset_ids, + dataset_ids_train=dataset_ids_train, + dataset_ids_test=dataset_ids_test, + baseline_pipeline=BASELINE_MODEL, + ) save_experiment_params(experiment_params_dict, save_dir) - - best_models_per_dataset = {} - with open(progress_file_path, 'a') as progress_file: - for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): - try: - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test 
else TEST_TIMEOUT - run_date = datetime.now() - fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') - save_evaluation(run_results, run_date, experiment_date, save_dir) - # TODO: - # x Turn the tuned pipeline into a model (evaluate its fitness on the data) - # x Evaluate historical pipelines on the data instead of using fitness - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - - # Filter out unique individuals with the best fitness - history = fedot.history - best_models = extract_best_models_from_history(dataset, history) - best_models_per_dataset[dataset_id] = best_models - except Exception: - logging.exception(f'Train dataset "{dataset_id}"') - - best_models_per_dataset_test = {dataset_id: best_models_per_dataset[dataset_id] for dataset_id in dataset_ids_test} - mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset_test) - - with open(progress_file_path, 'a') as progress_file: - for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): - try: - # Run meta AutoML - # 1 - time_start = timeit.default_timer() - meta_features = mf_extractor.extract([dataset], - fill_input_nans=True, use_cached=False, update_cached=True) - meta_features = meta_features.fillna(0) - meta_learning_time_sec = timeit.default_timer() - time_start - initial_assumptions = model_advisor.predict(meta_features)[0] - assumption_pipelines = [model.predictor for model in initial_assumptions] - # 2 - run_date = datetime.now() - fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', - initial_assumption=assumption_pipelines) - fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec - save_evaluation(fedot_meta_results, run_date, experiment_date, save_dir) - - # Fit & evaluate simple baseline - baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() - run_date = datetime.now() - baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data) - baseline_res = get_result_data_row(dataset=dataset, run_label=f'simple baseline {BASELINE_MODEL}', - pipeline=baseline_pipeline, - **baseline_metrics) - save_evaluation(baseline_res, run_date, experiment_date, save_dir) - - # Fit & evaluate initial assumptions - for i, assumption in enumerate(initial_assumptions): - pipeline = assumption.predictor - run_date = datetime.now() - assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data) - assumption_res = get_result_data_row(dataset=dataset, - run_label=f'MetaFEDOT - initial assumption {i}', - pipeline=pipeline, **assumption_metrics) - save_evaluation(assumption_res, run_date, experiment_date, save_dir) - except Exception: - logging.exception(f'Test dataset "{dataset_id}"') + # Gathering knowledge base + # fit_fedot_cached = CacheDict.decorate(fit_evaluate_automl, get_cache_dir() / 'fedot_runs.pkl', inner_key='dataset.id') + dataset_splits = {} + for dataset_id in dataset_ids: + dataset = OpenMLDataset(dataset_id) + dataset_data = dataset.get_data() + idx_train, idx_test = train_test_split(range(len(dataset_data.y)), + test_size=DATA_TEST_SIZE, + stratify=dataset_data.y, + shuffle=True, + random_state=DATA_SPLIT_SEED) + train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] + dataset_splits[dataset_id] = dict(train=train_data, test=test_data) + knowledge_base = {} + fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl', access='rew') + description = 
'FEDOT, all datasets' + for dataset_id in (pbar := tqdm(dataset_ids, description)): + pbar.set_description(description + f' ({dataset_id})') + try: + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT + train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] + run_label = 'FEDOT' + fedot = run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + fedot_evaluations_cache) + # TODO: + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + if dataset_id not in dataset_ids_test: + if fedot.history: + knowledge_base[dataset_id] = [fedot.history] + except Exception as e: + logging.exception(f'Train dataset "{dataset_id}"') + if __debug__: + raise e + knowledge_base_data = [OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys()] + knowledge_base_histories = list(knowledge_base.values()) + # Learning + dataset_eval_funcs = [] + for dataset_id in dataset_ids_train: + split = dataset_splits[dataset_id] + train_data, test_data = split['train'], split['test'] + model_eval_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data, mode='fitness') + dataset_eval_funcs.append(model_eval_func) + algorithm.fit(knowledge_base_data, knowledge_base_histories, dataset_eval_funcs) + with open(meta_learner_path, 'wb') as meta_learner_file: + pickle.dump(algorithm, meta_learner_file) + + description = 'MetaFEDOT, Test datasets' + for dataset_id in (pbar := tqdm(dataset_ids_test, description)): + pbar.set_description(description + f' ({dataset_id})') + try: + train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] + # Run meta AutoML + # 1 + initial_assumptions, meta_learning_time_sec = timed(algorithm.predict, resolution='sec')([train_data]) + initial_assumptions = initial_assumptions[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT + run_label = 'MetaFEDOT' + run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + fedot_evaluations_cache, initial_assumption=assumption_pipelines, + meta_learning_time_sec=meta_learning_time_sec) + # Fit & evaluate simple baseline + baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() + run_label = 'simple baseline' + run_pipeline(train_data, test_data, baseline_pipeline, run_label, experiment_date, save_dir) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + run_label = f'MetaFEDOT - initial assumption {i}' + run_pipeline(train_data, test_data, pipeline, run_label, experiment_date, save_dir) + except Exception as e: + logging.exception(f'Test dataset "{dataset_id}"') + if __debug__: + raise e if __name__ == "__main__": try: main() except Exception as e: - logging.exception('Main level caught an error.') - raise + logging.exception('Exception at main().') + raise e From bf8aac6471d3d17677d236036532ad2f5eb43506 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 3 Nov 2023 16:19:52 +0300 Subject: [PATCH 19/34] set TMPDIR from script --- experiments/fedot_warm_start/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml index 5023e74b..4c41e365 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/config.yaml @@ -1,5 +1,6 @@ --- seed: 42 
+tmpdir: '/var/essdata/tmp' #data_settings: n_datasets: null # null for all available datasets test_size: 0.25 From 877be966418e21dde6eefd405decbed526e1a4f2 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 7 Nov 2023 17:31:57 +0300 Subject: [PATCH 20/34] set logging level of FEDOT --- experiments/fedot_warm_start/run.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 887b7a82..a580176a 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -20,7 +20,6 @@ from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import PipelineBuilder from fedot.core.repository.quality_metrics_repository import MetricsRepository, QualityMetricsEnum -from golem.core.log import Log from golem.core.optimisers.fitness import Fitness from pecapiku import CacheDict from sklearn.model_selection import train_test_split @@ -66,13 +65,13 @@ def setup_logging(save_dir: Path): """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. """ log_file = save_dir.joinpath('log.txt') - Log(log_file=log_file) logging.basicConfig( filename=log_file, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', datefmt='%H:%M:%S', force=True, + level=logging.DEBUG, ) @@ -91,6 +90,8 @@ def get_current_formatted_date() -> Tuple[datetime, str, str]: def get_save_dir(time_now_for_path) -> Path: save_dir = get_cache_dir(). \ joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') + if 'debug' in CONFIG_PATH.name: + save_dir = save_dir.with_name('debug_' + save_dir.name) if save_dir.exists(): shutil.rmtree(save_dir) save_dir.mkdir(parents=True) From fa48660ab718b4995fe5fe9a3b70d9b11fbe2143 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 10 Nov 2023 17:20:53 +0300 Subject: [PATCH 21/34] create config_light.yaml --- .../fedot_warm_start/config_light.yaml | 29 +++++++++++++++++++ experiments/fedot_warm_start/run.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 experiments/fedot_warm_start/config_light.yaml diff --git a/experiments/fedot_warm_start/config_light.yaml b/experiments/fedot_warm_start/config_light.yaml new file mode 100644 index 00000000..78617968 --- /dev/null +++ b/experiments/fedot_warm_start/config_light.yaml @@ -0,0 +1,29 @@ +--- +seed: 42 +tmpdir: '/var/essdata/tmp' +#data_settings: +n_datasets: 16 # null for all available datasets +test_size: 0.25 +train_timeout: 15 +test_timeout: 15 +#meta_learning_params: +n_best_dataset_models_to_memorize: 10 +mf_extractor_params: + groups: general +assessor_params: + n_neighbors: 5 +advisor_params: + minimal_distance: 1 + n_best_to_advise: 5 +#evaluation_params: +collect_metrics: + - f1 + - roc_auc + - accuracy + - neg_log_loss + - precision +common_fedot_params: + problem: classification + n_jobs: -1 + show_progress: false +baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index a580176a..f6926330 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -31,7 +31,7 @@ from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split from meta_automl.data_preparation.file_system import get_cache_dir -CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml') +CONFIG_PATH = Path(__file__).parent.joinpath('config_light.yaml') with open(CONFIG_PATH, 'r') as 
config_file: config = yaml.load(config_file, yaml.Loader) From 28506e69636b33f3a760881c62bfc9aef7cf94e7 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Mon, 13 Nov 2023 17:16:20 +0300 Subject: [PATCH 22/34] fix train/test split --- experiments/fedot_warm_start/run.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index f6926330..5fecac02 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -110,20 +110,23 @@ def get_dataset_ids() -> List[DatasetIDType]: def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False) \ -> Tuple[pd.DataFrame, pd.DataFrame]: split_path = Path(__file__).parent / 'train_test_datasets_split.csv' - if n_datasets is not None: - dataset_ids = pd.Series(dataset_ids) - dataset_ids = dataset_ids.sample(n=n_datasets, random_state=SEED) - if n_datasets is not None or update_train_test_split: + if update_train_test_split: df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) + df_split_datasets.to_csv(split_path) else: df_split_datasets = pd.read_csv(split_path, index_col=0) - datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1].index.to_list() - datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0].index.to_list() + df_train = df_split_datasets[df_split_datasets['is_train'] == 1] + df_test = df_split_datasets[df_split_datasets['is_train'] == 0] - if update_train_test_split: - df_split_datasets.to_csv(split_path) + if n_datasets is not None: + frac = n_datasets / len(df_split_datasets) + df_train = df_train.sample(frac=frac, random_state=SEED) + df_test = df_test.sample(frac=frac, random_state=SEED) + + datasets_train = df_train.index.to_list() + datasets_test = df_test.index.to_list() return datasets_train, datasets_test From 071574b1075c135a2cf39292f839a4d09d90ba52 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 15 Nov 2023 14:48:46 +0300 Subject: [PATCH 23/34] add evaluation caching --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 72f9c583..fa7411ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ pytest>=7.4.0 scikit-learn>=1.0.0 scipy>=1.7.3 tqdm>=4.65.0 +pecapiku @ git+https://github.com/MorrisNein/pecapiku From 2b9b863e6f84c6f9d51b35dc616910dad988cf75 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 21 Nov 2023 15:53:53 +0300 Subject: [PATCH 24/34] split config file --- experiments/fedot_warm_start/config.yaml | 12 ------------ experiments/fedot_warm_start/config_debug.yaml | 13 +------------ experiments/fedot_warm_start/config_light.yaml | 12 ------------ experiments/fedot_warm_start/configs_list.yaml | 3 +++ experiments/fedot_warm_start/evaluation_config.yaml | 11 +++++++++++ experiments/fedot_warm_start/fedot_config.yaml | 6 ++++++ 6 files changed, 21 insertions(+), 36 deletions(-) create mode 100644 experiments/fedot_warm_start/configs_list.yaml create mode 100644 experiments/fedot_warm_start/evaluation_config.yaml create mode 100644 experiments/fedot_warm_start/fedot_config.yaml diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml index 4c41e365..fe17c7ad 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/config.yaml @@ -15,15 +15,3 @@ assessor_params: advisor_params: minimal_distance: 1 n_best_to_advise: 5 
-#evaluation_params: -collect_metrics: - - f1 - - roc_auc - - accuracy - - neg_log_loss - - precision -common_fedot_params: - problem: classification - n_jobs: -1 - show_progress: false -baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml index 11d0d26a..339e2826 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -1,5 +1,6 @@ --- seed: 42 +save_dir_prefix: debug_ #data_settings: n_datasets: 3 # null for all available datasets test_size: 0.33 @@ -14,15 +15,3 @@ assessor_params: advisor_params: minimal_distance: 1 n_best_to_advise: 5 -#evaluation_params: -collect_metrics: - - f1 - - roc_auc - - accuracy - - neg_log_loss - - precision -common_fedot_params: - problem: classification - n_jobs: -1 - show_progress: false -baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/config_light.yaml b/experiments/fedot_warm_start/config_light.yaml index 78617968..23e9b409 100644 --- a/experiments/fedot_warm_start/config_light.yaml +++ b/experiments/fedot_warm_start/config_light.yaml @@ -15,15 +15,3 @@ assessor_params: advisor_params: minimal_distance: 1 n_best_to_advise: 5 -#evaluation_params: -collect_metrics: - - f1 - - roc_auc - - accuracy - - neg_log_loss - - precision -common_fedot_params: - problem: classification - n_jobs: -1 - show_progress: false -baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/configs_list.yaml b/experiments/fedot_warm_start/configs_list.yaml new file mode 100644 index 00000000..b3e2b11f --- /dev/null +++ b/experiments/fedot_warm_start/configs_list.yaml @@ -0,0 +1,3 @@ +- config_debug.yaml +- evaluation_config.yaml +- fedot_config.yaml diff --git a/experiments/fedot_warm_start/evaluation_config.yaml b/experiments/fedot_warm_start/evaluation_config.yaml new file mode 100644 index 00000000..f04c6b9f --- /dev/null +++ b/experiments/fedot_warm_start/evaluation_config.yaml @@ -0,0 +1,11 @@ +n_folds: 1 +split_seed: 0 +collect_metrics: + - f1 + - roc_auc + - accuracy + - neg_log_loss + - precision +baseline_model: 'xgboost' +data_test_size: 0.25 +data_split_seed: 0 diff --git a/experiments/fedot_warm_start/fedot_config.yaml b/experiments/fedot_warm_start/fedot_config.yaml new file mode 100644 index 00000000..62f6873e --- /dev/null +++ b/experiments/fedot_warm_start/fedot_config.yaml @@ -0,0 +1,6 @@ +fedot_params: + problem: classification + logging_level: 10 + n_jobs: -1 + show_progress: false + seed: 42 From 8824679e2a071be08d290e1cacacf1edf63a83c8 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 21 Nov 2023 20:26:31 +0300 Subject: [PATCH 25/34] increase debug fedot timeout --- experiments/fedot_warm_start/config_debug.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml index 339e2826..998bb5d8 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -4,8 +4,8 @@ save_dir_prefix: debug_ #data_settings: n_datasets: 3 # null for all available datasets test_size: 0.33 -train_timeout: 0.01 -test_timeout: 0.01 +train_timeout: 1 +test_timeout: 1 #meta_learning_params: n_best_dataset_models_to_memorize: 10 mf_extractor_params: From 82eb33c5e804f282251349ff860b04a28cdcc5f3 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 16 Nov 2023 16:22:53 +0300 Subject: [PATCH 26/34] minor fixes --- 
gamlet/components/meta_features_extractors/pymfe_extractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gamlet/components/meta_features_extractors/pymfe_extractor.py b/gamlet/components/meta_features_extractors/pymfe_extractor.py index f6dc68e7..91702523 100644 --- a/gamlet/components/meta_features_extractors/pymfe_extractor.py +++ b/gamlet/components/meta_features_extractors/pymfe_extractor.py @@ -5,7 +5,6 @@ from functools import partial from typing import Any, Dict, Optional, Sequence, Tuple, Union -import numpy as np import pandas as pd from pymfe.mfe import MFE from tqdm import tqdm From 61641be1060761d44825bc91b9ebc892015c9cd3 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 13 Dec 2023 12:39:58 +0000 Subject: [PATCH 27/34] various experiment improvements & fixes --- .../fedot_warm_start/config_light.yaml | 1 + .../fedot_warm_start/configs_list.yaml | 2 +- .../fedot_warm_start/fedot_config.yaml | 1 - experiments/fedot_warm_start/run.py | 104 ++++++++++-------- requirements.txt | 1 + 5 files changed, 64 insertions(+), 45 deletions(-) diff --git a/experiments/fedot_warm_start/config_light.yaml b/experiments/fedot_warm_start/config_light.yaml index 23e9b409..aa1359b8 100644 --- a/experiments/fedot_warm_start/config_light.yaml +++ b/experiments/fedot_warm_start/config_light.yaml @@ -6,6 +6,7 @@ n_datasets: 16 # null for all available datasets test_size: 0.25 train_timeout: 15 test_timeout: 15 +n_automl_repetitions: 10 #meta_learning_params: n_best_dataset_models_to_memorize: 10 mf_extractor_params: diff --git a/experiments/fedot_warm_start/configs_list.yaml b/experiments/fedot_warm_start/configs_list.yaml index b3e2b11f..175a939e 100644 --- a/experiments/fedot_warm_start/configs_list.yaml +++ b/experiments/fedot_warm_start/configs_list.yaml @@ -1,3 +1,3 @@ -- config_debug.yaml +- config_light.yaml - evaluation_config.yaml - fedot_config.yaml diff --git a/experiments/fedot_warm_start/fedot_config.yaml b/experiments/fedot_warm_start/fedot_config.yaml index 62f6873e..5795163f 100644 --- a/experiments/fedot_warm_start/fedot_config.yaml +++ b/experiments/fedot_warm_start/fedot_config.yaml @@ -3,4 +3,3 @@ fedot_params: logging_level: 10 n_jobs: -1 show_progress: false - seed: 42 diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 5fecac02..2aeed39b 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -10,7 +10,9 @@ from functools import partial, wraps from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from uuid import uuid4 +import loguru import openml import pandas as pd import yaml @@ -51,6 +53,7 @@ DATA_TEST_SIZE = config['data_test_size'] DATA_SPLIT_SEED = config['data_split_seed'] BASELINE_MODEL = config['baseline_model'] +N_AUTOML_REPETITIONS = config['n_automl_repetitions'] # Optional values TMPDIR = config.get('tmpdir') SAVE_DIR_PREFIX = config.get('save_dir_prefix') @@ -64,6 +67,7 @@ def setup_logging(save_dir: Path): """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. 
""" + loguru.logger.add(save_dir / "file_{time}.log") log_file = save_dir.joinpath('log.txt') logging.basicConfig( filename=log_file, @@ -216,11 +220,12 @@ def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): dataset_id = run_results['dataset_id'] run_label = run_results['run_label'] # define saving paths - model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') - history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_history.json') + uid = str(uuid4()) + model_path = models_dir.joinpath(f'{dataset_id}_{run_label}_{uid}') + history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_{uid}_history.json') # replace objects with export paths for csv run_results['model_path'] = str(model_path) - run_results.pop('model_obj').save(model_path) + run_results.pop('model_obj').save(model_path, create_subdir=False) run_results['history_path'] = str(history_path) if 'history_obj' in run_results: history_obj = run_results.pop('history_obj') @@ -249,7 +254,7 @@ def run_fedot(train_data: TabularData, test_data: TabularData, timeout: float, fit_func = partial(fedot.fit, features=train_data.x, target=train_data.y) evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) run_date = datetime.now() - cache_key = f'{run_label}_{train_data.id}' + cache_key = f'{run_label}_{train_data.id}_{timeout}' with fedot_evaluations_cache as cache_dict: cached_run = cache_dict[cache_key] if cached_run: @@ -272,6 +277,7 @@ def run_fedot(train_data: TabularData, test_data: TabularData, timeout: float, meta_learning_time_sec=meta_learning_time_sec, automl_time_min=fit_time, automl_timeout_min=fedot.params.timeout, + generations_count=fedot.history.generations_count, history_obj=fedot.history, run_data=run_date, experiment_date=experiment_date, @@ -302,6 +308,7 @@ def run_pipeline(train_data: TabularData, test_data: TabularData, pipeline: Pipe return pipeline +@loguru.logger.catch def main(): experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() save_dir = get_save_dir(experiment_date_for_path) @@ -342,26 +349,27 @@ def main(): random_state=DATA_SPLIT_SEED) train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] dataset_splits[dataset_id] = dict(train=train_data, test=test_data) - knowledge_base = {} - fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl', access='rew') + knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} + fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl', access='e') description = 'FEDOT, all datasets' for dataset_id in (pbar := tqdm(dataset_ids, description)): + if dataset_id != 40975: continue pbar.set_description(description + f' ({dataset_id})') - try: - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT - train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] - run_label = 'FEDOT' - fedot = run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, - fedot_evaluations_cache) - # TODO: - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - if dataset_id not in dataset_ids_test: - if fedot.history: - knowledge_base[dataset_id] = [fedot.history] - except Exception as e: - logging.exception(f'Train dataset "{dataset_id}"') - if __debug__: - raise e + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT + train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] 
+ run_label = 'FEDOT' + for repetition in range(N_AUTOML_REPETITIONS): + try: + fedot = run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + fedot_evaluations_cache) + # TODO: + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + if dataset_id in dataset_ids_train and fedot.history: + knowledge_base[dataset_id].append(fedot.history) + except Exception as e: + logging.exception(f'Train dataset "{dataset_id}"') + if __debug__: + raise e knowledge_base_data = [OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys()] knowledge_base_histories = list(knowledge_base.values()) # Learning @@ -378,33 +386,43 @@ def main(): description = 'MetaFEDOT, Test datasets' for dataset_id in (pbar := tqdm(dataset_ids_test, description)): pbar.set_description(description + f' ({dataset_id})') + train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] + # Run meta AutoML + # 1 + initial_assumptions, meta_learning_time_sec = timed(algorithm.predict, resolution='sec')([train_data]) + initial_assumptions = initial_assumptions[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT + baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() + run_label = 'MetaFEDOT' + for repetition in range(N_AUTOML_REPETITIONS): + try: + run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + fedot_evaluations_cache, initial_assumption=assumption_pipelines, + meta_learning_time_sec=meta_learning_time_sec) + except Exception as e: + logging.exception(f'Test dataset "{dataset_id}"') + if __debug__: + raise e + # Fit & evaluate simple baseline + run_label = 'simple baseline' try: - train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] - # Run meta AutoML - # 1 - initial_assumptions, meta_learning_time_sec = timed(algorithm.predict, resolution='sec')([train_data]) - initial_assumptions = initial_assumptions[0] - assumption_pipelines = [model.predictor for model in initial_assumptions] - # 2 - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT - run_label = 'MetaFEDOT' - run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, - fedot_evaluations_cache, initial_assumption=assumption_pipelines, - meta_learning_time_sec=meta_learning_time_sec) - # Fit & evaluate simple baseline - baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() - run_label = 'simple baseline' run_pipeline(train_data, test_data, baseline_pipeline, run_label, experiment_date, save_dir) - - # Fit & evaluate initial assumptions - for i, assumption in enumerate(initial_assumptions): - pipeline = assumption.predictor - run_label = f'MetaFEDOT - initial assumption {i}' - run_pipeline(train_data, test_data, pipeline, run_label, experiment_date, save_dir) except Exception as e: - logging.exception(f'Test dataset "{dataset_id}"') + logging.exception(f'Test dataset "{dataset_id}", {run_label}') if __debug__: raise e + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + try: + pipeline = assumption.predictor + run_label = f'MetaFEDOT - initial assumption {i}' + run_pipeline(train_data, test_data, pipeline, run_label, experiment_date, save_dir) + except Exception as e: + logging.exception(f'Test dataset "{dataset_id}", {run_label}') + if __debug__: + raise e if __name__ == 
"__main__": diff --git a/requirements.txt b/requirements.txt index fa7411ed..9a3fc778 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,5 @@ pytest>=7.4.0 scikit-learn>=1.0.0 scipy>=1.7.3 tqdm>=4.65.0 +loguru pecapiku @ git+https://github.com/MorrisNein/pecapiku From 3a09a4d4fc2dccd820e60a9e1eef013a3187e4af Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 14 Dec 2023 13:06:52 +0000 Subject: [PATCH 28/34] add cache for AutoML repetitions --- experiments/fedot_warm_start/run.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 2aeed39b..591e6f00 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -248,13 +248,14 @@ def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): def run_fedot(train_data: TabularData, test_data: TabularData, timeout: float, - run_label: str, experiment_date: datetime, save_dir: Path, fedot_evaluations_cache: CacheDict, + run_label: str, repetition: int, experiment_date: datetime, save_dir: Path, + fedot_evaluations_cache: CacheDict, initial_assumption: Optional[Sequence[Pipeline]] = None, meta_learning_time_sec: float = 0.): fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **FEDOT_PARAMS) fit_func = partial(fedot.fit, features=train_data.x, target=train_data.y) evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) run_date = datetime.now() - cache_key = f'{run_label}_{train_data.id}_{timeout}' + cache_key = f'{run_label}_{train_data.id}_{timeout}_{repetition}' with fedot_evaluations_cache as cache_dict: cached_run = cache_dict[cache_key] if cached_run: @@ -350,17 +351,16 @@ def main(): train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] dataset_splits[dataset_id] = dict(train=train_data, test=test_data) knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} - fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl', access='e') + fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl') description = 'FEDOT, all datasets' for dataset_id in (pbar := tqdm(dataset_ids, description)): - if dataset_id != 40975: continue pbar.set_description(description + f' ({dataset_id})') timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] run_label = 'FEDOT' for repetition in range(N_AUTOML_REPETITIONS): try: - fedot = run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + fedot = run_fedot(train_data, test_data, timeout, run_label, repetition, experiment_date, save_dir, fedot_evaluations_cache) # TODO: # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run @@ -398,7 +398,7 @@ def main(): run_label = 'MetaFEDOT' for repetition in range(N_AUTOML_REPETITIONS): try: - run_fedot(train_data, test_data, timeout, run_label, experiment_date, save_dir, + run_fedot(train_data, test_data, timeout, run_label, repetition, experiment_date, save_dir, fedot_evaluations_cache, initial_assumption=assumption_pipelines, meta_learning_time_sec=meta_learning_time_sec) except Exception as e: From 238483f375a503fdb459ccd1fa2ecdb52fd49471 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 14 Dec 2023 13:11:11 +0000 Subject: [PATCH 29/34] adjust configs to advise 3 initial assumptions; add prefix for config_light.yaml --- 
experiments/fedot_warm_start/config.yaml | 2 +- experiments/fedot_warm_start/config_debug.yaml | 2 +- experiments/fedot_warm_start/config_light.yaml | 3 ++- experiments/fedot_warm_start/evaluation_config.yaml | 3 +-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml index fe17c7ad..aff82cca 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/config.yaml @@ -14,4 +14,4 @@ assessor_params: n_neighbors: 5 advisor_params: minimal_distance: 1 - n_best_to_advise: 5 + n_best_to_advise: 3 diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml index 998bb5d8..c8e23f0a 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -14,4 +14,4 @@ assessor_params: n_neighbors: 2 advisor_params: minimal_distance: 1 - n_best_to_advise: 5 + n_best_to_advise: 3 diff --git a/experiments/fedot_warm_start/config_light.yaml b/experiments/fedot_warm_start/config_light.yaml index aa1359b8..c486f1de 100644 --- a/experiments/fedot_warm_start/config_light.yaml +++ b/experiments/fedot_warm_start/config_light.yaml @@ -1,6 +1,7 @@ --- seed: 42 tmpdir: '/var/essdata/tmp' +save_dir_prefix: light_ #data_settings: n_datasets: 16 # null for all available datasets test_size: 0.25 @@ -15,4 +16,4 @@ assessor_params: n_neighbors: 5 advisor_params: minimal_distance: 1 - n_best_to_advise: 5 + n_best_to_advise: 3 diff --git a/experiments/fedot_warm_start/evaluation_config.yaml b/experiments/fedot_warm_start/evaluation_config.yaml index f04c6b9f..a3143e14 100644 --- a/experiments/fedot_warm_start/evaluation_config.yaml +++ b/experiments/fedot_warm_start/evaluation_config.yaml @@ -1,4 +1,3 @@ -n_folds: 1 split_seed: 0 collect_metrics: - f1 @@ -6,6 +5,6 @@ collect_metrics: - accuracy - neg_log_loss - precision -baseline_model: 'xgboost' +baseline_model: 'catboost' data_test_size: 0.25 data_split_seed: 0 From da6168b7a4e0d3cdc58217e733597975a2692c49 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 14 Dec 2023 18:29:47 +0000 Subject: [PATCH 30/34] fix after rebase --- experiments/fedot_warm_start/run.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 591e6f00..06202e55 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -33,10 +33,19 @@ from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split from meta_automl.data_preparation.file_system import get_cache_dir -CONFIG_PATH = Path(__file__).parent.joinpath('config_light.yaml') +CONFIGS_DIR = Path(__file__).parent -with open(CONFIG_PATH, 'r') as config_file: - config = yaml.load(config_file, yaml.Loader) +with open(CONFIGS_DIR / 'configs_list.yaml', 'r') as config_file: + configs_list = yaml.load(config_file, yaml.Loader) + +config = {} +for conf_name in configs_list: + with open(CONFIGS_DIR / conf_name, 'r') as config_file: + conf = yaml.load(config_file, yaml.Loader) + intersection = set(config).intersection(set(conf)) + if intersection: + raise ValueError(f'Parameter values given twice: {conf_name}, {intersection}.') + config.update(conf) # Load constants SEED = config['seed'] @@ -94,8 +103,8 @@ def get_current_formatted_date() -> Tuple[datetime, str, str]: def get_save_dir(time_now_for_path) -> Path: save_dir = get_cache_dir(). 
\ joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') - if 'debug' in CONFIG_PATH.name: - save_dir = save_dir.with_name('debug_' + save_dir.name) + if SAVE_DIR_PREFIX: + save_dir = save_dir.with_name(SAVE_DIR_PREFIX + save_dir.name) if save_dir.exists(): shutil.rmtree(save_dir) save_dir.mkdir(parents=True) @@ -320,6 +329,7 @@ def main(): dataset_ids = get_dataset_ids() dataset_ids_train, dataset_ids_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT) + dataset_ids = dataset_ids_train + dataset_ids_test algorithm = KNNSimilarityModelAdvice( N_BEST_DATASET_MODELS_TO_MEMORIZE, @@ -350,6 +360,7 @@ def main(): random_state=DATA_SPLIT_SEED) train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] dataset_splits[dataset_id] = dict(train=train_data, test=test_data) + knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl') description = 'FEDOT, all datasets' From 71dac6403f12f5dc364f22616be22aa64f5fb538 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 26 Mar 2024 14:51:58 +0300 Subject: [PATCH 31/34] fix after rebase --- .../models_loaders/fedot_history_loader.py | 2 +- meta_automl/approaches/__init__.py | 1 - .../approaches/knn_similarity_model_advice.py | 125 ------------------ .../approaches/meta_learning_approach.py | 20 --- .../model_fitness_scalers/__init__.py | 1 - .../dataset_models_fitness_scaler.py | 45 ------- 6 files changed, 1 insertion(+), 193 deletions(-) delete mode 100644 meta_automl/approaches/__init__.py delete mode 100644 meta_automl/approaches/knn_similarity_model_advice.py delete mode 100644 meta_automl/approaches/meta_learning_approach.py delete mode 100644 meta_automl/data_preparation/model_fitness_scalers/__init__.py delete mode 100644 meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py diff --git a/gamlet/components/models_loaders/fedot_history_loader.py b/gamlet/components/models_loaders/fedot_history_loader.py index c65fc079..a7bbfd18 100644 --- a/gamlet/components/models_loaders/fedot_history_loader.py +++ b/gamlet/components/models_loaders/fedot_history_loader.py @@ -28,7 +28,7 @@ def extract_best_models_from_history( best_individuals.insert(0, individual) best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_individuals = best_individuals[:n_best_models_to_load - 1] + best_individuals = best_individuals[:n_best_models_to_load] node_params_repo = DefaultOperationParamsRepository() for individual in best_individuals: diff --git a/meta_automl/approaches/__init__.py b/meta_automl/approaches/__init__.py deleted file mode 100644 index a7c6bef0..00000000 --- a/meta_automl/approaches/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .meta_learning_approach import MetaLearningApproach diff --git a/meta_automl/approaches/knn_similarity_model_advice.py b/meta_automl/approaches/knn_similarity_model_advice.py deleted file mode 100644 index 6ea5fe37..00000000 --- a/meta_automl/approaches/knn_similarity_model_advice.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import Callable, List, Optional, Sequence - -from golem.core.optimisers.opt_history_objects.opt_history import OptHistory -from sklearn.preprocessing import MinMaxScaler - -from meta_automl.approaches import MetaLearningApproach -from meta_automl.data_preparation.dataset import DatasetIDType, 
OpenMLDataset, TabularData -from meta_automl.data_preparation.evaluated_model import EvaluatedModel -from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from meta_automl.data_preparation.meta_features_extractors.dataset_meta_features import DatasetMetaFeatures -from meta_automl.data_preparation.model_fitness_scalers import DatasetModelsFitnessScaler, ScalerType -from meta_automl.data_preparation.models_loaders.fedot_history_loader import FedotHistoryLoader -from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor -from meta_automl.meta_algorithm.model_advisors import DiverseModelAdvisor - - -class KNNSimilarityModelAdvice(MetaLearningApproach): - def __init__(self, n_best_dataset_models_to_memorize: int, - mf_extractor_params: dict, assessor_params: dict, advisor_params: dict): - self.parameters = self.Parameters( - n_best_dataset_models_to_memorize=n_best_dataset_models_to_memorize, - mf_extractor_params=mf_extractor_params, - assessor_params=assessor_params, - advisor_params=advisor_params, - ) - self.components = self.Components( - models_loader=FedotHistoryLoader(), - models_fitness_scaler=DatasetModelsFitnessScaler(MinMaxScaler), - mf_extractor=PymfeExtractor(**mf_extractor_params), - mf_scaler=MinMaxScaler(), - datasets_similarity_assessor=KNeighborsSimilarityAssessor(**assessor_params), - model_advisor=DiverseModelAdvisor(**advisor_params), - ) - self.data = self.Data() - - @dataclass - class Parameters: - n_best_dataset_models_to_memorize: int - mf_extractor_params: dict = field(default_factory=dict) - assessor_params: dict = field(default_factory=dict) - advisor_params: dict = field(default_factory=dict) - - @dataclass - class Data: - meta_features: DatasetMetaFeatures = None - datasets: List[OpenMLDataset] = None - datasets_data: List[OpenMLDataset] = None - dataset_ids: List[DatasetIDType] = None - best_models: List[List[EvaluatedModel]] = None - - @dataclass - class Components: - models_loader: FedotHistoryLoader - models_fitness_scaler: DatasetModelsFitnessScaler - mf_extractor: PymfeExtractor - mf_scaler: ScalerType - datasets_similarity_assessor: KNeighborsSimilarityAssessor - model_advisor: DiverseModelAdvisor - - def fit(self, - datasets_data: Sequence[TabularData], - histories: Sequence[Sequence[OptHistory]], - evaluate_model_func: Optional[Sequence[Callable]] = None): - data = self.data - params = self.parameters - - data.datasets_data = list(datasets_data) - data.datasets = [d.dataset for d in datasets_data] - data.dataset_ids = [d.id for d in datasets_data] - - data.meta_features = self.extract_train_meta_features(data.datasets_data) - self.fit_datasets_similarity_assessor(data.meta_features, data.dataset_ids) - - data.best_models = self.load_models(data.datasets, histories, params.n_best_dataset_models_to_memorize, - evaluate_model_func) - self.fit_model_advisor(data.dataset_ids, data.best_models) - - return self - - def load_models( - self, datasets: Sequence[OpenMLDataset], - histories: Sequence[Sequence[OptHistory]], - n_best_dataset_models_to_load: int, - evaluate_model_func: Optional[Sequence[Callable]] = None) -> Sequence[Sequence[EvaluatedModel]]: - models = self.components.models_loader.load(datasets, histories, n_best_dataset_models_to_load, - evaluate_model_func) - models = self.components.models_fitness_scaler.fit_transform(models, datasets) - return models - - def extract_train_meta_features(self, datasets_data: List[TabularData]) -> DatasetMetaFeatures: - components = self.components - 
- meta_features = components.mf_extractor.extract( - datasets_data, fill_input_nans=True) - - meta_features.fillna(0, inplace=True) - - meta_features[meta_features.columns] = components.mf_scaler.fit_transform(meta_features) - - return meta_features - - def fit_datasets_similarity_assessor(self, meta_features: DatasetMetaFeatures, dataset_ids: List[DatasetIDType] - ) -> KNeighborsSimilarityAssessor: - return self.components.datasets_similarity_assessor.fit(meta_features, dataset_ids) - - def fit_model_advisor(self, dataset_ids: List[DatasetIDType], best_models: Sequence[Sequence[EvaluatedModel]] - ) -> DiverseModelAdvisor: - return self.components.model_advisor.fit(dataset_ids, best_models) - - def predict(self, datasets_data: Sequence[TabularData]) -> List[List[EvaluatedModel]]: - mf_extractor = self.components.mf_extractor - mf_scaler = self.components.mf_scaler - assessor = self.components.datasets_similarity_assessor - advisor = self.components.model_advisor - - meta_features = mf_extractor.extract(datasets_data, fill_input_nans=True) - meta_features.fillna(0, inplace=True) - meta_features[meta_features.columns] = mf_scaler.transform(meta_features) - similar_dataset_ids = assessor.predict(meta_features) - models = advisor.predict(similar_dataset_ids) - - return models diff --git a/meta_automl/approaches/meta_learning_approach.py b/meta_automl/approaches/meta_learning_approach.py deleted file mode 100644 index 56535768..00000000 --- a/meta_automl/approaches/meta_learning_approach.py +++ /dev/null @@ -1,20 +0,0 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass - - -class MetaLearningApproach(ABC): - @dataclass - class Parameters: - pass - - @dataclass - class Data: - pass - - @dataclass - class Components: - pass - - @abstractmethod - def predict(self, *args, **kwargs): - raise NotImplementedError() diff --git a/meta_automl/data_preparation/model_fitness_scalers/__init__.py b/meta_automl/data_preparation/model_fitness_scalers/__init__.py deleted file mode 100644 index 544991f1..00000000 --- a/meta_automl/data_preparation/model_fitness_scalers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .dataset_models_fitness_scaler import DatasetModelsFitnessScaler, ScalerType \ No newline at end of file diff --git a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py b/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py deleted file mode 100644 index 23377028..00000000 --- a/meta_automl/data_preparation/model_fitness_scalers/dataset_models_fitness_scaler.py +++ /dev/null @@ -1,45 +0,0 @@ -from copy import copy -from typing import Dict, Sequence, Type, TypeVar - -from sklearn.base import OneToOneFeatureMixin, TransformerMixin -from sklearn.preprocessing import MinMaxScaler -from typing_extensions import Self - -from meta_automl.data_preparation.dataset.dataset_base import DatasetType_co -from meta_automl.data_preparation.evaluated_model import EvaluatedModel - -ScalerType = TypeVar('ScalerType', OneToOneFeatureMixin, TransformerMixin) - - -class DatasetModelsFitnessScaler: - def __init__(self, scaler_class: Type[ScalerType] = MinMaxScaler): - self.scaler_class = scaler_class - self.scalers: Dict[str, ScalerType] = {} - - def fit(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]) -> Self: - dataset_representations = map(repr, datasets) - for dataset_repr, dataset_models in zip(dataset_representations, models): - scaler = self.scaler_class() - self.scalers[dataset_repr] = scaler - 
fitness_values_array = [model.fitness.values for model in dataset_models] - scaler.fit(fitness_values_array) - return self - - def transform(self, models: Sequence[Sequence[EvaluatedModel]], datasets: Sequence[DatasetType_co]): - new_models = [[copy(model) for model in dataset_models] for dataset_models in models] - dataset_representations = map(repr, datasets) - for dataset_repr, dataset_models in zip(dataset_representations, new_models): - scaler = self.scalers[dataset_repr] - fitness_values_array = [model.fitness.values for model in dataset_models] - fitness_values_array = scaler.transform(fitness_values_array) - for model, fitness_values in zip(dataset_models, fitness_values_array): - fitness = copy(model.fitness) - fitness.values = fitness_values - model.fitness = fitness - return new_models - - def fit_transform(self, - models: Sequence[Sequence[EvaluatedModel]], - datasets: Sequence[DatasetType_co]) -> Sequence[Sequence[EvaluatedModel]]: - self.fit(models, datasets) - return self.transform(models, datasets) From 176c71d7e981263b871fae56883d5a39c758c2d1 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Mon, 1 Apr 2024 12:48:26 +0000 Subject: [PATCH 32/34] some experiment fixes --- experiments/fedot_warm_start/config.yaml | 1 + .../fedot_warm_start/config_debug.yaml | 1 + .../fedot_warm_start/configs_list.yaml | 2 +- .../fedot_warm_start/fedot_config.yaml | 4 +- experiments/fedot_warm_start/run.py | 275 +++++++++++------- experiments/fedot_warm_start/run_v2.py | 0 6 files changed, 173 insertions(+), 110 deletions(-) create mode 100644 experiments/fedot_warm_start/run_v2.py diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml index aff82cca..5effe3f9 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/config.yaml @@ -6,6 +6,7 @@ n_datasets: null # null for all available datasets test_size: 0.25 train_timeout: 15 test_timeout: 15 +n_automl_repetitions: 10 #meta_learning_params: n_best_dataset_models_to_memorize: 10 mf_extractor_params: diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml index c8e23f0a..45cfbf20 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/config_debug.yaml @@ -6,6 +6,7 @@ n_datasets: 3 # null for all available datasets test_size: 0.33 train_timeout: 1 test_timeout: 1 +n_automl_repetitions: 1 #meta_learning_params: n_best_dataset_models_to_memorize: 10 mf_extractor_params: diff --git a/experiments/fedot_warm_start/configs_list.yaml b/experiments/fedot_warm_start/configs_list.yaml index 175a939e..b3e2b11f 100644 --- a/experiments/fedot_warm_start/configs_list.yaml +++ b/experiments/fedot_warm_start/configs_list.yaml @@ -1,3 +1,3 @@ -- config_light.yaml +- config_debug.yaml - evaluation_config.yaml - fedot_config.yaml diff --git a/experiments/fedot_warm_start/fedot_config.yaml b/experiments/fedot_warm_start/fedot_config.yaml index 5795163f..951024cc 100644 --- a/experiments/fedot_warm_start/fedot_config.yaml +++ b/experiments/fedot_warm_start/fedot_config.yaml @@ -1,5 +1,7 @@ fedot_params: problem: classification logging_level: 10 - n_jobs: -1 + n_jobs: -2 show_progress: false + cache_dir: '/var/essdata/tmp/fedot_cache' + use_auto_preprocessing: true diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 06202e55..2d3280bd 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -7,7 +7,7 @@ import shutil import 
timeit from datetime import datetime -from functools import partial, wraps +from functools import partial, wraps, reduce from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from uuid import uuid4 @@ -21,17 +21,18 @@ from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import PipelineBuilder -from fedot.core.repository.quality_metrics_repository import MetricsRepository, QualityMetricsEnum +from fedot.core.repository.metrics_repository import MetricsRepository, QualityMetricsEnum from golem.core.optimisers.fitness import Fitness +from golem.core.optimisers.opt_history_objects.opt_history import OptHistory from pecapiku import CacheDict from sklearn.model_selection import train_test_split from tqdm import tqdm from typing_extensions import Literal -from meta_automl.approaches.knn_similarity_model_advice import KNNSimilarityModelAdvice -from meta_automl.data_preparation.dataset import DatasetIDType, OpenMLDataset, TabularData -from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split -from meta_automl.data_preparation.file_system import get_cache_dir +from gamlet.approaches.knn_similarity_model_advice import KNNSimilarityModelAdvice +from gamlet.data_preparation.dataset import DatasetIDType, OpenMLDataset, TabularData +from gamlet.data_preparation.datasets_train_test_split import openml_datasets_train_test_split +from gamlet.data_preparation.file_system import get_cache_dir CONFIGS_DIR = Path(__file__).parent @@ -70,7 +71,7 @@ UPDATE_TRAIN_TEST_DATASETS_SPLIT = config.get('update_train_test_datasets_split') # Postprocess constants -COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) +COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.get_metric, COLLECT_METRICS)) COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' @@ -256,44 +257,47 @@ def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): raise e -def run_fedot(train_data: TabularData, test_data: TabularData, timeout: float, - run_label: str, repetition: int, experiment_date: datetime, save_dir: Path, - fedot_evaluations_cache: CacheDict, - initial_assumption: Optional[Sequence[Pipeline]] = None, meta_learning_time_sec: float = 0.): +def run_fedot_attempt(train_data: TabularData, test_data: TabularData, timeout: float, + run_label: str, repetition: int, experiment_date: datetime, save_dir: Path, + fedot_evaluations_cache: CacheDict, + initial_assumption: Optional[Sequence[Pipeline]] = None, meta_learning_time_sec: float = 0.): fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **FEDOT_PARAMS) fit_func = partial(fedot.fit, features=train_data.x, target=train_data.y) evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) run_date = datetime.now() - cache_key = f'{run_label}_{train_data.id}_{timeout}_{repetition}' - with fedot_evaluations_cache as cache_dict: - cached_run = cache_dict[cache_key] - if cached_run: - fedot = cached_run['fedot'] - pipeline = cached_run['pipeline'] - metrics = cached_run['metrics'] - fit_time = cached_run['fit_time'] - else: - pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) - cached_run = dict( - fedot=fedot, - pipeline=pipeline, - metrics=metrics, - fit_time=fit_time, - ) - cache_dict[cache_key] = cached_run - 
save_evaluation(dataset=train_data.dataset, - run_label=run_label, - pipeline=pipeline, - meta_learning_time_sec=meta_learning_time_sec, - automl_time_min=fit_time, - automl_timeout_min=fedot.params.timeout, - generations_count=fedot.history.generations_count, - history_obj=fedot.history, - run_data=run_date, - experiment_date=experiment_date, - save_dir=save_dir, - **metrics) - return fedot + # cache_key = f'{run_label}_{train_data.id}_{timeout}_{repetition}' + # with fedot_evaluations_cache as cache_dict: + # cached_run = cache_dict[cache_key] + # if cached_run: + # fedot = cached_run['fedot'] + # pipeline = cached_run['pipeline'] + # metrics = cached_run['metrics'] + # fit_time = cached_run['fit_time'] + # else: + # pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) + # cached_run = dict( + # fedot=fedot, + # pipeline=pipeline, + # metrics=metrics, + # fit_time=fit_time, + # ) + # cache_dict[cache_key] = cached_run + pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) + eval_result = dict( + dataset=train_data.dataset, + run_label=run_label, + pipeline=pipeline, + meta_learning_time_sec=meta_learning_time_sec, + automl_time_min=fit_time, + automl_timeout_min=fedot.params.timeout, + generations_count=fedot.history.generations_count, + history_obj=fedot.history, + run_data=run_date, + experiment_date=experiment_date, + save_dir=save_dir, + **metrics + ) + return eval_result def run_pipeline(train_data: TabularData, test_data: TabularData, pipeline: Pipeline, @@ -320,16 +324,11 @@ def run_pipeline(train_data: TabularData, test_data: TabularData, pipeline: Pipe @loguru.logger.catch def main(): - experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() - save_dir = get_save_dir(experiment_date_for_path) - setup_logging(save_dir) - if TMPDIR: - os.environ.putenv('TMPDIR', TMPDIR) - meta_learner_path = save_dir.joinpath('meta_learner.pkl') + dataset_ids_test, dataset_ids_train, experiment_date, meta_learner_path, save_dir = setup_experiment() - dataset_ids = get_dataset_ids() - dataset_ids_train, dataset_ids_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT) - dataset_ids = dataset_ids_train + dataset_ids_test + # fit_fedot_cached = CacheDict.decorate(fit_evaluate_automl, get_cache_dir() / 'fedot_runs.pkl', inner_key='dataset.id') + dataset_splits = get_datasets_data_splits(dataset_ids_test + dataset_ids_train) + datasets_eval_funcs = get_datasets_eval_funcs(dataset_ids_train, dataset_splits) algorithm = KNNSimilarityModelAdvice( N_BEST_DATASET_MODELS_TO_MEMORIZE, @@ -338,62 +337,40 @@ def main(): ADVISOR_PARAMS ) - experiment_params_dict = dict( - experiment_start_date_iso=experiment_date_iso, - input_config=config, - dataset_ids=dataset_ids, - dataset_ids_train=dataset_ids_train, - dataset_ids_test=dataset_ids_test, - baseline_pipeline=BASELINE_MODEL, - ) - save_experiment_params(experiment_params_dict, save_dir) - # Gathering knowledge base - # fit_fedot_cached = CacheDict.decorate(fit_evaluate_automl, get_cache_dir() / 'fedot_runs.pkl', inner_key='dataset.id') - dataset_splits = {} - for dataset_id in dataset_ids: - dataset = OpenMLDataset(dataset_id) - dataset_data = dataset.get_data() - idx_train, idx_test = train_test_split(range(len(dataset_data.y)), - test_size=DATA_TEST_SIZE, - stratify=dataset_data.y, - shuffle=True, - random_state=DATA_SPLIT_SEED) - train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] - 
dataset_splits[dataset_id] = dict(train=train_data, test=test_data) - + # Experiment start knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl') - description = 'FEDOT, all datasets' - for dataset_id in (pbar := tqdm(dataset_ids, description)): + description = 'FEDOT, train datasets' + for dataset_id in (pbar := tqdm(dataset_ids_train, description)): pbar.set_description(description + f' ({dataset_id})') - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] run_label = 'FEDOT' - for repetition in range(N_AUTOML_REPETITIONS): - try: - fedot = run_fedot(train_data, test_data, timeout, run_label, repetition, experiment_date, save_dir, + evaluate_fedot_on_dataset(train_data, test_data, TRAIN_TIMEOUT, run_label, experiment_date, save_dir, fedot_evaluations_cache) - # TODO: - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - if dataset_id in dataset_ids_train and fedot.history: - knowledge_base[dataset_id].append(fedot.history) - except Exception as e: - logging.exception(f'Train dataset "{dataset_id}"') - if __debug__: - raise e - knowledge_base_data = [OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys()] - knowledge_base_histories = list(knowledge_base.values()) - # Learning - dataset_eval_funcs = [] - for dataset_id in dataset_ids_train: - split = dataset_splits[dataset_id] - train_data, test_data = split['train'], split['test'] - model_eval_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data, mode='fitness') - dataset_eval_funcs.append(model_eval_func) - algorithm.fit(knowledge_base_data, knowledge_base_histories, dataset_eval_funcs) + # knowledge_base[dataset_id] = gain_knowledge_base_for_dataset(dataset_id, experiment_date, + # fedot_evaluations_cache, + # run_label, save_dir, + # test_data, TRAIN_TIMEOUT, train_data) + # knowledge_base[dataset_id] = [fedot.history for fedot in fedots] + + description = 'FEDOT, test datasets' + for dataset_id in (pbar := tqdm(dataset_ids_test, description)): + pbar.set_description(description + f' ({dataset_id})') + train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] + run_label = 'FEDOT' + evaluate_fedot_on_dataset(train_data, test_data, TEST_TIMEOUT, run_label, experiment_date, save_dir, + fedot_evaluations_cache) + + ############################### + kb_datasets_data = [OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys()] + kb_histories = list(knowledge_base.values()) + ############################### + + # Meta-Learning + algorithm.fit(kb_datasets_data, kb_histories, datasets_eval_funcs) with open(meta_learner_path, 'wb') as meta_learner_file: pickle.dump(algorithm, meta_learner_file) - + # Application description = 'MetaFEDOT, Test datasets' for dataset_id in (pbar := tqdm(dataset_ids_test, description)): pbar.set_description(description + f' ({dataset_id})') @@ -404,18 +381,10 @@ def main(): initial_assumptions = initial_assumptions[0] assumption_pipelines = [model.predictor for model in initial_assumptions] # 2 - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() run_label = 'MetaFEDOT' - for repetition in range(N_AUTOML_REPETITIONS): - try: - run_fedot(train_data, test_data, timeout, run_label, repetition, 
experiment_date, save_dir, - fedot_evaluations_cache, initial_assumption=assumption_pipelines, - meta_learning_time_sec=meta_learning_time_sec) - except Exception as e: - logging.exception(f'Test dataset "{dataset_id}"') - if __debug__: - raise e + evaluate_fedot_on_dataset(train_data, test_data, TEST_TIMEOUT, run_label, experiment_date, save_dir, + fedot_evaluations_cache, assumption_pipelines, meta_learning_time_sec) # Fit & evaluate simple baseline run_label = 'simple baseline' try: @@ -436,6 +405,96 @@ def main(): raise e +def get_datasets_eval_funcs(dataset_ids_train, dataset_splits): + dataset_eval_funcs = [] + for dataset_id in dataset_ids_train: + split = dataset_splits[dataset_id] + train_data, test_data = split['train'], split['test'] + model_eval_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data, mode='fitness') + dataset_eval_funcs.append(model_eval_func) + return dataset_eval_funcs + + +def get_datasets_data_splits(dataset_ids): + dataset_splits = {} + for dataset_id in dataset_ids: + dataset = OpenMLDataset(dataset_id) + dataset_data = dataset.get_data() + idx_train, idx_test = train_test_split(range(len(dataset_data.y)), + test_size=DATA_TEST_SIZE, + stratify=dataset_data.y, + shuffle=True, + random_state=DATA_SPLIT_SEED) + train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] + dataset_splits[dataset_id] = dict(train=train_data, test=test_data) + return dataset_splits + + +def setup_experiment(): + # Preparation + experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() + save_dir = get_save_dir(experiment_date_for_path) + setup_logging(save_dir) + if TMPDIR: + os.environ.putenv('TMPDIR', TMPDIR) + meta_learner_path = save_dir.joinpath('meta_learner.pkl') + dataset_ids = get_dataset_ids() + dataset_ids_train, dataset_ids_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT) + dataset_ids = dataset_ids_train + dataset_ids_test + experiment_params_dict = dict( + experiment_start_date_iso=experiment_date_iso, + input_config=config, + dataset_ids=dataset_ids, + dataset_ids_train=dataset_ids_train, + dataset_ids_test=dataset_ids_test, + baseline_pipeline=BASELINE_MODEL, + ) + save_experiment_params(experiment_params_dict, save_dir) + return dataset_ids_test, dataset_ids_train, experiment_date, meta_learner_path, save_dir + + +def evaluate_fedot_on_dataset(train_data: TabularData, test_data: TabularData, timeout: float, + run_label: str, experiment_date: datetime, save_dir: Path, + fedot_evaluations_cache: CacheDict, + initial_assumption: Optional[Sequence[Pipeline]] = None, + meta_learning_time_sec: float = 0.): + dataset = train_data.dataset + eval_results = [] + for repetition in range(N_AUTOML_REPETITIONS): + try: + eval_result, time_delta = timed( + run_fedot_attempt(train_data, test_data, timeout, run_label, repetition, experiment_date, save_dir, + fedot_evaluations_cache)) + # TODO: + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + + # TODO: Условие на прерывание + eval_results.append(eval_result) + except Exception as e: + logging.exception(f'Dataset "{dataset.id}"') + if __debug__: + raise e + + for eval_result in eval_results: + save_evaluation(**eval_result) + + return eval_results + + +def gain_knowledge_base_for_dataset(train_data: TabularData, test_data: TabularData, timeout: float, + run_label: str, experiment_date: datetime, save_dir: Path, + fedot_evaluations_cache: CacheDict, + initial_assumption: 
Optional[Sequence[Pipeline]] = None, + meta_learning_time_sec: float = 0.): + eval_results = evaluate_fedot_on_dataset(train_data, test_data, timeout, + run_label, experiment_date, save_dir, + fedot_evaluations_cache, + initial_assumption, + meta_learning_time_sec) + histories = reduce([OptHistory.load, ], [res['history_path'] for res in eval_results]) + return histories + + if __name__ == "__main__": try: main() diff --git a/experiments/fedot_warm_start/run_v2.py b/experiments/fedot_warm_start/run_v2.py new file mode 100644 index 00000000..e69de29b From 26c0a520810bcf9f8a9affff5e1e559978d05f58 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Mon, 20 May 2024 15:47:44 +0000 Subject: [PATCH 33/34] experiment stability update --- .../{ => configs}/config.yaml | 2 + .../{ => configs}/config_debug.yaml | 13 +- .../{ => configs}/config_light.yaml | 0 .../{ => configs}/evaluation_config.yaml | 0 .../{ => configs}/fedot_config.yaml | 2 +- .../use_configs.yaml} | 2 +- experiments/fedot_warm_start/run.py | 693 +++++++++++------- experiments/fedot_warm_start/run_v2.py | 0 .../approaches/knn_similarity_model_advice.py | 11 +- .../pymfe_extractor.py | 8 +- .../datasets_train_test_split.py | 33 +- 11 files changed, 470 insertions(+), 294 deletions(-) rename experiments/fedot_warm_start/{ => configs}/config.yaml (90%) rename experiments/fedot_warm_start/{ => configs}/config_debug.yaml (57%) rename experiments/fedot_warm_start/{ => configs}/config_light.yaml (100%) rename experiments/fedot_warm_start/{ => configs}/evaluation_config.yaml (100%) rename experiments/fedot_warm_start/{ => configs}/fedot_config.yaml (92%) rename experiments/fedot_warm_start/{configs_list.yaml => configs/use_configs.yaml} (69%) delete mode 100644 experiments/fedot_warm_start/run_v2.py diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/configs/config.yaml similarity index 90% rename from experiments/fedot_warm_start/config.yaml rename to experiments/fedot_warm_start/configs/config.yaml index 5effe3f9..cbf58399 100644 --- a/experiments/fedot_warm_start/config.yaml +++ b/experiments/fedot_warm_start/configs/config.yaml @@ -1,6 +1,8 @@ --- seed: 42 tmpdir: '/var/essdata/tmp' +update_train_test_datasets_split: true + #data_settings: n_datasets: null # null for all available datasets test_size: 0.25 diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/configs/config_debug.yaml similarity index 57% rename from experiments/fedot_warm_start/config_debug.yaml rename to experiments/fedot_warm_start/configs/config_debug.yaml index 45cfbf20..99e1ec83 100644 --- a/experiments/fedot_warm_start/config_debug.yaml +++ b/experiments/fedot_warm_start/configs/config_debug.yaml @@ -1,16 +1,19 @@ --- seed: 42 save_dir_prefix: debug_ +update_train_test_datasets_split: true #data_settings: -n_datasets: 3 # null for all available datasets -test_size: 0.33 -train_timeout: 1 -test_timeout: 1 +n_datasets: 10 # null for all available datasets +test_size: 0.4 +train_timeout: 15 +test_timeout: 15 n_automl_repetitions: 1 #meta_learning_params: n_best_dataset_models_to_memorize: 10 mf_extractor_params: - groups: general + # groups: general + features: + - nr_inst assessor_params: n_neighbors: 2 advisor_params: diff --git a/experiments/fedot_warm_start/config_light.yaml b/experiments/fedot_warm_start/configs/config_light.yaml similarity index 100% rename from experiments/fedot_warm_start/config_light.yaml rename to experiments/fedot_warm_start/configs/config_light.yaml diff --git 
a/experiments/fedot_warm_start/evaluation_config.yaml b/experiments/fedot_warm_start/configs/evaluation_config.yaml similarity index 100% rename from experiments/fedot_warm_start/evaluation_config.yaml rename to experiments/fedot_warm_start/configs/evaluation_config.yaml diff --git a/experiments/fedot_warm_start/fedot_config.yaml b/experiments/fedot_warm_start/configs/fedot_config.yaml similarity index 92% rename from experiments/fedot_warm_start/fedot_config.yaml rename to experiments/fedot_warm_start/configs/fedot_config.yaml index 951024cc..bd8f5825 100644 --- a/experiments/fedot_warm_start/fedot_config.yaml +++ b/experiments/fedot_warm_start/configs/fedot_config.yaml @@ -1,7 +1,7 @@ fedot_params: problem: classification logging_level: 10 - n_jobs: -2 + n_jobs: 1 show_progress: false cache_dir: '/var/essdata/tmp/fedot_cache' use_auto_preprocessing: true diff --git a/experiments/fedot_warm_start/configs_list.yaml b/experiments/fedot_warm_start/configs/use_configs.yaml similarity index 69% rename from experiments/fedot_warm_start/configs_list.yaml rename to experiments/fedot_warm_start/configs/use_configs.yaml index b3e2b11f..ac61e8d0 100644 --- a/experiments/fedot_warm_start/configs_list.yaml +++ b/experiments/fedot_warm_start/configs/use_configs.yaml @@ -1,3 +1,3 @@ -- config_debug.yaml +- config.yaml - evaluation_config.yaml - fedot_config.yaml diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 2d3280bd..154129ef 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -3,16 +3,18 @@ import json import logging import os +import sys import pickle import shutil import timeit -from datetime import datetime -from functools import partial, wraps, reduce +from datetime import datetime, timedelta +from functools import partial, wraps from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from uuid import uuid4 import loguru +import numpy as np import openml import pandas as pd import yaml @@ -21,80 +23,118 @@ from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import PipelineBuilder -from fedot.core.repository.metrics_repository import MetricsRepository, QualityMetricsEnum +from fedot.core.repository.metrics_repository import ( + MetricsRepository, + QualityMetricsEnum, +) from golem.core.optimisers.fitness import Fitness -from golem.core.optimisers.opt_history_objects.opt_history import OptHistory from pecapiku import CacheDict from sklearn.model_selection import train_test_split from tqdm import tqdm from typing_extensions import Literal +sys.path.insert(0, str(Path(__file__).parents[2])) + from gamlet.approaches.knn_similarity_model_advice import KNNSimilarityModelAdvice from gamlet.data_preparation.dataset import DatasetIDType, OpenMLDataset, TabularData -from gamlet.data_preparation.datasets_train_test_split import openml_datasets_train_test_split +from gamlet.data_preparation.datasets_train_test_split import ( + openml_datasets_train_test_split, +) from gamlet.data_preparation.file_system import get_cache_dir -CONFIGS_DIR = Path(__file__).parent +CONFIGS_DIR = Path(__file__).parent / "configs" -with open(CONFIGS_DIR / 'configs_list.yaml', 'r') as config_file: +with open(CONFIGS_DIR / "use_configs.yaml", "r") as config_file: configs_list = yaml.load(config_file, yaml.Loader) config = {} for conf_name in configs_list: - with open(CONFIGS_DIR / 
conf_name, 'r') as config_file: + with open(CONFIGS_DIR / conf_name, "r") as config_file: conf = yaml.load(config_file, yaml.Loader) intersection = set(config).intersection(set(conf)) if intersection: - raise ValueError(f'Parameter values given twice: {conf_name}, {intersection}.') + raise ValueError(f"Parameter values given twice: {conf_name}, {intersection}.") config.update(conf) # Load constants -SEED = config['seed'] -N_DATASETS = config['n_datasets'] -TEST_SIZE = config['test_size'] -TRAIN_TIMEOUT = config['train_timeout'] -TEST_TIMEOUT = config['test_timeout'] -N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize'] -ASSESSOR_PARAMS = config['assessor_params'] -ADVISOR_PARAMS = config['advisor_params'] -MF_EXTRACTOR_PARAMS = config['mf_extractor_params'] -COLLECT_METRICS = config['collect_metrics'] -FEDOT_PARAMS = config['fedot_params'] -DATA_TEST_SIZE = config['data_test_size'] -DATA_SPLIT_SEED = config['data_split_seed'] -BASELINE_MODEL = config['baseline_model'] -N_AUTOML_REPETITIONS = config['n_automl_repetitions'] +SEED = config["seed"] +N_DATASETS = config["n_datasets"] +TEST_SIZE = config["test_size"] +TRAIN_TIMEOUT = config["train_timeout"] +TEST_TIMEOUT = config["test_timeout"] +N_BEST_DATASET_MODELS_TO_MEMORIZE = config["n_best_dataset_models_to_memorize"] +ASSESSOR_PARAMS = config["assessor_params"] +ADVISOR_PARAMS = config["advisor_params"] +MF_EXTRACTOR_PARAMS = config["mf_extractor_params"] +COLLECT_METRICS = config["collect_metrics"] +FEDOT_PARAMS = config["fedot_params"] +DATA_TEST_SIZE = config["data_test_size"] +DATA_SPLIT_SEED = config["data_split_seed"] +BASELINE_MODEL = config["baseline_model"] +N_AUTOML_REPETITIONS = config["n_automl_repetitions"] # Optional values -TMPDIR = config.get('tmpdir') -SAVE_DIR_PREFIX = config.get('save_dir_prefix') +TMPDIR = config.get("tmpdir") +SAVE_DIR_PREFIX = config.get("save_dir_prefix") -UPDATE_TRAIN_TEST_DATASETS_SPLIT = config.get('update_train_test_datasets_split') +UPDATE_TRAIN_TEST_DATASETS_SPLIT = config.get("update_train_test_datasets_split") # Postprocess constants COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.get_metric, COLLECT_METRICS)) -COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' +COLLECT_METRICS[COLLECT_METRICS.index("neg_log_loss")] = "logloss" + + +def setup_experiment(): + # Preparation + experiment_date, experiment_date_iso, experiment_date_for_path = ( + get_current_formatted_date() + ) + save_dir = get_save_dir(experiment_date_for_path) + setup_logging(save_dir) + if TMPDIR: + os.environ.putenv("TMPDIR", TMPDIR) + meta_learner_path = save_dir.joinpath("meta_learner.pkl") + dataset_ids = get_dataset_ids() + dataset_ids_train, dataset_ids_test = split_datasets( + dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT + ) + dataset_ids = dataset_ids_train + dataset_ids_test + experiment_params_dict = dict( + experiment_start_date_iso=experiment_date_iso, + input_config=config, + dataset_ids=dataset_ids, + dataset_ids_train=dataset_ids_train, + dataset_ids_test=dataset_ids_test, + baseline_pipeline=BASELINE_MODEL, + ) + save_experiment_params(experiment_params_dict, save_dir) + return ( + dataset_ids_test, + dataset_ids_train, + experiment_date, + meta_learner_path, + save_dir, + ) def setup_logging(save_dir: Path): - """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. 
""" + """Creates "log.txt" at the "save_dir" and redirects all logging output to it.""" loguru.logger.add(save_dir / "file_{time}.log") - log_file = save_dir.joinpath('log.txt') + log_file = save_dir.joinpath("log.txt") logging.basicConfig( filename=log_file, - filemode='a', - format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', - datefmt='%H:%M:%S', + filemode="a", + format="%(asctime)s %(name)s %(levelname)s %(message)s", force=True, - level=logging.DEBUG, + level=logging.NOTSET, ) def get_current_formatted_date() -> Tuple[datetime, str, str]: - """ Returns current date in the following formats: + """Returns current date in the following formats: - 1. datetime - 2. str: ISO - 3. str: ISO compatible with Windows file system path (with "." instead of ":") """ + 1. datetime + 2. str: ISO + 3. str: ISO compatible with Windows file system path (with "." instead of ":")""" time_now = datetime.now() time_now_iso = time_now.isoformat(timespec="minutes") time_now_for_path = time_now_iso.replace(":", ".") @@ -102,8 +142,12 @@ def get_current_formatted_date() -> Tuple[datetime, str, str]: def get_save_dir(time_now_for_path) -> Path: - save_dir = get_cache_dir(). \ - joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') + save_dir = ( + get_cache_dir() + .joinpath("experiments") + .joinpath("fedot_warm_start") + .joinpath(f"run_{time_now_for_path}") + ) if SAVE_DIR_PREFIX: save_dir = save_dir.with_name(SAVE_DIR_PREFIX + save_dir.name) if save_dir.exists(): @@ -121,18 +165,21 @@ def get_dataset_ids() -> List[DatasetIDType]: return list(dataset_ids) -def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False) \ - -> Tuple[pd.DataFrame, pd.DataFrame]: - split_path = Path(__file__).parent / 'train_test_datasets_split.csv' +def split_datasets( + dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False +) -> Tuple[list, list]: + split_path = Path(__file__).parent / "train_test_datasets_split.csv" if update_train_test_split: - df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) + df_split_datasets = openml_datasets_train_test_split( + dataset_ids, test_size=TEST_SIZE, seed=SEED + ) df_split_datasets.to_csv(split_path) else: df_split_datasets = pd.read_csv(split_path, index_col=0) - df_train = df_split_datasets[df_split_datasets['is_train'] == 1] - df_test = df_split_datasets[df_split_datasets['is_train'] == 0] + df_train = df_split_datasets[df_split_datasets["is_train"] == 1] + df_test = df_split_datasets[df_split_datasets["is_train"] == 0] if n_datasets is not None: frac = n_datasets / len(df_split_datasets) @@ -145,13 +192,14 @@ def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_t return datasets_train, datasets_test -def evaluate_pipeline(pipeline: Pipeline, - train_data: TabularData, - test_data: TabularData, - metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, - metric_names: Sequence[str] = COLLECT_METRICS, - mode: Literal['fitness', 'float'] = 'float' - ) -> Union[Dict[str, float], Tuple[Fitness, Sequence[str]]]: +def evaluate_pipeline( + pipeline: Pipeline, + train_data: TabularData, + test_data: TabularData, + metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, + metric_names: Sequence[str] = COLLECT_METRICS, + mode: Literal["fitness", "float"] = "float", +) -> Union[Dict[str, float], Tuple[Fitness, Sequence[str]]]: """Gets quality metrics for the fitted pipeline. 
The function is based on `Fedot.get_metrics()` @@ -165,87 +213,97 @@ def data_producer(): yield train_data, test_data objective = MetricsObjective(metrics) - obj_eval = PipelineObjectiveEvaluate(objective=objective, - data_producer=data_producer, - eval_n_jobs=-1) + obj_eval = PipelineObjectiveEvaluate( + objective=objective, data_producer=data_producer, eval_n_jobs=-1 + ) fitness = obj_eval.evaluate(pipeline) - if mode == 'float': + if mode == "float": metric_values = fitness.values - metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)} + metric_values = { + metric_name: round(value, 3) + for (metric_name, value) in zip(metric_names, metric_values) + } return metric_values - if mode == 'fitness': + if mode == "fitness": return fitness, metric_names -def timed(func, resolution: Literal['sec', 'min'] = 'min'): +def timed(func): @wraps(func) def wrapper(*args, **kwargs): time_start = timeit.default_timer() result = func(*args, **kwargs) - time_delta = timeit.default_timer() - time_start - if resolution == 'min': - time_delta /= 60 + time_delta = timedelta(seconds=timeit.default_timer() - time_start) return result, time_delta return wrapper -def fit_evaluate_automl(fit_func, evaluate_func) -> (Fedot, Dict[str, Any]): - """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset.. """ +def fit_evaluate_automl( + fit_func, evaluate_func +) -> Tuple[Fedot, Dict[str, Any], timedelta]: + """Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset..""" result, fit_time = timed(fit_func)() metrics = evaluate_func(result) return result, metrics, fit_time -def fit_evaluate_pipeline(pipeline, fit_func, evaluate_func) -> (Fedot, Dict[str, Any]): - """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset.. 
""" +def fit_evaluate_pipeline( + pipeline, fit_func, evaluate_func +) -> Tuple[Fedot, Dict[str, Any], timedelta]: + """Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset..""" _, fit_time = timed(fit_func)() metrics = evaluate_func(pipeline) return pipeline, metrics, fit_time def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path): - """ Save the hyperparameters of the experiment """ - params_file_path = save_dir.joinpath('parameters.json') - with open(params_file_path, 'w') as params_file: + """Save the hyperparameters of the experiment""" + params_file_path = save_dir.joinpath("parameters.json") + with open(params_file_path, "w") as params_file: json.dump(params_dict, params_file, indent=2) def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): - run_results: Dict[str, Any] = dict(dataset_id=dataset.id, - dataset_name=dataset.name, - model_obj=pipeline, - model_str=pipeline.descriptive_id, - task_type='classification', - **kwargs) + run_results: Dict[str, Any] = dict( + dataset_id=dataset.id, + dataset_name=dataset.name, + model_obj=pipeline, + model_str=pipeline.descriptive_id, + task_type="classification", + **kwargs, + ) try: - histories_dir = save_dir.joinpath('histories') - models_dir = save_dir.joinpath('models') - eval_results_path = save_dir.joinpath('evaluation_results.csv') + histories_dir = save_dir.joinpath("histories") + models_dir = save_dir.joinpath("models") + eval_results_path = save_dir.joinpath("evaluation_results.csv") histories_dir.mkdir(exist_ok=True) models_dir.mkdir(exist_ok=True) - dataset_id = run_results['dataset_id'] - run_label = run_results['run_label'] + dataset_id = run_results["dataset_id"] + run_label = run_results["run_label"] # define saving paths uid = str(uuid4()) - model_path = models_dir.joinpath(f'{dataset_id}_{run_label}_{uid}') - history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_{uid}_history.json') + model_path = models_dir.joinpath(f"{dataset_id}_{run_label}_{uid}") + history_path = histories_dir.joinpath( + f"{dataset_id}_{run_label}_{uid}_history.json" + ) # replace objects with export paths for csv - run_results['model_path'] = str(model_path) - run_results.pop('model_obj').save(model_path, create_subdir=False) - run_results['history_path'] = str(history_path) - if 'history_obj' in run_results: - history_obj = run_results.pop('history_obj') + run_results["model_path"] = str(model_path) + run_results.pop("model_obj").save(model_path, create_subdir=False) + run_results["history_path"] = str(history_path) + if "history_obj" in run_results: + history_obj = run_results.pop("history_obj") if history_obj is not None: - history_obj.save(run_results['history_path']) + history_obj.save(run_results["history_path"]) + run_results["history_obj"] = history_obj df_evaluation_properties = pd.DataFrame([run_results]) if eval_results_path.exists(): - df_results = pd.read_csv(eval_results_path) + df_results = pd.read_csv(eval_results_path, index_col=None) df_results = pd.concat([df_results, df_evaluation_properties]) else: df_results = df_evaluation_properties @@ -257,138 +315,327 @@ def save_evaluation(save_dir: Path, dataset, pipeline, **kwargs): raise e -def run_fedot_attempt(train_data: TabularData, test_data: TabularData, timeout: float, - run_label: str, repetition: int, experiment_date: datetime, save_dir: Path, - fedot_evaluations_cache: CacheDict, - initial_assumption: Optional[Sequence[Pipeline]] = None, meta_learning_time_sec: float = 0.): - fedot = Fedot(timeout=timeout, 
initial_assumption=initial_assumption, **FEDOT_PARAMS) +def run_fedot_attempt( + train_data: TabularData, + test_data: TabularData, + timeout: float, + run_label: str, + repetition: int, + experiment_date: datetime, + save_dir: Path, + initial_assumption: Optional[Sequence[Pipeline]] = None, + fedot_evaluations_cache=None, +): + fedot = Fedot( + timeout=timeout, initial_assumption=initial_assumption, **FEDOT_PARAMS + ) fit_func = partial(fedot.fit, features=train_data.x, target=train_data.y) - evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) + evaluate_func = partial( + evaluate_pipeline, train_data=train_data, test_data=test_data + ) run_date = datetime.now() - # cache_key = f'{run_label}_{train_data.id}_{timeout}_{repetition}' - # with fedot_evaluations_cache as cache_dict: - # cached_run = cache_dict[cache_key] - # if cached_run: - # fedot = cached_run['fedot'] - # pipeline = cached_run['pipeline'] - # metrics = cached_run['metrics'] - # fit_time = cached_run['fit_time'] - # else: - # pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) - # cached_run = dict( - # fedot=fedot, - # pipeline=pipeline, - # metrics=metrics, - # fit_time=fit_time, - # ) - # cache_dict[cache_key] = cached_run - pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) + cache_key = f"{run_label}_{train_data.id}_{timeout}_{repetition}" + with fedot_evaluations_cache as cache_dict: + cached_run = cache_dict[cache_key] + if cached_run: + fedot = cached_run["fedot"] + pipeline = cached_run["pipeline"] + metrics = cached_run["metrics"] + fit_time = cached_run["fit_time"] + else: + # pipeline, metrics, fit_time = fit_evaluate_automl(fit_func=fit_func, evaluate_func=evaluate_func) + # cached_run = dict( + # fedot=fedot, + # pipeline=pipeline, + # metrics=metrics, + # fit_time=fit_time, + # ) + # cache_dict[cache_key] = cached_run + pipeline, metrics, fit_time = fit_evaluate_automl( + fit_func=fit_func, evaluate_func=evaluate_func + ) eval_result = dict( dataset=train_data.dataset, run_label=run_label, pipeline=pipeline, - meta_learning_time_sec=meta_learning_time_sec, - automl_time_min=fit_time, + automl_time_min=fit_time.total_seconds() / 60, automl_timeout_min=fedot.params.timeout, generations_count=fedot.history.generations_count, history_obj=fedot.history, run_data=run_date, experiment_date=experiment_date, save_dir=save_dir, - **metrics + **metrics, ) return eval_result -def run_pipeline(train_data: TabularData, test_data: TabularData, pipeline: Pipeline, - run_label: str, experiment_date: datetime, save_dir: Path): +def run_pipeline( + train_data: TabularData, + test_data: TabularData, + pipeline: Pipeline, + run_label: str, + experiment_date: datetime, + save_dir: Path, +): train_data_for_fedot = array_to_input_data(train_data.x, train_data.y) fit_func = partial(pipeline.fit, train_data_for_fedot) - evaluate_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data) + evaluate_func = partial( + evaluate_pipeline, train_data=train_data, test_data=test_data + ) run_date = datetime.now() - pipeline, metrics, fit_time = fit_evaluate_pipeline(pipeline=pipeline, fit_func=fit_func, - evaluate_func=evaluate_func) - save_evaluation(dataset=train_data.dataset, - run_label=run_label, - pipeline=pipeline, - automl_time_min=0, - pipeline_fit_time=fit_time, - automl_timeout_min=0, - meta_learning_time_sec=0, - run_data=run_date, - experiment_date=experiment_date, - save_dir=save_dir, 
- **metrics) + pipeline, metrics, fit_time = fit_evaluate_pipeline( + pipeline=pipeline, fit_func=fit_func, evaluate_func=evaluate_func + ) + save_evaluation( + dataset=train_data.dataset, + run_label=run_label, + pipeline=pipeline, + automl_time_min=0, + pipeline_fit_time_sec=fit_time.total_seconds(), + automl_timeout_min=0, + meta_learning_time_sec=0, + run_data=run_date, + experiment_date=experiment_date, + save_dir=save_dir, + **metrics, + ) return pipeline +def get_datasets_eval_funcs(dataset_ids_train, dataset_splits): + dataset_eval_funcs = [] + for dataset_id in dataset_ids_train: + split = dataset_splits[dataset_id] + train_data, test_data = split["train"], split["test"] + model_eval_func = partial( + evaluate_pipeline, + train_data=train_data, + test_data=test_data, + mode="fitness", + ) + dataset_eval_funcs.append(model_eval_func) + return dataset_eval_funcs + + +def get_datasets_data_splits(dataset_ids): + dataset_splits = {} + for dataset_id in dataset_ids: + dataset = OpenMLDataset(dataset_id) + dataset_data = dataset.get_data() + if isinstance(dataset_data.y[0], bool): + dataset_data.y = np.array(list(map(str, dataset_data.y))) + idx_train, idx_test = train_test_split( + range(len(dataset_data.y)), + test_size=DATA_TEST_SIZE, + stratify=dataset_data.y, + shuffle=True, + random_state=DATA_SPLIT_SEED, + ) + train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] + dataset_splits[dataset_id] = dict(train=train_data, test=test_data) + return dataset_splits + + +def evaluate_fedot_on_dataset( + train_data: TabularData, + test_data: TabularData, + timeout: float, + run_label: str, + experiment_date: datetime, + save_dir: Path, + fedot_evaluations_cache: CacheDict, + initial_assumption: Optional[Sequence[Pipeline]] = None, + meta_learning_time: Optional[timedelta] = None, +): + meta_learning_time = meta_learning_time or timedelta(0) + dataset = train_data.dataset + + eval_results = [] + for repetition in range(N_AUTOML_REPETITIONS): + try: + eval_result, time_delta = timed(run_fedot_attempt)( + train_data, + test_data, + timeout, + run_label, + repetition, + experiment_date, + save_dir, + initial_assumption, + fedot_evaluations_cache, + ) + time_limit = timedelta(minutes=timeout * 2) + if time_delta > time_limit: + logging.warning( + f'Dataset "{dataset.id}" TIMEOUT REACHED, {time_delta}.' 
+ ) + return None + + eval_results.append(eval_result) + except Exception as e: + logging.warning(f'Dataset "{dataset.id}" skipepd: {e}') + logging.exception(f'Dataset "{dataset.id}"') + if __debug__: + raise e + return None + + generations_total = sum( + map(lambda ev_res: ev_res["history_obj"].generations_count, eval_results) + ) + if generations_total == 0: + logging.warning(f'Dataset "{dataset.id}": zero generations obtained.') + return None + + for eval_result in eval_results: + eval_result["meta_learning_time_sec"] = meta_learning_time.total_seconds() + save_evaluation(**eval_result) + + histories = list(map(lambda r: r["history_obj"], eval_results)) + + return histories + + @loguru.logger.catch def main(): - dataset_ids_test, dataset_ids_train, experiment_date, meta_learner_path, save_dir = setup_experiment() + ( + dataset_ids_test, + dataset_ids_train, + experiment_date, + meta_learner_path, + save_dir, + ) = setup_experiment() - # fit_fedot_cached = CacheDict.decorate(fit_evaluate_automl, get_cache_dir() / 'fedot_runs.pkl', inner_key='dataset.id') dataset_splits = get_datasets_data_splits(dataset_ids_test + dataset_ids_train) - datasets_eval_funcs = get_datasets_eval_funcs(dataset_ids_train, dataset_splits) algorithm = KNNSimilarityModelAdvice( N_BEST_DATASET_MODELS_TO_MEMORIZE, MF_EXTRACTOR_PARAMS, ASSESSOR_PARAMS, - ADVISOR_PARAMS + ADVISOR_PARAMS, ) - # Experiment start - knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} - fedot_evaluations_cache = CacheDict(get_cache_dir() / 'fedot_runs.pkl') - description = 'FEDOT, train datasets' - for dataset_id in (pbar := tqdm(dataset_ids_train, description)): - pbar.set_description(description + f' ({dataset_id})') - train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] - run_label = 'FEDOT' - evaluate_fedot_on_dataset(train_data, test_data, TRAIN_TIMEOUT, run_label, experiment_date, save_dir, - fedot_evaluations_cache) - # knowledge_base[dataset_id] = gain_knowledge_base_for_dataset(dataset_id, experiment_date, - # fedot_evaluations_cache, - # run_label, save_dir, - # test_data, TRAIN_TIMEOUT, train_data) - # knowledge_base[dataset_id] = [fedot.history for fedot in fedots] - - description = 'FEDOT, test datasets' - for dataset_id in (pbar := tqdm(dataset_ids_test, description)): - pbar.set_description(description + f' ({dataset_id})') - train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] - run_label = 'FEDOT' - evaluate_fedot_on_dataset(train_data, test_data, TEST_TIMEOUT, run_label, experiment_date, save_dir, - fedot_evaluations_cache) + # knowledge_base = {dataset_id: [] for dataset_id in dataset_ids_train} + knowledge_base = {} + skipped_datasets = set() + fedot_evaluations_cache = CacheDict(get_cache_dir() / "fedot_runs.pkl") + # fedot_evaluations_cache = None + # evaluate_fedot_on_dataset_cached = CacheDict.decorate(evaluate_fedot_on_dataset, get_cache_dir() / 'fedot_runs.pkl', inner_key='train_data.id') + description = "FEDOT, all datasets ({dataset_id})" + for dataset_id in (pbar := tqdm(dataset_ids_train + dataset_ids_test, description)): + pbar.set_description(description.format(dataset_id=dataset_id)) + train_data, test_data = ( + dataset_splits[dataset_id]["train"], + dataset_splits[dataset_id]["test"], + ) + run_label = "FEDOT" + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_test else TEST_TIMEOUT + histories = evaluate_fedot_on_dataset( + train_data, + test_data, + timeout, + run_label, + experiment_date, + 
save_dir, + fedot_evaluations_cache, + ) + if histories is not None: + if dataset_id in dataset_ids_train: + knowledge_base[dataset_id] = histories + continue + # Error processing - throw the dataset out + skipped_datasets.add(dataset_id) + if dataset_id in dataset_ids_train: + del dataset_ids_train[dataset_ids_train.index(dataset_id)] + else: + del dataset_ids_test[dataset_ids_test.index(dataset_id)] + + with open(save_dir / "skipped_datasets.txt", "w") as f: + f.write("\n".join(map(str, skipped_datasets))) ############################### - kb_datasets_data = [OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys()] + kb_datasets_data = [ + OpenMLDataset(dataset).get_data() for dataset in knowledge_base.keys() + ] + # datasets_eval_funcs = get_datasets_eval_funcs(dataset_ids_train, dataset_splits) + datasets_eval_funcs = None kb_histories = list(knowledge_base.values()) ############################### # Meta-Learning algorithm.fit(kb_datasets_data, kb_histories, datasets_eval_funcs) - with open(meta_learner_path, 'wb') as meta_learner_file: + for dataset_id in dataset_ids_train: + if dataset_id not in algorithm.data.dataset_ids: + skipped_datasets.add(dataset_id) + del dataset_ids_train[dataset_ids_train.index(dataset_id)] + with open(save_dir / "skipped_datasets.txt", "w") as f: + f.write("\n".join(map(str, skipped_datasets))) + + with open(meta_learner_path, "wb") as meta_learner_file: pickle.dump(algorithm, meta_learner_file) # Application - description = 'MetaFEDOT, Test datasets' + # evaluate_metafedot_on_dataset_cached = CacheDict.decorate(evaluate_fedot_on_dataset, get_cache_dir() / 'metafedot_runs.pkl', inner_key='train_data.id') + fedot_evaluations_cache = CacheDict(get_cache_dir() / "metafedot_runs.pkl") + description = "FEDOT, test datasets ({dataset_id})" for dataset_id in (pbar := tqdm(dataset_ids_test, description)): - pbar.set_description(description + f' ({dataset_id})') - train_data, test_data = dataset_splits[dataset_id]['train'], dataset_splits[dataset_id]['test'] + pbar.set_description(description.format(dataset_id=dataset_id)) + train_data, test_data = ( + dataset_splits[dataset_id]["train"], + dataset_splits[dataset_id]["test"], + ) # Run meta AutoML # 1 - initial_assumptions, meta_learning_time_sec = timed(algorithm.predict, resolution='sec')([train_data]) + try: + initial_assumptions, meta_learning_time = timed(algorithm.predict)( + [train_data] + ) + if not initial_assumptions: + raise ValueError("No intial assumptions.") + except Exception: + logging.exception( + f'Dataset "{dataset_id}" skipepd, meta learner could not predict: {e}' + ) + skipped_datasets.add(dataset_id) + del dataset_ids_test[dataset_ids_test.index(dataset_id)] + continue + initial_assumptions = initial_assumptions[0] assumption_pipelines = [model.predictor for model in initial_assumptions] # 2 baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() - run_label = 'MetaFEDOT' - evaluate_fedot_on_dataset(train_data, test_data, TEST_TIMEOUT, run_label, experiment_date, save_dir, - fedot_evaluations_cache, assumption_pipelines, meta_learning_time_sec) + run_label = "MetaFEDOT" + try: + histories = evaluate_fedot_on_dataset( + train_data, + test_data, + TEST_TIMEOUT, + run_label, + experiment_date, + save_dir, + fedot_evaluations_cache, + assumption_pipelines, + meta_learning_time, + ) + if histories is None: + raise ValueError("No results.") + except Exception as e: + logging.exception( + f'Dataset "{dataset_id}" skipepd, meta fedot could not finish: {e}' + ) + 
skipped_datasets.add(dataset_id) + del dataset_ids_test[dataset_ids_test.index(dataset_id)] + continue # Fit & evaluate simple baseline - run_label = 'simple baseline' + run_label = "simple baseline" try: - run_pipeline(train_data, test_data, baseline_pipeline, run_label, experiment_date, save_dir) + run_pipeline( + train_data, + test_data, + baseline_pipeline, + run_label, + experiment_date, + save_dir, + ) except Exception as e: logging.exception(f'Test dataset "{dataset_id}", {run_label}') if __debug__: @@ -397,107 +644,27 @@ def main(): for i, assumption in enumerate(initial_assumptions): try: pipeline = assumption.predictor - run_label = f'MetaFEDOT - initial assumption {i}' - run_pipeline(train_data, test_data, pipeline, run_label, experiment_date, save_dir) + run_label = f"MetaFEDOT - initial assumption {i}" + run_pipeline( + train_data, + test_data, + pipeline, + run_label, + experiment_date, + save_dir, + ) except Exception as e: logging.exception(f'Test dataset "{dataset_id}", {run_label}') if __debug__: raise e - -def get_datasets_eval_funcs(dataset_ids_train, dataset_splits): - dataset_eval_funcs = [] - for dataset_id in dataset_ids_train: - split = dataset_splits[dataset_id] - train_data, test_data = split['train'], split['test'] - model_eval_func = partial(evaluate_pipeline, train_data=train_data, test_data=test_data, mode='fitness') - dataset_eval_funcs.append(model_eval_func) - return dataset_eval_funcs - - -def get_datasets_data_splits(dataset_ids): - dataset_splits = {} - for dataset_id in dataset_ids: - dataset = OpenMLDataset(dataset_id) - dataset_data = dataset.get_data() - idx_train, idx_test = train_test_split(range(len(dataset_data.y)), - test_size=DATA_TEST_SIZE, - stratify=dataset_data.y, - shuffle=True, - random_state=DATA_SPLIT_SEED) - train_data, test_data = dataset_data[idx_train], dataset_data[idx_test] - dataset_splits[dataset_id] = dict(train=train_data, test=test_data) - return dataset_splits - - -def setup_experiment(): - # Preparation - experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() - save_dir = get_save_dir(experiment_date_for_path) - setup_logging(save_dir) - if TMPDIR: - os.environ.putenv('TMPDIR', TMPDIR) - meta_learner_path = save_dir.joinpath('meta_learner.pkl') - dataset_ids = get_dataset_ids() - dataset_ids_train, dataset_ids_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT) - dataset_ids = dataset_ids_train + dataset_ids_test - experiment_params_dict = dict( - experiment_start_date_iso=experiment_date_iso, - input_config=config, - dataset_ids=dataset_ids, - dataset_ids_train=dataset_ids_train, - dataset_ids_test=dataset_ids_test, - baseline_pipeline=BASELINE_MODEL, - ) - save_experiment_params(experiment_params_dict, save_dir) - return dataset_ids_test, dataset_ids_train, experiment_date, meta_learner_path, save_dir - - -def evaluate_fedot_on_dataset(train_data: TabularData, test_data: TabularData, timeout: float, - run_label: str, experiment_date: datetime, save_dir: Path, - fedot_evaluations_cache: CacheDict, - initial_assumption: Optional[Sequence[Pipeline]] = None, - meta_learning_time_sec: float = 0.): - dataset = train_data.dataset - eval_results = [] - for repetition in range(N_AUTOML_REPETITIONS): - try: - eval_result, time_delta = timed( - run_fedot_attempt(train_data, test_data, timeout, run_label, repetition, experiment_date, save_dir, - fedot_evaluations_cache)) - # TODO: - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - - # 
TODO: stopping condition
- eval_results.append(eval_result)
- except Exception as e:
- logging.exception(f'Dataset "{dataset.id}"')
- if __debug__:
- raise e
-
- for eval_result in eval_results:
- save_evaluation(**eval_result)
-
- return eval_results
-
-
-def gain_knowledge_base_for_dataset(train_data: TabularData, test_data: TabularData, timeout: float,
- run_label: str, experiment_date: datetime, save_dir: Path,
- fedot_evaluations_cache: CacheDict,
- initial_assumption: Optional[Sequence[Pipeline]] = None,
- meta_learning_time_sec: float = 0.):
- eval_results = evaluate_fedot_on_dataset(train_data, test_data, timeout,
- run_label, experiment_date, save_dir,
- fedot_evaluations_cache,
- initial_assumption,
- meta_learning_time_sec)
- histories = reduce([OptHistory.load, ], [res['history_path'] for res in eval_results])
- return histories
+ with open(save_dir / "skipped_datasets.txt", "w") as f:
+ f.write("\n".join(map(str, skipped_datasets)))

if __name__ == "__main__":
 try:
 main()
 except Exception as e:
- logging.exception('Exception at main().')
+ logging.exception("Exception at main().")
 raise e
diff --git a/experiments/fedot_warm_start/run_v2.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/gamlet/approaches/knn_similarity_model_advice.py b/gamlet/approaches/knn_similarity_model_advice.py
index d47b0835..9a42c204 100644
--- a/gamlet/approaches/knn_similarity_model_advice.py
+++ b/gamlet/approaches/knn_similarity_model_advice.py
@@ -4,6 +4,7 @@
 from typing import Callable, List, Optional, Sequence

 from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
+import pandas as pd
 from sklearn.preprocessing import MinMaxScaler

 from gamlet.approaches import MetaLearningApproach
@@ -55,7 +56,7 @@ class Components:
 class Data:
 meta_features: DatasetMetaFeatures = None
 datasets: List[OpenMLDataset] = None
- datasets_data: List[OpenMLDataset] = None
+ datasets_data: List[TabularData] = None
 dataset_ids: List[DatasetIDType] = None
 best_models: List[List[EvaluatedModel]] = None

@@ -66,11 +67,11 @@ def fit(self,
 data = self.data
 params = self.parameters

- data.datasets_data = list(datasets_data)
- data.datasets = [d.dataset for d in datasets_data]
- data.dataset_ids = [d.id for d in datasets_data]
+ data.meta_features = self.extract_train_meta_features(datasets_data)
+ data.dataset_ids = list(data.meta_features.index)
+ data.datasets_data = [d_d for d_d in datasets_data if d_d.id in data.dataset_ids]
+ data.datasets = [d_d.dataset for d_d in data.datasets_data]

- data.meta_features = self.extract_train_meta_features(data.datasets_data)
 self.fit_datasets_similarity_assessor(data.meta_features, data.dataset_ids)

 data.best_models = self.load_models(data.datasets, histories, params.n_best_dataset_models_to_memorize,
diff --git a/gamlet/components/meta_features_extractors/pymfe_extractor.py b/gamlet/components/meta_features_extractors/pymfe_extractor.py
index 91702523..4a02f29b 100644
--- a/gamlet/components/meta_features_extractors/pymfe_extractor.py
+++ b/gamlet/components/meta_features_extractors/pymfe_extractor.py
@@ -5,6 +5,7 @@
 from functools import partial
 from typing import Any, Dict, Optional, Sequence, Tuple, Union

+import numpy as np
 import pandas as pd
 from pymfe.mfe import MFE
 from tqdm import tqdm
@@ -31,8 +32,11 @@ def extract(self, data_sequence: Sequence[Union[DatasetBase, TabularData]],
 for i, dataset_data in enumerate(tqdm(data_sequence, desc='Extracting meta features of the datasets')):
 if isinstance(dataset_data, DatasetBase):
 dataset_data = dataset_data.get_data()
- meta_features = self._extract_single(dataset_data, fill_input_nans, fit_kwargs, extract_kwargs)
- accumulated_meta_features.append(meta_features)
+ try:
+ meta_features = self._extract_single(dataset_data, fill_input_nans, fit_kwargs, extract_kwargs)
+ accumulated_meta_features.append(meta_features)
+ except Exception:
+ logger.exception(f'Dataset {dataset_data.dataset}: error during meta-features extraction.')

 output = DatasetMetaFeatures(pd.concat(accumulated_meta_features), is_summarized=self.summarize_features,
 features=self.features)
diff --git a/gamlet/data_preparation/datasets_train_test_split.py b/gamlet/data_preparation/datasets_train_test_split.py
index 75e97e19..ebd26abf 100644
--- a/gamlet/data_preparation/datasets_train_test_split.py
+++ b/gamlet/data_preparation/datasets_train_test_split.py
@@ -29,24 +29,23 @@ def openml_datasets_train_test_split(dataset_ids: List[OpenMLDatasetIDType], tes
 single_value_categories = cat_counts[cat_counts == 1].index
 idx = df_split_categories[df_split_categories['category'].isin(single_value_categories)].index
 df_split_categories.loc[idx, 'category'] = 'single_value'
- df_datasets_to_split = df_split_categories[df_split_categories['category'] != 'single_value']
- df_test_only_datasets = df_split_categories[df_split_categories['category'] == 'single_value']
- if not df_datasets_to_split.empty:
- df_train_datasets, df_test_datasets = train_test_split(
- df_datasets_to_split,
- test_size=test_size,
- shuffle=True,
- stratify=df_datasets_to_split['category'],
- random_state=seed
- )
- df_test_datasets = pd.concat([df_test_datasets, df_test_only_datasets])
+ single_value_datasets = df_split_categories[df_split_categories['category'] == 'single_value']
+ if len(single_value_datasets) >= 1:
+ df_datasets_to_split = df_split_categories
+ additional_datasets = pd.DataFrame([])
 else:
- df_train_datasets, df_test_datasets = train_test_split(
- df_split_categories,
- test_size=test_size,
- shuffle=True,
- random_state=seed
- )
+ df_datasets_to_split = df_split_categories[df_split_categories['category'] != 'single_value']
+ additional_datasets = single_value_datasets
+
+ df_train_datasets, df_test_datasets = train_test_split(
+ df_datasets_to_split,
+ test_size=test_size,
+ shuffle=True,
+ stratify=df_datasets_to_split['category'],
+ random_state=seed
+ )
+ df_train_datasets = pd.concat([df_train_datasets, additional_datasets])
+
 df_train_datasets['is_train'] = 1
 df_test_datasets['is_train'] = 0
 df_split_datasets = pd.concat([df_train_datasets, df_test_datasets]).join(

From 295c7e2fa782be0b4dce0a21463b97ab7045f665 Mon Sep 17 00:00:00 2001
From: Peter Shevcnenko <57573631+MorrisNein@users.noreply.github.com>
Date: Tue, 21 May 2024 12:09:19 +0300
Subject: [PATCH 34/34] Builds fix (#98)

* remove redundant test
* remove codecov badge
---
 README.md | 2 +-
 tests/unit/surrogate/test_surrogate_model.py | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 8c5fffcb..746860da 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![package](https://badge.fury.io/py/gamlet.svg)](https://badge.fury.io/py/gamlet)
 [![Build](https://github.com/ITMO-NSS-team/MetaFEDOT/actions/workflows/build.yml/badge.svg)](https://github.com/ITMO-NSS-team/MetaFEDOT/actions/workflows/build.yml)
 [![Documentation Status](https://readthedocs.org/projects/gamlet/badge/?version=latest)](https://gamlet.readthedocs.io/en/latest/?badge=latest)
-[![codecov](https://codecov.io/gh/ITMO-NSS-team/GAMLET/graph/badge.svg?token=N3Z9YTPHP9)](https://codecov.io/gh/ITMO-NSS-team/GAMLET) + [![Visitors](https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fgithub.com%2FITMO-NSS-team%2FMetaFEDOT&countColor=%23263759&style=plastic&labelStyle=lower)](https://visitorbadge.io/status?path=https%3A%2F%2Fgithub.com%2FITMO-NSS-team%2FMetaFEDOT) GAMLET (previously known as MetaFEDOT) is an open platform for sharing meta-learning experiences in **AutoML** and more diff --git a/tests/unit/surrogate/test_surrogate_model.py b/tests/unit/surrogate/test_surrogate_model.py index a7572b42..92d93f1f 100644 --- a/tests/unit/surrogate/test_surrogate_model.py +++ b/tests/unit/surrogate/test_surrogate_model.py @@ -29,10 +29,3 @@ def get_test_data(): x_pipe = torch.load(path / 'data_pipe_test.pt') x_dset = torch.load(path / 'data_dset_test.pt') return x_pipe, x_dset - - -def test_model_output(read_config): - x_pipe, x_dset = get_test_data() - model = create_model_from_config(read_config, x_pipe, x_dset) - pred = torch.squeeze(model.forward(x_pipe, x_dset)) - assert pred.shape[0] == 256
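The last functional change in this series reworks how `openml_datasets_train_test_split` treats dataset categories that occur only once: such categories cannot be stratified, so the corresponding datasets are concatenated into the training portion (`df_train_datasets`) instead of the test portion. The sketch below illustrates that general idea in isolation; `toy_category_split`, the column layout, and the demo data are invented for this example and do not reproduce the project's actual category construction or edge-case handling.

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def toy_category_split(df: pd.DataFrame, test_size: float = 0.3, seed: int = 42):
    """Stratified split by 'category', routing single-member categories to train.

    Hypothetical helper for illustration only; not part of the GAMLET code base.
    """
    counts = df['category'].value_counts()
    single = counts[counts == 1].index
    df = df.copy()
    # Group every category that occurs exactly once under one label.
    df.loc[df['category'].isin(single), 'category'] = 'single_value'

    to_split = df[df['category'] != 'single_value']
    train_only = df[df['category'] == 'single_value']

    train_df, test_df = train_test_split(
        to_split,
        test_size=test_size,
        shuffle=True,
        stratify=to_split['category'],
        random_state=seed,
    )
    # Single-member categories cannot be stratified, so keep them for training.
    train_df = pd.concat([train_df, train_only])
    return train_df, test_df


if __name__ == '__main__':
    demo = pd.DataFrame({
        'dataset_id': range(10),
        'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'c'],
    })
    train, test = toy_category_split(demo)
    print(sorted(train['dataset_id']), sorted(test['dataset_id']))
```

Keeping un-stratifiable datasets on the training side mirrors the direction of the patch, where `additional_datasets` end up in `df_train_datasets` rather than in the test set.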