From 107714e03efa60f507eb3411c27a8dc7a4d69459 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 17:44:30 +0300 Subject: [PATCH 01/60] create requirements.txt --- requirements.txt | Bin 0 -> 280 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..17db3011d9c2832987c96b696a85b52c145131e1 GIT binary patch literal 280 zcmY+9Ne;p=5CrRt#8W_V!gAs^N}LBxcI)lyFs&`Q(oOKY#lN-!S_mE@~HDXW1*?EwdK^Q70h) literal 0 HcmV?d00001 From e67bde826ffd27b397dac94d3f1861859f831a26 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 17:53:07 +0300 Subject: [PATCH 02/60] move to FEDOT 0.7.0 --- .../advise_models_from_similar_datasets.py | 2 +- meta_automl/data_preparation/model.py | 2 +- .../models_loaders/fedot_pipelines_loader.py | 2 +- requirements.txt | Bin 280 -> 310 bytes 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index 85d62b48..f7d583c5 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -1,5 +1,5 @@ -from fedot.core.optimisers.fitness import SingleObjFitness from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from golem.core.optimisers.fitness import SingleObjFitness from sklearn.model_selection import train_test_split from meta_automl.data_preparation.dataset import DatasetCache diff --git a/meta_automl/data_preparation/model.py b/meta_automl/data_preparation/model.py index 44543dfe..f999368d 100644 --- a/meta_automl/data_preparation/model.py +++ b/meta_automl/data_preparation/model.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import Any -from fedot.core.optimisers.fitness import Fitness +from 
golem.core.optimisers.fitness import Fitness from meta_automl.data_preparation.dataset import DatasetCache diff --git a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py index b5fe2abf..bb66c3aa 100644 --- a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py +++ b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py @@ -4,7 +4,6 @@ import numpy as np from fedot.core.data.data import InputData -from fedot.core.log import default_log from fedot.core.optimisers.objective import PipelineObjectiveEvaluate from fedot.core.optimisers.objective.metrics_objective import MetricsObjective from fedot.core.pipelines.pipeline import Pipeline @@ -12,6 +11,7 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.core.validation.split import tabular_cv_generator +from golem.core.log import default_log from tqdm import tqdm from meta_automl.data_preparation.data_manager import PathType diff --git a/requirements.txt b/requirements.txt index 17db3011d9c2832987c96b696a85b52c145131e1..1f48eb66a4e24225c52ee8361e241051939a511e 100644 GIT binary patch delta 38 pcmbQiw2f(l1fyIDLk2@CLpnn~Lk^J6Wv~T80|q??BOo?l005_?2KN8} delta 7 OcmdnSG=ph_1S0?nP68YN From 245865421caadbb29027df0dbebd90d11547e1be Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 18:07:22 +0300 Subject: [PATCH 03/60] create Dockerfile --- Dockerfile | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..e17e17cd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +# Download base image ubuntu 20.04 +FROM ubuntu:20.04 + +# For apt to be noninteractive +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +# Preseed tzdata, update package index, upgrade 
packages and install needed software +RUN truncate -s0 /tmp/preseed.cfg; \ + echo "tzdata tzdata/Areas select Europe" >> /tmp/preseed.cfg; \ + echo "tzdata tzdata/Zones/Europe select Berlin" >> /tmp/preseed.cfg; \ + debconf-set-selections /tmp/preseed.cfg && \ + rm -f /etc/timezone /etc/localtime && \ + apt-get update && \ + apt-get install -y nano && \ + apt-get install -y mc && \ + apt-get install -y python3.9 python3-pip && \ + apt-get install -y git && \ + rm -rf /var/lib/apt/lists/* + +# Set the workdir +ENV WORKDIR /home/meta-automl-research +WORKDIR $WORKDIR +COPY . $WORKDIR + +RUN pip3 install pip && \ + pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt + +ENV PYTHONPATH $WORKDIR From e8fee3014ed570ec39d672d1d95c7953cf26c196 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 22:06:22 +0300 Subject: [PATCH 04/60] prepare experiment demo --- experiments/fedot_warm_start/run.py | 81 +++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 experiments/fedot_warm_start/run.py diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py new file mode 100644 index 00000000..f0342126 --- /dev/null +++ b/experiments/fedot_warm_start/run.py @@ -0,0 +1,81 @@ +import functools +import timeit + +import openml +import pandas as pd +from fedot.api.main import Fedot +from sklearn.model_selection import train_test_split + +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor + +SEED = 42 + + +def prepare_data(): + dataset_ids = pd.Series(openml.study.get_suite(99).data) + dataset_ids = dataset_ids.sample(n=15, random_state=SEED) + dataset_ids = list(dataset_ids) + return OpenMLDatasetsLoader().load(dataset_ids) + + +def timeit_decorator(function): + @functools.wraps(function) + def wrapped(*args, **kwargs): + start_time = timeit.default_timer() + res = function(*args, **kwargs) + 
time = timeit.default_timer() - start_time + return res, time + + return wrapped + + +def main(): + datasets_cache = prepare_data() + datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED) + + # TODO: + # - Extract meta-features for train datasets + # - Fit 'DatasetsSimilarityAssessor' + + results_pre = [] + for cache in datasets_train: + data = cache.from_cache() + fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED) + _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y) + results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time}) + + # TODO: + # - Prepare 'ModelAdvisor' + + results = [] + for cache in datasets_test: + data = cache.from_cache() + fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED) + _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y) + + time_start = timeit.default_timer() + # TODO: + # - Extract meta-features for current test dataset + # - Get suitable assumptions from 'ModelAdvisor' + initial_assumption = ... 
+ fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption) + automl_time_meta = timeit.default_timer() - time_start + + metrics_naive = fedot_naive.get_metrics() + metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} + metrics_meta = fedot_meta.get_metrics() + metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()} + + results.append({ + 'dataset': data.name, + 'model_naive': fedot_naive, + 'model_meta': fedot_meta, + 'automl_time_naive': automl_time_naive, + 'automl_time_meta': automl_time_meta, + **metrics_naive, **metrics_meta + }) + + +if __name__ == "__main__": + main() From 5fb00f0f760947739c6d4a8bef84a877f05c2df6 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 3 Mar 2023 11:24:52 +0300 Subject: [PATCH 05/60] adapt to FEDOT 0.7.0 again --- .../advise_models_from_similar_datasets.py | 8 ++++---- .../model_advisors/diverse_fedot_pipeline_advisor.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index f7d583c5..5d948e0b 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -21,13 +21,13 @@ def main(): # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) y_train = x_train.index - assessor = KNNSimilarityAssessor({'n_neighbors': 2}, n_best=2) + assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2) assessor.fit(x_train, y_train) # Define best models for datasets. 
best_pipelines = [ - PipelineBuilder().add_node('scaling').add_node('rf').to_pipeline(), - PipelineBuilder().add_node('normalization').add_node('logit').to_pipeline(), - PipelineBuilder().add_node('rf').add_node('logit').to_pipeline() + PipelineBuilder().add_node('scaling').add_node('rf').build(), + PipelineBuilder().add_node('normalization').add_node('logit').build(), + PipelineBuilder().add_node('rf').add_node('logit').build() ] best_models = [[Model(pipeline, SingleObjFitness(1), DatasetCache(dataset_name))] for dataset_name, pipeline in zip(y_train, best_pipelines)] diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py index aa91e0db..15ef1f57 100644 --- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py @@ -1,7 +1,7 @@ from typing import Callable, List, Iterable -from fedot.core.dag.linked_graph import get_distance_between from fedot.core.pipelines.pipeline import Pipeline +from golem.core.dag.linked_graph import get_distance_between from meta_automl.data_preparation.model import Model from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor From 310a5788baad24a360f838c8d1bd9f743dd95952 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 3 Mar 2023 17:40:35 +0300 Subject: [PATCH 06/60] fix similarity assessors --- .../select_similar_datasets_by_knn.py | 6 +- .../advise_models_from_similar_datasets.py | 4 +- .../datasets_similarity_assessors/__init__.py | 2 +- .../model_based_similarity_assessors.py | 51 ++++++++++++++++ .../predict_proba_similarity_assessors.py | 59 ------------------- 5 files changed, 57 insertions(+), 65 deletions(-) create mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py delete mode 100644 
meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py index dc1c190c..b6f2bb8c 100644 --- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py +++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py @@ -2,7 +2,7 @@ from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor def main(): @@ -16,10 +16,10 @@ def main(): # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) y_train = x_train.index - assessor = KNNSimilarityAssessor({'n_neighbors': 1}, n_best=2) + assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3) assessor.fit(x_train, y_train) # Get models for the best fitting datasets from train. 
- return x_test.index, assessor.predict(x_test) + return x_test.index, assessor.predict(x_test, return_distance=True) if __name__ == '__main__': diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index 5d948e0b..d10dad85 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -6,7 +6,7 @@ from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor @@ -21,7 +21,7 @@ def main(): # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) y_train = x_train.index - assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2) + assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2) assessor.fit(x_train, y_train) # Define best models for datasets. 
best_pipelines = [ diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py index 621a68e0..0c33e2c4 100644 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py @@ -1,2 +1,2 @@ from .datasets_similarity_assessor import DatasetsSimilarityAssessor -from .predict_proba_similarity_assessors import KNNSimilarityAssessor, PredictProbaSimilarityAssessor +from .model_based_similarity_assessors import KNeighborsBasedSimilarityAssessor, ModelBasedSimilarityAssessor diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py new file mode 100644 index 00000000..09720a1e --- /dev/null +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py @@ -0,0 +1,51 @@ +from abc import ABC +from typing import Optional, Dict, Any, List, Iterable + +import numpy as np +import pandas as pd +from sklearn.neighbors import NearestNeighbors + +from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ + DatasetsSimilarityAssessor + + +class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor): + def __init__(self, model, n_best: int = 1): + self._inner_model = model + self.n_best = n_best + self._datasets: Optional[Iterable[str]] = None + + +class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor): + def __init__(self, n_neighbors: int = 1, **model_params): + model = NearestNeighbors(n_neighbors=n_neighbors, **model_params) + super().__init__(model, n_neighbors) + + def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): + meta_features = self.preprocess_meta_features(meta_features) + self._datasets = np.array(datasets) + 
self._inner_model.fit(meta_features) + + @staticmethod + def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: + return meta_features.dropna(axis=1, how='any') + + def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]: + dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance) + if return_distance: + distances, dataset_indexes = dataset_indexes + dataset_names = np.take(self._datasets, dataset_indexes, axis=0) + return distances, dataset_names + else: + return np.take(self._datasets, dataset_indexes, axis=0) + + @property + def datasets(self) -> Optional[Iterable[str]]: + return self._datasets + + @property + def feature_names(self) -> List[str]: + return self._inner_model.feature_names_in_ + + def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame: + return meta_features[self.feature_names] diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py deleted file mode 100644 index 8254c745..00000000 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Optional, Dict, Any, List, Iterable - -import numpy as np -import pandas as pd -from sklearn.neighbors import KNeighborsClassifier - -from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ - DatasetsSimilarityAssessor - - -class PredictProbaSimilarityAssessor(DatasetsSimilarityAssessor): - def __init__(self, model, n_best: int = 1): - self._inner_model = model - self.n_best = n_best - - @property - def datasets(self) -> List[str]: - return self._inner_model.classes_ - - @property - def feature_names(self) -> List[str]: - return self._inner_model.feature_names_in_ - - @staticmethod - def 
preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: - return meta_features.dropna(axis=1, how='any') - - def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame: - return meta_features[self.feature_names] - - def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): - meta_features = self.preprocess_meta_features(meta_features) - self._inner_model.fit(meta_features, datasets) - - def predict_proba(self, meta_features: pd.DataFrame) -> List[List[float]]: - return self._inner_model.predict_proba(meta_features) - - def predict(self, meta_features: pd.DataFrame) -> List[List[str]]: - meta_features = self._preprocess_predict_features(meta_features) - predict_probs = self.predict_proba(meta_features) - final_prediction = [] - for probabilities in predict_probs: - probabilities = list(probabilities) - predictions = [] - for _ in range(self.n_best): - predicted_class_idx = np.argmax(probabilities) - predicted_class = self.datasets[predicted_class_idx] - predictions.append(predicted_class) - probabilities.pop(predicted_class_idx) - final_prediction.append(predictions) - - return final_prediction - - -class KNNSimilarityAssessor(PredictProbaSimilarityAssessor): - def __init__(self, model_params: Optional[Dict[str, Any]] = None, n_best: int = 1): - model_params = model_params or dict() - model = KNeighborsClassifier(**model_params) - super().__init__(model, n_best) From ae9c9098959fb4bb00f5f9b958309ca2ba118712 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sat, 4 Mar 2023 13:32:47 +0300 Subject: [PATCH 07/60] allow PymfeExtractor fill values with median --- .../meta_features_extractors/pymfe_extractor.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 2848998e..3a379f6f 100644 --- 
a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -25,7 +25,7 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame: + def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() load_dataset = self.datasets_loader.cache_to_memory @@ -37,10 +37,21 @@ def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame: else: loaded_dataset = load_dataset(dataset) cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] - mfe = self._extractor.fit(loaded_dataset.x, loaded_dataset.y, cat_cols=cat_cols) + x = loaded_dataset.x + y = loaded_dataset.y + if fill_nans: + x = self.fill_nans(x) + mfe = self._extractor.fit(x, y, cat_cols=cat_cols) feature_names, dataset_features = mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) self._update_meta_features_cache(dataset.name, mfs) meta_features[dataset.name] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features + + @staticmethod + def fill_nans(x): + if not isinstance(x, pd.DataFrame): + x = pd.DataFrame(x) + x = x.fillna(x.median()) + return x.to_numpy() From 60dc77ad0b50c33bd5537647db7ce991df0c7917 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 24 Mar 2023 14:10:48 +0300 Subject: [PATCH 08/60] use FEDOT version with fixed initial assumptions --- requirements.txt | Bin 310 -> 460 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1f48eb66a4e24225c52ee8361e241051939a511e..4785a7e31c157dd88d70c0da7317bb52f92cfde0 100644 GIT binary patch delta 185 
zcmXAjOA5k35JaEm6fRu36q25h#19BA)U6vY5Mw@qA%f%DGr92;o=Ght%~VfUJ(_+E z-@)VisX61B(nvj5ZY+sZf4||*k&uFpR&Fl3)pY6oh#gxt>^ZO&SCEllob()(dO|9V q)vNyR Date: Thu, 30 Mar 2023 13:47:06 +0300 Subject: [PATCH 09/60] optional cache usage for MFE extractor --- .../meta_features_extractors/pymfe_extractor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 3a379f6f..36cb9d45 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -25,26 +25,30 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame: + def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False, + use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() load_dataset = self.datasets_loader.cache_to_memory for dataset in datasets: if isinstance(dataset, str): dataset = DatasetCache(dataset) - if mfs := self._get_meta_features_cache(dataset.name, meta_feature_names): + + if (use_cached and + (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))): meta_features[dataset.name] = mfs else: loaded_dataset = load_dataset(dataset) cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] x = loaded_dataset.x y = loaded_dataset.y - if fill_nans: + if fill_input_nans: x = self.fill_nans(x) mfe = self._extractor.fit(x, y, cat_cols=cat_cols) feature_names, dataset_features = mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) - self._update_meta_features_cache(dataset.name, 
mfs) + if update_cached: + self._update_meta_features_cache(dataset.name, mfs) meta_features[dataset.name] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features From a5a0c8abf96729de915da5a69d1cfc89aba25cca Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 30 Mar 2023 13:47:54 +0300 Subject: [PATCH 10/60] allow to advise only the n best models --- .../model_advisors/diverse_fedot_pipeline_advisor.py | 8 +++++++- .../meta_algorithm/model_advisors/model_advisor.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py index 15ef1f57..6f7e4a66 100644 --- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Iterable +from typing import Callable, List, Iterable, Optional from fedot.core.pipelines.pipeline import Pipeline from golem.core.dag.linked_graph import get_distance_between @@ -11,10 +11,12 @@ class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor, + n_best_to_advise: Optional[int] = None, minimal_distance: int = 1, distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between): super().__init__(fitted_similarity_assessor) self.minimal_distance = minimal_distance + self.n_best_to_advise = n_best_to_advise self.distance_func = distance_func def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]: @@ -24,4 +26,8 @@ def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]: for model in dataset_advice[1:]: if self.distance_func(first_model.predictor, model.predictor) > self.minimal_distance: diverse_dataset_advice.append(model) + + if self.n_best_to_advise is not 
None: + diverse_dataset_advice = list(sorted(diverse_dataset_advice, key=lambda m: m.fitness, reverse=True)) + diverse_dataset_advice = diverse_dataset_advice[:self.n_best_to_advise] return diverse_dataset_advice diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py index b585bf27..a9ca0d97 100644 --- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import List, Dict, Iterable +from typing import List, Dict, Iterable, Optional import pandas as pd From 3bfaf5010f968f23dc98b3121948598a66626de2 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 30 Mar 2023 13:58:49 +0300 Subject: [PATCH 11/60] finalize experiment --- experiments/fedot_warm_start/run.py | 145 +++++++++++++++++++++++----- 1 file changed, 119 insertions(+), 26 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index f0342126..be5f45f7 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,22 +1,58 @@ import functools import timeit +from datetime import datetime +from itertools import chain +from pathlib import Path +from typing import Dict +import numpy as np import openml import pandas as pd from fedot.api.main import Fedot +from fedot.core.pipelines.adapters import PipelineAdapter from sklearn.model_selection import train_test_split +from tqdm import tqdm +from meta_automl.data_preparation.dataset import DatasetCache, Dataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor +from meta_automl.data_preparation.model import Model +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor +from meta_automl.meta_algorithm.model_advisors import 
DiverseFEDOTPipelineAdvisor +# Meta-alg hyperparameters SEED = 42 - - -def prepare_data(): - dataset_ids = pd.Series(openml.study.get_suite(99).data) - dataset_ids = dataset_ids.sample(n=15, random_state=SEED) +# Datasets sampling +N_DATASETS = None +TEST_SIZE = 0.33 +# Evaluation timeouts +TRAIN_TIMEOUT = 15 +TEST_TIMEOUT = 10 +# Models & datasets +N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 +N_CLOSEST_DATASETS_TO_PROPOSE = 5 +MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1 +N_BEST_MODELS_TO_ADVISE = 5 + + +COMMON_FEDOT_PARAMS = dict( + problem='classification', + with_tuning=False, + logging_level=50, + n_jobs=-1, + seed=SEED, +) + + +def prepare_data() -> Dict[str, DatasetCache]: + """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" + + dataset_ids = openml.study.get_suite(99).data + if N_DATASETS is not None: + dataset_ids = pd.Series(dataset_ids) + dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) dataset_ids = list(dataset_ids) - return OpenMLDatasetsLoader().load(dataset_ids) + return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} def timeit_decorator(function): @@ -30,37 +66,80 @@ def wrapped(*args, **kwargs): return wrapped +def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): + x = data.x + y = data.y + if len(y.shape) == 1: + y = y.reshape(-1, 1) + return x, y + + def main(): datasets_cache = prepare_data() - datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED) + datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()), + test_size=TEST_SIZE, random_state=SEED) - # TODO: - # - Extract meta-features for train datasets - # - Fit 'DatasetsSimilarityAssessor' + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features_train = extractor.extract(datasets_train, fill_input_nans=True) + meta_features_train = meta_features_train.fillna(0) + data_similarity_assessor = 
KNeighborsBasedSimilarityAssessor( + n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE)) + data_similarity_assessor.fit(meta_features_train, datasets_train) results_pre = [] - for cache in datasets_train: + best_models_per_dataset = {} + for name in tqdm(datasets_train, 'Train datasets'): + cache = datasets_cache[name] data = cache.from_cache() - fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED) - _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y) - results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time}) - # TODO: - # - Prepare 'ModelAdvisor' + fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS) + x, y = transform_data_for_fedot(data) + _, automl_time = timeit_decorator(fedot.fit)(x, y) + results_pre.append({'dataset': name, + 'model': fedot.current_pipeline.descriptive_id, + 'automl_time': automl_time}) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + + # Filter out unique individuals with the best fitness + best_individuals = sorted(chain(*fedot.history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + # best_models = list(fedot.best_models) or [] + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, cache) + best_models.append(model) + best_models_per_dataset[name] = best_models + + model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, + minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) + model_advisor.fit(best_models_per_dataset) results = [] - for cache in datasets_test: + for name in tqdm(datasets_test, 'Test datasets'): + cache = datasets_cache[name] data 
= cache.from_cache() - fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED) - _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y) + x, y = transform_data_for_fedot(data) + + fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS) + _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y) + fedot_naive.test_data = fedot_naive.train_data + fedot_naive.prediction = fedot_naive.train_data time_start = timeit.default_timer() - # TODO: - # - Extract meta-features for current test dataset - # - Get suitable assumptions from 'ModelAdvisor' - initial_assumption = ... - fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption) + meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + initial_assumptions = model_advisor.predict(meta_features)[0] + initial_assumptions = [model.predictor for model in initial_assumptions] + fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS) + fedot_meta.fit(x, y) automl_time_meta = timeit.default_timer() - time_start + fedot_meta.test_data = fedot_meta.train_data + fedot_meta.prediction = fedot_meta.train_data metrics_naive = fedot_naive.get_metrics() metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} @@ -69,13 +148,27 @@ def main(): results.append({ 'dataset': data.name, - 'model_naive': fedot_naive, - 'model_meta': fedot_meta, + 'model_naive': fedot_naive.current_pipeline.descriptive_id, + 'model_meta': fedot_meta.current_pipeline.descriptive_id, + 'history_naive': fedot_naive.history, + 'history_meta': fedot_meta.history, 'automl_time_naive': automl_time_naive, 'automl_time_meta': automl_time_meta, **metrics_naive, **metrics_meta }) + time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".") + save_dir = Path(f'run_{time_now}') + save_dir.mkdir() + 
history_dir = save_dir.joinpath('histories') + history_dir.mkdir() + for res in results: + dataset = res['dataset'] + res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json')) + res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json')) + pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv')) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv')) + if __name__ == "__main__": main() From 75ea275a8f446cefdd5ec654398d31c21a30ad54 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 7 Apr 2023 21:08:49 +0300 Subject: [PATCH 12/60] finalize experiment [2] --- experiments/fedot_warm_start/run.py | 224 +++++++++++++++++++--------- requirements.txt | Bin 460 -> 460 bytes 2 files changed, 155 insertions(+), 69 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index be5f45f7..6e043d55 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,16 +1,20 @@ import functools +import json import timeit from datetime import datetime from itertools import chain from pathlib import Path -from typing import Dict +from typing import Dict, List, Tuple import numpy as np import openml import pandas as pd from fedot.api.main import Fedot +from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate from fedot.core.pipelines.adapters import PipelineAdapter -from sklearn.model_selection import train_test_split +from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from fedot.core.validation.split import tabular_cv_generator +from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm from meta_automl.data_preparation.dataset import DatasetCache, Dataset @@ -24,27 +28,28 @@ SEED = 42 # Datasets sampling N_DATASETS = None -TEST_SIZE = 0.33 +TEST_SIZE = 0.2 # Evaluation timeouts -TRAIN_TIMEOUT = 15 -TEST_TIMEOUT = 10 +TRAIN_TIMEOUT = 5 
+TEST_TIMEOUT = 5 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1 N_BEST_MODELS_TO_ADVISE = 5 - +# Meta-features +MF_EXTRACTOR_PARAMS = {'groups': 'general'} COMMON_FEDOT_PARAMS = dict( problem='classification', - with_tuning=False, logging_level=50, n_jobs=-1, seed=SEED, + show_progress=False, ) -def prepare_data() -> Dict[str, DatasetCache]: +def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data @@ -52,18 +57,7 @@ def prepare_data() -> Dict[str, DatasetCache]: dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) dataset_ids = list(dataset_ids) - return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} - - -def timeit_decorator(function): - @functools.wraps(function) - def wrapped(*args, **kwargs): - start_time = timeit.default_timer() - res = function(*args, **kwargs) - time = timeit.default_timer() - start_time - return res, time - - return wrapped + return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): @@ -74,40 +68,99 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): return x, y -def main(): - datasets_cache = prepare_data() - datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()), - test_size=TEST_SIZE, random_state=SEED) +def get_pipeline_metrics(pipeline, + input_data, + metrics_obj) -> dict: + """Gets quality metrics for the fitted pipeline. 
+ The function is based on `Fedot.get_metrics()` + + Returns: + the values of quality metrics + """ + metrics = metrics_obj.metric_functions + metric_names = metrics_obj.get_metric_names(metrics) + + data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold) + + objective = MetricsObjective(metrics) + obj_eval = PipelineObjectiveEvaluate(objective=objective, + data_producer=data_producer, + eval_n_jobs=-1) - extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + metrics = obj_eval.evaluate(pipeline).values + metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)} + + return metrics + + +def prepare_extractor_and_assessor(datasets_train: List[str]): + extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS) meta_features_train = extractor.extract(datasets_train, fill_input_nans=True) meta_features_train = meta_features_train.fillna(0) data_similarity_assessor = KNeighborsBasedSimilarityAssessor( n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE)) data_similarity_assessor.fit(meta_features_train, datasets_train) + return data_similarity_assessor, extractor + + +def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None): + x, y = transform_data_for_fedot(data) + + time_start = timeit.default_timer() + fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) + fedot.fit(x, y) + automl_time = timeit.default_timer() - time_start + + metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics) + pipeline = fedot.current_pipeline + run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, + automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) + return fedot, run_results - results_pre = [] + +def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., 
automl_timeout_min=0., + **metrics): + run_results = dict(dataset_id=dataset.id, + dataset_name=dataset.name, + run_label=run_label, + model_obj=pipeline, + model_str=pipeline.descriptive_id, + history_obj=history_obj, + automl_time_sec=automl_time_sec, + automl_timeout_min=automl_timeout_min, + **metrics) + return run_results + + +def main(): + baseline_pipeline = PipelineBuilder().add_node('rf').build() + + dataset_ids, datasets_cache = prepare_data() + + datasets_train, datasets_test = \ + train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) + + data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) + + results = [] best_models_per_dataset = {} for name in tqdm(datasets_train, 'Train datasets'): cache = datasets_cache[name] data = cache.from_cache() - fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS) - x, y = transform_data_for_fedot(data) - _, automl_time = timeit_decorator(fedot.fit)(x, y) - results_pre.append({'dataset': name, - 'model': fedot.current_pipeline.descriptive_id, - 'automl_time': automl_time}) + fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run # Filter out unique individuals with the best fitness + history = fedot.history best_individuals = sorted(chain(*fedot.history.individuals), key=lambda ind: ind.fitness, reverse=True) best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - # best_models = list(fedot.best_models) or [] best_models = [] for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: pipeline = PipelineAdapter().restore(individual.graph) @@ -119,55 +172,88 @@ def main(): 
minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - results = [] for name in tqdm(datasets_test, 'Test datasets'): cache = datasets_cache[name] data = cache.from_cache() - x, y = transform_data_for_fedot(data) - fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS) - _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y) - fedot_naive.test_data = fedot_naive.train_data - fedot_naive.prediction = fedot_naive.train_data + # Run pure AutoML + fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') + results.append(fedot_naive_results) + # Run meta AutoML + # 1 time_start = timeit.default_timer() meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) meta_features = meta_features.fillna(0) + meta_learning_time = timeit.default_timer() - time_start initial_assumptions = model_advisor.predict(meta_features)[0] - initial_assumptions = [model.predictor for model in initial_assumptions] - fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS) - fedot_meta.fit(x, y) - automl_time_meta = timeit.default_timer() - time_start - fedot_meta.test_data = fedot_meta.train_data - fedot_meta.prediction = fedot_meta.train_data - - metrics_naive = fedot_naive.get_metrics() - metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} - metrics_meta = fedot_meta.get_metrics() - metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()} - - results.append({ - 'dataset': data.name, - 'model_naive': fedot_naive.current_pipeline.descriptive_id, - 'model_meta': fedot_meta.current_pipeline.descriptive_id, - 'history_naive': fedot_naive.history, - 'history_meta': fedot_meta.history, - 'automl_time_naive': automl_time_naive, - 'automl_time_meta': automl_time_meta, - **metrics_naive, **metrics_meta - }) - - time_now = 
datetime.now().isoformat(timespec="minutes").replace(":", ".") - save_dir = Path(f'run_{time_now}') + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time'] = meta_learning_time + results.append(fedot_meta_results) + + # Fit & evaluate simple baseline + baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) + baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, + **baseline_metrics) + results.append(baseline_res) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics) + assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', + pipeline=assumption.predictor, **assumption_metrics) + results.append(assumption_res) + + # Save the accumulated results + time_now = datetime.now().isoformat(timespec="minutes") + time_now_for_path = time_now.replace(":", ".") + save_dir = Path(f'run_{time_now_for_path}') save_dir.mkdir() history_dir = save_dir.joinpath('histories') history_dir.mkdir() + models_dir = save_dir.joinpath('models') for res in results: - dataset = res['dataset'] - res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json')) - res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json')) - pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv')) - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv')) + res['run_date'] = time_now + dataset_name = res['dataset_name'] + run_label = res['run_label'] + # define saving paths + model_path = 
models_dir.joinpath(f'{dataset_name}_{run_label}') + history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') + # replace objects with export paths for csv + res['model_path'] = str(model_path) + res.pop('model_obj').save(res['model_path']) + res['history_path'] = str(history_path) + history_obj = res.pop('history_obj') + if history_obj is not None: + history_obj.save(res['history_path']) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv')) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) + + # save experiment hyperparameters + params = { + 'run_date': time_now, + 'seed': SEED, + 'n_datasets': N_DATASETS or len(dataset_ids), + 'test_size': TEST_SIZE, + 'dataset_ids': dataset_ids, + 'dataset_names': list(datasets_cache.keys()), + 'dataset_names_train': datasets_train, + 'dataset_names_test': datasets_test, + 'train_timeout': TRAIN_TIMEOUT, + 'test_timeout': TEST_TIMEOUT, + 'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE, + 'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE, + 'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS, + 'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE, + 'common_fedot_params': COMMON_FEDOT_PARAMS, + 'baseline_pipeline': baseline_pipeline.descriptive_id, + } + with open(save_dir.joinpath('parameters.json'), 'w') as params_file: + json.dump(params, params_file, indent=2) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 4785a7e31c157dd88d70c0da7317bb52f92cfde0..106d90f8a77d6ca43c6b65218930c14f0f7924ec 100644 GIT binary patch delta 90 zcmX@Ze1>^~kG} Z0C5VCXAabt3dCkWHI_h`q>Zx;83C?M4l4iv delta 90 zcmX@Ze1>^~kAD(FGJ_d|34;Mc3J@DIqycFY22&tu1f91w$fGG?~F1ti}K+ YYXnqp2IN@+VIol60I0@b<7`7l0Lz9BW&i*H From 8f29cf72de174fd810d35ced81df8ec1cf76ba70 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sat, 8 Apr 2023 13:14:10 +0300 Subject: [PATCH 13/60] wrap & log 
exceptions; log progress to file --- experiments/fedot_warm_start/run.py | 193 ++++++++++++++++------------ 1 file changed, 111 insertions(+), 82 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 6e043d55..3e5a2a28 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,5 +1,6 @@ import functools import json +import logging import timeit from datetime import datetime from itertools import chain @@ -14,6 +15,7 @@ from fedot.core.pipelines.adapters import PipelineAdapter from fedot.core.pipelines.pipeline_builder import PipelineBuilder from fedot.core.validation.split import tabular_cv_generator +from golem.core.log import Log from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm @@ -27,11 +29,11 @@ # Meta-alg hyperparameters SEED = 42 # Datasets sampling -N_DATASETS = None -TEST_SIZE = 0.2 +N_DATASETS = 3 +TEST_SIZE = 0.33 # Evaluation timeouts -TRAIN_TIMEOUT = 5 -TEST_TIMEOUT = 5 +TRAIN_TIMEOUT = 0.5 +TEST_TIMEOUT = 0.5 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 @@ -42,12 +44,25 @@ COMMON_FEDOT_PARAMS = dict( problem='classification', - logging_level=50, n_jobs=-1, seed=SEED, show_progress=False, ) +# Setup logging +time_now = datetime.now().isoformat(timespec="minutes") +time_now_for_path = time_now.replace(":", ".") +save_dir = Path(f'run_{time_now_for_path}') +save_dir.mkdir() +log_file = save_dir.joinpath('log.txt') +Log(log_file=log_file) +logging.basicConfig(filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, + ) + def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" @@ -132,6 +147,19 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut return run_results +def 
extract_best_history_models(dataset_cache, history): + best_individuals = sorted(chain(*history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, dataset_cache) + best_models.append(model) + return best_models + + def main(): baseline_pipeline = PipelineBuilder().add_node('rf').build() @@ -144,93 +172,91 @@ def main(): results = [] best_models_per_dataset = {} - for name in tqdm(datasets_train, 'Train datasets'): - cache = datasets_cache[name] - data = cache.from_cache() - - fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') - results.append(run_results) - # TODO: - # x Turn the tuned pipeline into a model (evaluate its fitness on the data) - # x Evaluate historical pipelines on the data instead of using fitness - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - - # Filter out unique individuals with the best fitness - history = fedot.history - best_individuals = sorted(chain(*fedot.history.individuals), - key=lambda ind: ind.fitness, - reverse=True) - best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_models = [] - for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: - pipeline = PipelineAdapter().restore(individual.graph) - model = Model(pipeline, individual.fitness, cache) - best_models.append(model) - best_models_per_dataset[name] = best_models + progress_file = open(save_dir.joinpath('progress.txt'), 'a') + for name in tqdm(datasets_train, 'Train datasets', file=progress_file): + try: + cache = datasets_cache[name] + data = cache.from_cache() + + fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + 
results.append(run_results) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + + # Filter out unique individuals with the best fitness + history = fedot.history + best_models = extract_best_history_models(cache, history) + best_models_per_dataset[name] = best_models + except Exception: + logging.exception(f'Train dataset "{name}"') model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - for name in tqdm(datasets_test, 'Test datasets'): - cache = datasets_cache[name] - data = cache.from_cache() - - # Run pure AutoML - fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') - results.append(fedot_naive_results) - - # Run meta AutoML - # 1 - time_start = timeit.default_timer() - meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) - meta_features = meta_features.fillna(0) - meta_learning_time = timeit.default_timer() - time_start - initial_assumptions = model_advisor.predict(meta_features)[0] - assumption_pipelines = [model.predictor for model in initial_assumptions] - # 2 - fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', - initial_assumption=assumption_pipelines) - fedot_meta_results['meta_learning_time'] = meta_learning_time - results.append(fedot_meta_results) - - # Fit & evaluate simple baseline - baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) - baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, - **baseline_metrics) - results.append(baseline_res) - - # Fit & evaluate 
initial assumptions - for i, assumption in enumerate(initial_assumptions): - pipeline = assumption.predictor - assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics) - assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', - pipeline=assumption.predictor, **assumption_metrics) - results.append(assumption_res) + for name in tqdm(datasets_test, 'Test datasets', file=progress_file): + try: + cache = datasets_cache[name] + data = cache.from_cache() + + # Run pure AutoML + fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') + results.append(fedot_naive_results) + + # Run meta AutoML + # 1 + time_start = timeit.default_timer() + meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + meta_learning_time = timeit.default_timer() - time_start + initial_assumptions = model_advisor.predict(meta_features)[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time'] = meta_learning_time + results.append(fedot_meta_results) + + # Fit & evaluate simple baseline + baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) + baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, + **baseline_metrics) + results.append(baseline_res) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, fedot_meta.metrics) + assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', 
+ pipeline=pipeline, **assumption_metrics) + results.append(assumption_res) + except Exception: + logging.exception(f'Test dataset "{name}"') # Save the accumulated results - time_now = datetime.now().isoformat(timespec="minutes") - time_now_for_path = time_now.replace(":", ".") - save_dir = Path(f'run_{time_now_for_path}') - save_dir.mkdir() history_dir = save_dir.joinpath('histories') history_dir.mkdir() models_dir = save_dir.joinpath('models') for res in results: - res['run_date'] = time_now - dataset_name = res['dataset_name'] - run_label = res['run_label'] - # define saving paths - model_path = models_dir.joinpath(f'{dataset_name}_{run_label}') - history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') - # replace objects with export paths for csv - res['model_path'] = str(model_path) - res.pop('model_obj').save(res['model_path']) - res['history_path'] = str(history_path) - history_obj = res.pop('history_obj') - if history_obj is not None: - history_obj.save(res['history_path']) - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv')) + try: + res['run_date'] = time_now + dataset_name = res['dataset_name'] + run_label = res['run_label'] + # define saving paths + model_path = models_dir.joinpath(f'{dataset_name}_{run_label}') + history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') + # replace objects with export paths for csv + res['model_path'] = str(model_path) + res.pop('model_obj').save(res['model_path']) + res['history_path'] = str(history_path) + history_obj = res.pop('history_obj') + if history_obj is not None: + history_obj.save(res['history_path']) + except Exception: + logging.exception(f'Saving results "{res}"') + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) # save experiment hyperparameters @@ -257,4 +283,7 @@ def main(): if __name__ == "__main__": - main() + try: + main() + except Exception: + logging.exception(f'Main level 
cached the error') From 168a4dd68be5a47879bdeb942334e5c2490cc8bd Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sat, 8 Apr 2023 13:14:20 +0300 Subject: [PATCH 14/60] update requirements.txt --- requirements.txt | Bin 460 -> 460 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index 106d90f8a77d6ca43c6b65218930c14f0f7924ec..ad0a22332f176f2c866188116575624428ac1536 100644 GIT binary patch delta 43 rcmX@Ze1>@g6CMKB=Z?fSxv#BD;bpm#pwvS delta 43 vcmX@Ze1>@g6C Date: Sat, 8 Apr 2023 13:16:28 +0300 Subject: [PATCH 15/60] update timeouts --- experiments/fedot_warm_start/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 3e5a2a28..3113ad97 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -29,11 +29,11 @@ # Meta-alg hyperparameters SEED = 42 # Datasets sampling -N_DATASETS = 3 -TEST_SIZE = 0.33 +N_DATASETS = None +TEST_SIZE = 0.2 # Evaluation timeouts -TRAIN_TIMEOUT = 0.5 -TEST_TIMEOUT = 0.5 +TRAIN_TIMEOUT = 15 +TEST_TIMEOUT = 15 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 From cc71c4745077aadeec07af66b9b6feeb5a151d19 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 18 Apr 2023 18:02:21 +0300 Subject: [PATCH 16/60] remove GOLEM from requirements.txt to inherit version required by FEDOT --- requirements.txt | Bin 460 -> 430 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad0a22332f176f2c866188116575624428ac1536..eca13d853ca1f8e55c583bd3790a78a679ffee4d 100644 GIT binary patch delta 7 OcmX@ZypDOpIz|8ti~`^Q delta 38 pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006 Date: Tue, 18 Apr 2023 18:04:02 +0300 Subject: [PATCH 17/60] clean openml cache --- .../datasets_loaders/openml_datasets_loader.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) 
diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index f22bb0c6..01584c23 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -1,5 +1,7 @@ from __future__ import annotations +import shutil +from pathlib import Path from typing import List, Union import openml @@ -10,6 +12,12 @@ OpenMLDatasetID = Union[str, int] +def _clear_openml_cache(): + cache_dir = openml.config.get_cache_directory() + cache_dir = Path(cache_dir) + shutil.rmtree(cache_dir) + + class OpenMLDatasetsLoader(DatasetsLoader): def __init__(self): @@ -27,7 +35,10 @@ def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]: return datasets def load_single(self, source: OpenMLDatasetID): - return self.get_openml_dataset(source) + try: + return self.get_openml_dataset(source) + finally: + _clear_openml_cache() def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache: openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False) From a10174ce62b5d2fe4e3ab85ee5e38546c6da4047 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 18 Apr 2023 18:04:35 +0300 Subject: [PATCH 18/60] update Dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index e17e17cd..7958082a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ WORKDIR $WORKDIR COPY . 
$WORKDIR RUN pip3 install pip && \ + pip install wheel && \ pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt ENV PYTHONPATH $WORKDIR From a309eef64e86a559880aad5451c33b50143eab46 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 15:42:57 +0300 Subject: [PATCH 19/60] make experiment safer --- experiments/fedot_warm_start/run.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 3113ad97..66c80192 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -19,6 +19,7 @@ from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm +from meta_automl.data_preparation.data_manager import DataManager from meta_automl.data_preparation.dataset import DatasetCache, Dataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor @@ -52,7 +53,8 @@ # Setup logging time_now = datetime.now().isoformat(timespec="minutes") time_now_for_path = time_now.replace(":", ".") -save_dir = Path(f'run_{time_now_for_path}') +save_dir = DataManager.get_data_dir()\ + .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start') save_dir.mkdir() log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) @@ -168,17 +170,16 @@ def main(): datasets_train, datasets_test = \ train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) - data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) - results = [] best_models_per_dataset = {} progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for name in tqdm(datasets_train, 'Train datasets', file=progress_file): + for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file): try: cache = datasets_cache[name] data = 
cache.from_cache() - fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT + fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT') results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) @@ -192,19 +193,16 @@ def main(): except Exception: logging.exception(f'Train dataset "{name}"') + data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - for name in tqdm(datasets_test, 'Test datasets', file=progress_file): + for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file): try: cache = datasets_cache[name] data = cache.from_cache() - # Run pure AutoML - fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') - results.append(fedot_naive_results) - # Run meta AutoML # 1 time_start = timeit.default_timer() @@ -234,6 +232,7 @@ def main(): results.append(assumption_res) except Exception: logging.exception(f'Test dataset "{name}"') + progress_file.close() # Save the accumulated results history_dir = save_dir.joinpath('histories') From 066cd3e6d110a5918010795b310f07599d985d7f Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 16:38:47 +0300 Subject: [PATCH 20/60] add .dockerignore --- .dockerignore | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..2bfa6863 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +# Config & info files +.pep8speaks.yml +Dockerfile +LICENSE +README.md + +# Unnecessary files +examples +notebooks +test + +# User data +data From 
69b4915f8620c1b6753fb1a058d6839f1fe374ab Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 17:39:14 +0300 Subject: [PATCH 21/60] fix save path --- experiments/fedot_warm_start/run.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 66c80192..9bf33ccb 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -4,7 +4,6 @@ import timeit from datetime import datetime from itertools import chain -from pathlib import Path from typing import Dict, List, Tuple import numpy as np @@ -53,9 +52,9 @@ # Setup logging time_now = datetime.now().isoformat(timespec="minutes") time_now_for_path = time_now.replace(":", ".") -save_dir = DataManager.get_data_dir()\ - .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start') -save_dir.mkdir() +save_dir = DataManager.get_data_dir().\ + joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') +save_dir.mkdir(parents=True) log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) logging.basicConfig(filename=log_file, From b490f05b643cc856d9616f2f02c7700813d71cb1 Mon Sep 17 00:00:00 2001 From: max Date: Tue, 2 May 2023 03:43:03 +0300 Subject: [PATCH 22/60] Making code more reusable and qualitative --- experiments/fedot_warm_start/__init__.py | 0 experiments/fedot_warm_start/run.py | 105 ++++++++++++++--------- 2 files changed, 66 insertions(+), 39 deletions(-) create mode 100644 experiments/fedot_warm_start/__init__.py diff --git a/experiments/fedot_warm_start/__init__.py b/experiments/fedot_warm_start/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 3113ad97..4a9d8afd 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -26,6 +26,7 @@ from 
meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor + # Meta-alg hyperparameters SEED = 42 # Datasets sampling @@ -49,30 +50,41 @@ show_progress=False, ) -# Setup logging -time_now = datetime.now().isoformat(timespec="minutes") -time_now_for_path = time_now.replace(":", ".") -save_dir = Path(f'run_{time_now_for_path}') -save_dir.mkdir() -log_file = save_dir.joinpath('log.txt') -Log(log_file=log_file) -logging.basicConfig(filename=log_file, - filemode='a', - format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', - datefmt='%H:%M:%S', - force=True, - ) - - -def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: - """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" +SAVE_DIR = None +TIME_NOW = None +TIME_NOW_FOR_PATH = None + +DEBUG = False + + +def setup_logging(): + global TIME_NOW + TIME_NOW = time_now = datetime.now().isoformat(timespec="minutes") + global TIME_NOW_FOR_PATH + TIME_NOW_FOR_PATH = time_now_for_path = time_now.replace(":", ".") + global SAVE_DIR + SAVE_DIR = save_dir = Path(f'run_{time_now_for_path}') + save_dir.mkdir() + log_file = save_dir.joinpath('log.txt') + Log(log_file=log_file) + logging.basicConfig(filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, + ) + +def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]: + """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data if N_DATASETS is not None: dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) - dataset_ids = list(dataset_ids) - return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} + dataset_ids = list(dataset_ids) + + datasets = 
{cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} + return dataset_ids, datasets def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): @@ -92,6 +104,7 @@ def get_pipeline_metrics(pipeline, Returns: the values of quality metrics """ + # print(str(metrics_obj)) metrics = metrics_obj.metric_functions metric_names = metrics_obj.get_metric_names(metrics) @@ -160,22 +173,34 @@ def extract_best_history_models(dataset_cache, history): return best_models +def prepare_data() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]: + dataset_ids, datasets = fetch_openml_data() + + train_data_names, test_data_names = train_test_split( + list(datasets.keys()), + test_size=TEST_SIZE, + random_state=SEED + ) + return (dataset_ids, datasets), (train_data_names, test_data_names) + + def main(): baseline_pipeline = PipelineBuilder().add_node('rf').build() - dataset_ids, datasets_cache = prepare_data() + ds_with_ids, dataset_names = prepare_data() + + train_ds_names, test_ds_names = dataset_names - datasets_train, datasets_test = \ - train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) + ds_ids, datasets = ds_with_ids - data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) + data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names) results = [] best_models_per_dataset = {} - progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for name in tqdm(datasets_train, 'Train datasets', file=progress_file): + progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a') + for name in tqdm(train_ds_names, 'Train datasets', file=progress_file): try: - cache = datasets_cache[name] + cache = datasets[name] data = cache.from_cache() fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') @@ -196,9 +221,9 @@ def main(): minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) 
model_advisor.fit(best_models_per_dataset) - for name in tqdm(datasets_test, 'Test datasets', file=progress_file): + for name in tqdm(test_ds_names, 'Test datasets', file=progress_file): try: - cache = datasets_cache[name] + cache = datasets[name] data = cache.from_cache() # Run pure AutoML @@ -236,12 +261,12 @@ def main(): logging.exception(f'Test dataset "{name}"') # Save the accumulated results - history_dir = save_dir.joinpath('histories') + history_dir = SAVE_DIR.joinpath('histories') history_dir.mkdir() - models_dir = save_dir.joinpath('models') + models_dir = SAVE_DIR.joinpath('models') for res in results: try: - res['run_date'] = time_now + res['run_date'] = TIME_NOW dataset_name = res['dataset_name'] run_label = res['run_label'] # define saving paths @@ -257,18 +282,18 @@ def main(): except Exception: logging.exception(f'Saving results "{res}"') - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) + pd.DataFrame(results).to_csv(SAVE_DIR.joinpath(f'results_{TIME_NOW_FOR_PATH}.csv')) # save experiment hyperparameters params = { - 'run_date': time_now, + 'run_date': TIME_NOW, 'seed': SEED, - 'n_datasets': N_DATASETS or len(dataset_ids), + 'n_datasets': N_DATASETS or len(ds_ids), 'test_size': TEST_SIZE, - 'dataset_ids': dataset_ids, - 'dataset_names': list(datasets_cache.keys()), - 'dataset_names_train': datasets_train, - 'dataset_names_test': datasets_test, + 'dataset_ids': ds_ids, + 'dataset_names': list(dataset_names.keys()), + 'dataset_names_train': train_ds_names, + 'dataset_names_test': test_ds_names, 'train_timeout': TRAIN_TIMEOUT, 'test_timeout': TEST_TIMEOUT, 'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE, @@ -278,12 +303,14 @@ def main(): 'common_fedot_params': COMMON_FEDOT_PARAMS, 'baseline_pipeline': baseline_pipeline.descriptive_id, } - with open(save_dir.joinpath('parameters.json'), 'w') as params_file: + with open(SAVE_DIR.joinpath('parameters.json'), 'w') as params_file: json.dump(params, 
params_file, indent=2) if __name__ == "__main__": try: + if DEBUG: + setup_logging() main() except Exception: logging.exception(f'Main level cached the error') From e7e4bf8b733fe8e562c6cc6923beffb4c7b34acb Mon Sep 17 00:00:00 2001 From: max Date: Tue, 2 May 2023 03:45:12 +0300 Subject: [PATCH 23/60] Adding auto-sklearn run script with an example --- experiments/auto-sklearn_run/__init__.py | 0 experiments/auto-sklearn_run/openml_suite.py | 91 ++++++++++++++++++++ experiments/auto-sklearn_run/results.json | 45 ++++++++++ 3 files changed, 136 insertions(+) create mode 100644 experiments/auto-sklearn_run/__init__.py create mode 100644 experiments/auto-sklearn_run/openml_suite.py create mode 100644 experiments/auto-sklearn_run/results.json diff --git a/experiments/auto-sklearn_run/__init__.py b/experiments/auto-sklearn_run/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/auto-sklearn_run/openml_suite.py b/experiments/auto-sklearn_run/openml_suite.py new file mode 100644 index 00000000..588d3b93 --- /dev/null +++ b/experiments/auto-sklearn_run/openml_suite.py @@ -0,0 +1,91 @@ +import pickle +import re + +import numpy as np +import json + +import autosklearn.classification +from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing +from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice +from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice +from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice + +from experiments.fedot_warm_start.run import prepare_data +from sklearn import model_selection, metrics +from sklearn.base import ClassifierMixin + + +class AutoSklearnEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, ClassifierChoice): + return repr(o.choice.estimator) + # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)): + # return None + if 
isinstance(o, ClassifierMixin): + return re.sub(r'\s{2,}', ' ', repr(o)) + elif isinstance(o, Balancing): + return repr(o) + elif isinstance(o, np.integer): + return int(o) + elif isinstance(o, np.floating): + return float(o) + + +class AutoSklearnValidator: + + def __init__(self): + pass + + @staticmethod + def main(): + ds_with_ids, ds_names = prepare_data() + train_ds_names, test_ds_names = ds_names + + ds_ids, datasets = ds_with_ids + + for ds_name in train_ds_names: + # if train_ds_names[0] is not None: + print("Sanity check") + dataset = datasets[ds_name].from_cache() + + # cannot wait longer because of the slow data fetching, issue#9 + estimator = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=60 + ) + + X_train, X_test, y_train, y_test = model_selection.train_test_split( + dataset.x, + dataset.y, + test_size=0.2, + random_state=42 + ) + + pipeline = estimator.fit(X_train, y_train) + + predictions = estimator.predict(X_test) + + quality_estimation = metrics.roc_auc_score(y_test, predictions) + + results = { + 'ensemble': pipeline.show_models(), + 'score': quality_estimation + } + + # pickle.dump(pipeline.show_models(), open("results.pickle", "wb")) + + # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier"))) + + with open("results.json", "w") as file: + json.dump( + results, + file, + cls=AutoSklearnEncoder, + indent=2 + ) + +if __name__ == '__main__': + AutoSklearnValidator.main() + + + + diff --git a/experiments/auto-sklearn_run/results.json b/experiments/auto-sklearn_run/results.json new file mode 100644 index 00000000..b4ce4cbf --- /dev/null +++ b/experiments/auto-sklearn_run/results.json @@ -0,0 +1,45 @@ +{ + "ensemble": { + "2": { + "model_id": 2, + "rank": 1, + "cost": 0.02008032128514059, + "ensemble_weight": 0.1, + "balancing": "Balancing(random_state=1)", + "sklearn_classifier": "RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1, random_state=1, 
warm_start=True)" + }, + "6": { + "model_id": 6, + "rank": 2, + "cost": 0.04216867469879515, + "ensemble_weight": 0.02, + "balancing": "Balancing(random_state=1)", + "sklearn_classifier": "RandomForestClassifier(bootstrap=False, max_features=4, min_samples_leaf=4, min_samples_split=20, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" + }, + "7": { + "model_id": 7, + "rank": 3, + "cost": 0.025100401606425682, + "ensemble_weight": 0.08, + "balancing": "Balancing(random_state=1)", + "sklearn_classifier": "HistGradientBoostingClassifier(early_stopping=True, l2_regularization=5.759216242427118e-07, learning_rate=0.14515873247977112, loss='auto', max_iter=64, max_leaf_nodes=11, min_samples_leaf=1, n_iter_no_change=18, random_state=1, validation_fraction=0.06967552984405034, warm_start=True)" + }, + "8": { + "model_id": 8, + "rank": 4, + "cost": 0.02208835341365467, + "ensemble_weight": 0.54, + "balancing": "Balancing(random_state=1, strategy='weighting')", + "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=2), learning_rate=1.7653851967971248, n_estimators=290, random_state=1)" + }, + "11": { + "model_id": 11, + "rank": 5, + "cost": 0.017068273092369468, + "ensemble_weight": 0.26, + "balancing": "Balancing(random_state=1)", + "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=5), learning_rate=0.9772078202526538, n_estimators=418, random_state=1)" + } + }, + "score": 0.9182632313000073 +} \ No newline at end of file From 7f74e70fb0b5f3409b7af98defb2f6b6ace11e9e Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 17:53:07 +0300 Subject: [PATCH 24/60] move to FEDOT 0.7.0 --- requirements.txt | Bin 430 -> 460 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index eca13d853ca1f8e55c583bd3790a78a679ffee4d..ad0a22332f176f2c866188116575624428ac1536 100644 GIT binary patch delta 38 
pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006 Date: Sun, 26 Feb 2023 18:07:22 +0300 Subject: [PATCH 25/60] create Dockerfile --- Dockerfile | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..e17e17cd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +# Download base image ubuntu 20.04 +FROM ubuntu:20.04 + +# For apt to be noninteractive +ENV DEBIAN_FRONTEND noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN true + +# Preseed tzdata, update package index, upgrade packages and install needed software +RUN truncate -s0 /tmp/preseed.cfg; \ + echo "tzdata tzdata/Areas select Europe" >> /tmp/preseed.cfg; \ + echo "tzdata tzdata/Zones/Europe select Berlin" >> /tmp/preseed.cfg; \ + debconf-set-selections /tmp/preseed.cfg && \ + rm -f /etc/timezone /etc/localtime && \ + apt-get update && \ + apt-get install -y nano && \ + apt-get install -y mc && \ + apt-get install -y python3.9 python3-pip && \ + apt-get install -y git && \ + rm -rf /var/lib/apt/lists/* + +# Set the workdir +ENV WORKDIR /home/meta-automl-research +WORKDIR $WORKDIR +COPY . 
$WORKDIR + +RUN pip3 install pip && \ + pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt + +ENV PYTHONPATH $WORKDIR From d24247fd2512bfc580e7664c5a88bea926d6799c Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 26 Feb 2023 22:06:22 +0300 Subject: [PATCH 26/60] prepare experiment demo --- experiments/fedot_warm_start/run.py | 81 +++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 experiments/fedot_warm_start/run.py diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py new file mode 100644 index 00000000..f0342126 --- /dev/null +++ b/experiments/fedot_warm_start/run.py @@ -0,0 +1,81 @@ +import functools +import timeit + +import openml +import pandas as pd +from fedot.api.main import Fedot +from sklearn.model_selection import train_test_split + +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor + +SEED = 42 + + +def prepare_data(): + dataset_ids = pd.Series(openml.study.get_suite(99).data) + dataset_ids = dataset_ids.sample(n=15, random_state=SEED) + dataset_ids = list(dataset_ids) + return OpenMLDatasetsLoader().load(dataset_ids) + + +def timeit_decorator(function): + @functools.wraps(function) + def wrapped(*args, **kwargs): + start_time = timeit.default_timer() + res = function(*args, **kwargs) + time = timeit.default_timer() - start_time + return res, time + + return wrapped + + +def main(): + datasets_cache = prepare_data() + datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED) + + # TODO: + # - Extract meta-features for train datasets + # - Fit 'DatasetsSimilarityAssessor' + + results_pre = [] + for cache in datasets_train: + data = cache.from_cache() + fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED) + _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y) + results_pre.append({'dataset': 
data.name, 'model': fedot, 'automl_time': automl_time}) + + # TODO: + # - Prepare 'ModelAdvisor' + + results = [] + for cache in datasets_test: + data = cache.from_cache() + fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED) + _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y) + + time_start = timeit.default_timer() + # TODO: + # - Extract meta-features for current test dataset + # - Get suitable assumptions from 'ModelAdvisor' + initial_assumption = ... + fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption) + automl_time_meta = timeit.default_timer() - time_start + + metrics_naive = fedot_naive.get_metrics() + metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} + metrics_meta = fedot_meta.get_metrics() + metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()} + + results.append({ + 'dataset': data.name, + 'model_naive': fedot_naive, + 'model_meta': fedot_meta, + 'automl_time_naive': automl_time_naive, + 'automl_time_meta': automl_time_meta, + **metrics_naive, **metrics_meta + }) + + +if __name__ == "__main__": + main() From c4b3f9173ac8eca24e61e6d299467286de170854 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 3 Mar 2023 17:40:35 +0300 Subject: [PATCH 27/60] fix similarity assessors --- .../select_similar_datasets_by_knn.py | 6 +- .../advise_models_from_similar_datasets.py | 4 +- .../datasets_similarity_assessors/__init__.py | 2 +- .../model_based_similarity_assessors.py | 51 ++++++++++++++++ .../predict_proba_similarity_assessors.py | 59 ------------------- 5 files changed, 57 insertions(+), 65 deletions(-) create mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py delete mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py 
b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py index dc1c190c..b6f2bb8c 100644 --- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py +++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py @@ -2,7 +2,7 @@ from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor def main(): @@ -16,10 +16,10 @@ def main(): # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) y_train = x_train.index - assessor = KNNSimilarityAssessor({'n_neighbors': 1}, n_best=2) + assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3) assessor.fit(x_train, y_train) # Get models for the best fitting datasets from train. 
- return x_test.index, assessor.predict(x_test) + return x_test.index, assessor.predict(x_test, return_distance=True) if __name__ == '__main__': diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index 993ac04a..37c3b2db 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -6,7 +6,7 @@ from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor +from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor @@ -21,7 +21,7 @@ def main(): # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) y_train = x_train.index - assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2) + assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2) assessor.fit(x_train, y_train) # Define best models for datasets. 
best_pipelines = [ diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py index 621a68e0..0c33e2c4 100644 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py @@ -1,2 +1,2 @@ from .datasets_similarity_assessor import DatasetsSimilarityAssessor -from .predict_proba_similarity_assessors import KNNSimilarityAssessor, PredictProbaSimilarityAssessor +from .model_based_similarity_assessors import KNeighborsBasedSimilarityAssessor, ModelBasedSimilarityAssessor diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py new file mode 100644 index 00000000..09720a1e --- /dev/null +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py @@ -0,0 +1,51 @@ +from abc import ABC +from typing import Optional, Dict, Any, List, Iterable + +import numpy as np +import pandas as pd +from sklearn.neighbors import NearestNeighbors + +from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ + DatasetsSimilarityAssessor + + +class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor): + def __init__(self, model, n_best: int = 1): + self._inner_model = model + self.n_best = n_best + self._datasets: Optional[Iterable[str]] = None + + +class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor): + def __init__(self, n_neighbors: int = 1, **model_params): + model = NearestNeighbors(n_neighbors=n_neighbors, **model_params) + super().__init__(model, n_neighbors) + + def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): + meta_features = self.preprocess_meta_features(meta_features) + self._datasets = np.array(datasets) + 
self._inner_model.fit(meta_features) + + @staticmethod + def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: + return meta_features.dropna(axis=1, how='any') + + def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]: + dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance) + if return_distance: + distances, dataset_indexes = dataset_indexes + dataset_names = np.take(self._datasets, dataset_indexes, axis=0) + return distances, dataset_names + else: + return np.take(self._datasets, dataset_indexes, axis=0) + + @property + def datasets(self) -> Optional[Iterable[str]]: + return self._datasets + + @property + def feature_names(self) -> List[str]: + return self._inner_model.feature_names_in_ + + def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame: + return meta_features[self.feature_names] diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py deleted file mode 100644 index 8254c745..00000000 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Optional, Dict, Any, List, Iterable - -import numpy as np -import pandas as pd -from sklearn.neighbors import KNeighborsClassifier - -from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ - DatasetsSimilarityAssessor - - -class PredictProbaSimilarityAssessor(DatasetsSimilarityAssessor): - def __init__(self, model, n_best: int = 1): - self._inner_model = model - self.n_best = n_best - - @property - def datasets(self) -> List[str]: - return self._inner_model.classes_ - - @property - def feature_names(self) -> List[str]: - return self._inner_model.feature_names_in_ - - @staticmethod - def 
preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: - return meta_features.dropna(axis=1, how='any') - - def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame: - return meta_features[self.feature_names] - - def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): - meta_features = self.preprocess_meta_features(meta_features) - self._inner_model.fit(meta_features, datasets) - - def predict_proba(self, meta_features: pd.DataFrame) -> List[List[float]]: - return self._inner_model.predict_proba(meta_features) - - def predict(self, meta_features: pd.DataFrame) -> List[List[str]]: - meta_features = self._preprocess_predict_features(meta_features) - predict_probs = self.predict_proba(meta_features) - final_prediction = [] - for probabilities in predict_probs: - probabilities = list(probabilities) - predictions = [] - for _ in range(self.n_best): - predicted_class_idx = np.argmax(probabilities) - predicted_class = self.datasets[predicted_class_idx] - predictions.append(predicted_class) - probabilities.pop(predicted_class_idx) - final_prediction.append(predictions) - - return final_prediction - - -class KNNSimilarityAssessor(PredictProbaSimilarityAssessor): - def __init__(self, model_params: Optional[Dict[str, Any]] = None, n_best: int = 1): - model_params = model_params or dict() - model = KNeighborsClassifier(**model_params) - super().__init__(model, n_best) From e0661f35579cbce48ef3f3f20b4c025de3c48ded Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sat, 4 Mar 2023 13:32:47 +0300 Subject: [PATCH 28/60] allow PymfeExtractor fill values with median --- .../meta_features_extractors/pymfe_extractor.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 2848998e..3a379f6f 100644 --- 
a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -25,7 +25,7 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame: + def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() load_dataset = self.datasets_loader.cache_to_memory @@ -37,10 +37,21 @@ def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame: else: loaded_dataset = load_dataset(dataset) cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] - mfe = self._extractor.fit(loaded_dataset.x, loaded_dataset.y, cat_cols=cat_cols) + x = loaded_dataset.x + y = loaded_dataset.y + if fill_nans: + x = self.fill_nans(x) + mfe = self._extractor.fit(x, y, cat_cols=cat_cols) feature_names, dataset_features = mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) self._update_meta_features_cache(dataset.name, mfs) meta_features[dataset.name] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features + + @staticmethod + def fill_nans(x): + if not isinstance(x, pd.DataFrame): + x = pd.DataFrame(x) + x = x.fillna(x.median()) + return x.to_numpy() From 4f10b0386be01ec79d84033399852b09b88ab334 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 24 Mar 2023 14:10:48 +0300 Subject: [PATCH 29/60] use FEDOT version with fixed initial assumptions --- requirements.txt | Bin 460 -> 310 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad0a22332f176f2c866188116575624428ac1536..4b8e1290af910b183fc689ad8e6fdcccc56d827e 100644 GIT binary patch delta 34 
mcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MVuOwAWf=joy$G8C delta 185 zcmXAh%?`m(5QRVb6c$#N66N;lUm~%<*2W8{Rx8o8Nw2kMvhft22}flzGdbrwGtc46 zyMNv#7aUV6O-D;dim&?n6*n?woM@=!9+j@8uD$QGW6Op;2iC*{;sFUu?S*Deh{1B! og%OE*V&ul+7CV>q>s&I@VWAEcN(3_|(xqyp2Zb^X?lBq Date: Thu, 30 Mar 2023 13:47:06 +0300 Subject: [PATCH 30/60] optional cache usage for MFE extractor --- .../meta_features_extractors/pymfe_extractor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 3a379f6f..36cb9d45 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -25,26 +25,30 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame: + def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False, + use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() load_dataset = self.datasets_loader.cache_to_memory for dataset in datasets: if isinstance(dataset, str): dataset = DatasetCache(dataset) - if mfs := self._get_meta_features_cache(dataset.name, meta_feature_names): + + if (use_cached and + (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))): meta_features[dataset.name] = mfs else: loaded_dataset = load_dataset(dataset) cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] x = loaded_dataset.x y = loaded_dataset.y - if fill_nans: + if fill_input_nans: x = self.fill_nans(x) mfe = self._extractor.fit(x, y, cat_cols=cat_cols) feature_names, dataset_features = 
mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) - self._update_meta_features_cache(dataset.name, mfs) + if update_cached: + self._update_meta_features_cache(dataset.name, mfs) meta_features[dataset.name] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features From 9bf6d9782a3b65af9d7754b37c328bcbea0d48ea Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 30 Mar 2023 13:47:54 +0300 Subject: [PATCH 31/60] allow to advise only the n best models --- .../model_advisors/diverse_fedot_pipeline_advisor.py | 8 +++++++- .../meta_algorithm/model_advisors/model_advisor.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py index 15ef1f57..6f7e4a66 100644 --- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Iterable +from typing import Callable, List, Iterable, Optional from fedot.core.pipelines.pipeline import Pipeline from golem.core.dag.linked_graph import get_distance_between @@ -11,10 +11,12 @@ class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor, + n_best_to_advise: Optional[int] = None, minimal_distance: int = 1, distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between): super().__init__(fitted_similarity_assessor) self.minimal_distance = minimal_distance + self.n_best_to_advise = n_best_to_advise self.distance_func = distance_func def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]: @@ -24,4 +26,8 @@ def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]: for model in dataset_advice[1:]: if 
self.distance_func(first_model.predictor, model.predictor) > self.minimal_distance: diverse_dataset_advice.append(model) + + if self.n_best_to_advise is not None: + diverse_dataset_advice = list(sorted(diverse_dataset_advice, key=lambda m: m.fitness, reverse=True)) + diverse_dataset_advice = diverse_dataset_advice[:self.n_best_to_advise] return diverse_dataset_advice diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py index b585bf27..a9ca0d97 100644 --- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import List, Dict, Iterable +from typing import List, Dict, Iterable, Optional import pandas as pd From fdee481fbd3ad0d42e19ff4f3fe08e330295c372 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 30 Mar 2023 13:58:49 +0300 Subject: [PATCH 32/60] finalize experiment --- experiments/fedot_warm_start/run.py | 145 +++++++++++++++++++++++----- 1 file changed, 119 insertions(+), 26 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index f0342126..be5f45f7 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,22 +1,58 @@ import functools import timeit +from datetime import datetime +from itertools import chain +from pathlib import Path +from typing import Dict +import numpy as np import openml import pandas as pd from fedot.api.main import Fedot +from fedot.core.pipelines.adapters import PipelineAdapter from sklearn.model_selection import train_test_split +from tqdm import tqdm +from meta_automl.data_preparation.dataset import DatasetCache, Dataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor +from meta_automl.data_preparation.model import Model +from 
meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor +from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor +# Meta-alg hyperparameters SEED = 42 - - -def prepare_data(): - dataset_ids = pd.Series(openml.study.get_suite(99).data) - dataset_ids = dataset_ids.sample(n=15, random_state=SEED) +# Datasets sampling +N_DATASETS = None +TEST_SIZE = 0.33 +# Evaluation timeouts +TRAIN_TIMEOUT = 15 +TEST_TIMEOUT = 10 +# Models & datasets +N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 +N_CLOSEST_DATASETS_TO_PROPOSE = 5 +MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1 +N_BEST_MODELS_TO_ADVISE = 5 + + +COMMON_FEDOT_PARAMS = dict( + problem='classification', + with_tuning=False, + logging_level=50, + n_jobs=-1, + seed=SEED, +) + + +def prepare_data() -> Dict[str, DatasetCache]: + """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" + + dataset_ids = openml.study.get_suite(99).data + if N_DATASETS is not None: + dataset_ids = pd.Series(dataset_ids) + dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) dataset_ids = list(dataset_ids) - return OpenMLDatasetsLoader().load(dataset_ids) + return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} def timeit_decorator(function): @@ -30,37 +66,80 @@ def wrapped(*args, **kwargs): return wrapped +def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): + x = data.x + y = data.y + if len(y.shape) == 1: + y = y.reshape(-1, 1) + return x, y + + def main(): datasets_cache = prepare_data() - datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED) + datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()), + test_size=TEST_SIZE, random_state=SEED) - # TODO: - # - Extract meta-features for train datasets - # - Fit 'DatasetsSimilarityAssessor' + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features_train = 
extractor.extract(datasets_train, fill_input_nans=True) + meta_features_train = meta_features_train.fillna(0) + data_similarity_assessor = KNeighborsBasedSimilarityAssessor( + n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE)) + data_similarity_assessor.fit(meta_features_train, datasets_train) results_pre = [] - for cache in datasets_train: + best_models_per_dataset = {} + for name in tqdm(datasets_train, 'Train datasets'): + cache = datasets_cache[name] data = cache.from_cache() - fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED) - _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y) - results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time}) - # TODO: - # - Prepare 'ModelAdvisor' + fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS) + x, y = transform_data_for_fedot(data) + _, automl_time = timeit_decorator(fedot.fit)(x, y) + results_pre.append({'dataset': name, + 'model': fedot.current_pipeline.descriptive_id, + 'automl_time': automl_time}) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + + # Filter out unique individuals with the best fitness + best_individuals = sorted(chain(*fedot.history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + # best_models = list(fedot.best_models) or [] + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, cache) + best_models.append(model) + best_models_per_dataset[name] = best_models + + model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, + minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) + 
model_advisor.fit(best_models_per_dataset) results = [] - for cache in datasets_test: + for name in tqdm(datasets_test, 'Test datasets'): + cache = datasets_cache[name] data = cache.from_cache() - fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED) - _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y) + x, y = transform_data_for_fedot(data) + + fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS) + _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y) + fedot_naive.test_data = fedot_naive.train_data + fedot_naive.prediction = fedot_naive.train_data time_start = timeit.default_timer() - # TODO: - # - Extract meta-features for current test dataset - # - Get suitable assumptions from 'ModelAdvisor' - initial_assumption = ... - fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption) + meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + initial_assumptions = model_advisor.predict(meta_features)[0] + initial_assumptions = [model.predictor for model in initial_assumptions] + fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS) + fedot_meta.fit(x, y) automl_time_meta = timeit.default_timer() - time_start + fedot_meta.test_data = fedot_meta.train_data + fedot_meta.prediction = fedot_meta.train_data metrics_naive = fedot_naive.get_metrics() metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} @@ -69,13 +148,27 @@ def main(): results.append({ 'dataset': data.name, - 'model_naive': fedot_naive, - 'model_meta': fedot_meta, + 'model_naive': fedot_naive.current_pipeline.descriptive_id, + 'model_meta': fedot_meta.current_pipeline.descriptive_id, + 'history_naive': fedot_naive.history, + 'history_meta': fedot_meta.history, 'automl_time_naive': automl_time_naive, 'automl_time_meta': automl_time_meta, 
**metrics_naive, **metrics_meta }) + time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".") + save_dir = Path(f'run_{time_now}') + save_dir.mkdir() + history_dir = save_dir.joinpath('histories') + history_dir.mkdir() + for res in results: + dataset = res['dataset'] + res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json')) + res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json')) + pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv')) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv')) + if __name__ == "__main__": main() From 169ab3ef409aa8580776625ae955788de73a4fc2 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 7 Apr 2023 21:08:49 +0300 Subject: [PATCH 33/60] finalize experiment [2] --- experiments/fedot_warm_start/run.py | 224 +++++++++++++++++++--------- requirements.txt | Bin 310 -> 460 bytes 2 files changed, 155 insertions(+), 69 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index be5f45f7..6e043d55 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,16 +1,20 @@ import functools +import json import timeit from datetime import datetime from itertools import chain from pathlib import Path -from typing import Dict +from typing import Dict, List, Tuple import numpy as np import openml import pandas as pd from fedot.api.main import Fedot +from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate from fedot.core.pipelines.adapters import PipelineAdapter -from sklearn.model_selection import train_test_split +from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from fedot.core.validation.split import tabular_cv_generator +from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm from meta_automl.data_preparation.dataset import DatasetCache, Dataset @@ -24,27 
+28,28 @@ SEED = 42 # Datasets sampling N_DATASETS = None -TEST_SIZE = 0.33 +TEST_SIZE = 0.2 # Evaluation timeouts -TRAIN_TIMEOUT = 15 -TEST_TIMEOUT = 10 +TRAIN_TIMEOUT = 5 +TEST_TIMEOUT = 5 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1 N_BEST_MODELS_TO_ADVISE = 5 - +# Meta-features +MF_EXTRACTOR_PARAMS = {'groups': 'general'} COMMON_FEDOT_PARAMS = dict( problem='classification', - with_tuning=False, logging_level=50, n_jobs=-1, seed=SEED, + show_progress=False, ) -def prepare_data() -> Dict[str, DatasetCache]: +def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data @@ -52,18 +57,7 @@ def prepare_data() -> Dict[str, DatasetCache]: dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) dataset_ids = list(dataset_ids) - return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} - - -def timeit_decorator(function): - @functools.wraps(function) - def wrapped(*args, **kwargs): - start_time = timeit.default_timer() - res = function(*args, **kwargs) - time = timeit.default_timer() - start_time - return res, time - - return wrapped + return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): @@ -74,40 +68,99 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): return x, y -def main(): - datasets_cache = prepare_data() - datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()), - test_size=TEST_SIZE, random_state=SEED) +def get_pipeline_metrics(pipeline, + input_data, + metrics_obj) -> dict: + """Gets quality metrics for the fitted pipeline. 
+ The function is based on `Fedot.get_metrics()` + + Returns: + the values of quality metrics + """ + metrics = metrics_obj.metric_functions + metric_names = metrics_obj.get_metric_names(metrics) + + data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold) + + objective = MetricsObjective(metrics) + obj_eval = PipelineObjectiveEvaluate(objective=objective, + data_producer=data_producer, + eval_n_jobs=-1) - extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + metrics = obj_eval.evaluate(pipeline).values + metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)} + + return metrics + + +def prepare_extractor_and_assessor(datasets_train: List[str]): + extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS) meta_features_train = extractor.extract(datasets_train, fill_input_nans=True) meta_features_train = meta_features_train.fillna(0) data_similarity_assessor = KNeighborsBasedSimilarityAssessor( n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE)) data_similarity_assessor.fit(meta_features_train, datasets_train) + return data_similarity_assessor, extractor + + +def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None): + x, y = transform_data_for_fedot(data) + + time_start = timeit.default_timer() + fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) + fedot.fit(x, y) + automl_time = timeit.default_timer() - time_start + + metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics) + pipeline = fedot.current_pipeline + run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, + automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) + return fedot, run_results - results_pre = [] + +def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., 
automl_timeout_min=0., + **metrics): + run_results = dict(dataset_id=dataset.id, + dataset_name=dataset.name, + run_label=run_label, + model_obj=pipeline, + model_str=pipeline.descriptive_id, + history_obj=history_obj, + automl_time_sec=automl_time_sec, + automl_timeout_min=automl_timeout_min, + **metrics) + return run_results + + +def main(): + baseline_pipeline = PipelineBuilder().add_node('rf').build() + + dataset_ids, datasets_cache = prepare_data() + + datasets_train, datasets_test = \ + train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) + + data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) + + results = [] best_models_per_dataset = {} for name in tqdm(datasets_train, 'Train datasets'): cache = datasets_cache[name] data = cache.from_cache() - fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS) - x, y = transform_data_for_fedot(data) - _, automl_time = timeit_decorator(fedot.fit)(x, y) - results_pre.append({'dataset': name, - 'model': fedot.current_pipeline.descriptive_id, - 'automl_time': automl_time}) + fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run # Filter out unique individuals with the best fitness + history = fedot.history best_individuals = sorted(chain(*fedot.history.individuals), key=lambda ind: ind.fitness, reverse=True) best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - # best_models = list(fedot.best_models) or [] best_models = [] for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: pipeline = PipelineAdapter().restore(individual.graph) @@ -119,55 +172,88 @@ def main(): 
minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - results = [] for name in tqdm(datasets_test, 'Test datasets'): cache = datasets_cache[name] data = cache.from_cache() - x, y = transform_data_for_fedot(data) - fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS) - _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y) - fedot_naive.test_data = fedot_naive.train_data - fedot_naive.prediction = fedot_naive.train_data + # Run pure AutoML + fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') + results.append(fedot_naive_results) + # Run meta AutoML + # 1 time_start = timeit.default_timer() meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) meta_features = meta_features.fillna(0) + meta_learning_time = timeit.default_timer() - time_start initial_assumptions = model_advisor.predict(meta_features)[0] - initial_assumptions = [model.predictor for model in initial_assumptions] - fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS) - fedot_meta.fit(x, y) - automl_time_meta = timeit.default_timer() - time_start - fedot_meta.test_data = fedot_meta.train_data - fedot_meta.prediction = fedot_meta.train_data - - metrics_naive = fedot_naive.get_metrics() - metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()} - metrics_meta = fedot_meta.get_metrics() - metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()} - - results.append({ - 'dataset': data.name, - 'model_naive': fedot_naive.current_pipeline.descriptive_id, - 'model_meta': fedot_meta.current_pipeline.descriptive_id, - 'history_naive': fedot_naive.history, - 'history_meta': fedot_meta.history, - 'automl_time_naive': automl_time_naive, - 'automl_time_meta': automl_time_meta, - **metrics_naive, **metrics_meta - }) - - time_now = 
datetime.now().isoformat(timespec="minutes").replace(":", ".") - save_dir = Path(f'run_{time_now}') + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time'] = meta_learning_time + results.append(fedot_meta_results) + + # Fit & evaluate simple baseline + baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) + baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, + **baseline_metrics) + results.append(baseline_res) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics) + assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', + pipeline=assumption.predictor, **assumption_metrics) + results.append(assumption_res) + + # Save the accumulated results + time_now = datetime.now().isoformat(timespec="minutes") + time_now_for_path = time_now.replace(":", ".") + save_dir = Path(f'run_{time_now_for_path}') save_dir.mkdir() history_dir = save_dir.joinpath('histories') history_dir.mkdir() + models_dir = save_dir.joinpath('models') for res in results: - dataset = res['dataset'] - res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json')) - res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json')) - pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv')) - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv')) + res['run_date'] = time_now + dataset_name = res['dataset_name'] + run_label = res['run_label'] + # define saving paths + model_path = 
models_dir.joinpath(f'{dataset_name}_{run_label}') + history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') + # replace objects with export paths for csv + res['model_path'] = str(model_path) + res.pop('model_obj').save(res['model_path']) + res['history_path'] = str(history_path) + history_obj = res.pop('history_obj') + if history_obj is not None: + history_obj.save(res['history_path']) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv')) + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) + + # save experiment hyperparameters + params = { + 'run_date': time_now, + 'seed': SEED, + 'n_datasets': N_DATASETS or len(dataset_ids), + 'test_size': TEST_SIZE, + 'dataset_ids': dataset_ids, + 'dataset_names': list(datasets_cache.keys()), + 'dataset_names_train': datasets_train, + 'dataset_names_test': datasets_test, + 'train_timeout': TRAIN_TIMEOUT, + 'test_timeout': TEST_TIMEOUT, + 'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE, + 'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE, + 'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS, + 'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE, + 'common_fedot_params': COMMON_FEDOT_PARAMS, + 'baseline_pipeline': baseline_pipeline.descriptive_id, + } + with open(save_dir.joinpath('parameters.json'), 'w') as params_file: + json.dump(params, params_file, indent=2) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 4b8e1290af910b183fc689ad8e6fdcccc56d827e..ad0a22332f176f2c866188116575624428ac1536 100644 GIT binary patch delta 185 zcmXAh%?`m(5QRVb6c$#N66N;lUm~%<*2W8{Rx8o8Nw2kMvhft22}flzGdbrwGtc46 zyMNv#7aUV6O-D;dim&?n6*n?woM@=!9+j@8uD$QGW6Op;2iC*{;sFUu?S*Deh{1B! 
og%OE*V&ul+7CV>q>s&I@VWAEcN(3_|(xqyp2Zb^X?lBq Date: Sat, 8 Apr 2023 13:14:10 +0300 Subject: [PATCH 34/60] wrap & log exceptions; log progress to file --- experiments/fedot_warm_start/run.py | 193 ++++++++++++++++------------ 1 file changed, 111 insertions(+), 82 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 6e043d55..3e5a2a28 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,5 +1,6 @@ import functools import json +import logging import timeit from datetime import datetime from itertools import chain @@ -14,6 +15,7 @@ from fedot.core.pipelines.adapters import PipelineAdapter from fedot.core.pipelines.pipeline_builder import PipelineBuilder from fedot.core.validation.split import tabular_cv_generator +from golem.core.log import Log from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm @@ -27,11 +29,11 @@ # Meta-alg hyperparameters SEED = 42 # Datasets sampling -N_DATASETS = None -TEST_SIZE = 0.2 +N_DATASETS = 3 +TEST_SIZE = 0.33 # Evaluation timeouts -TRAIN_TIMEOUT = 5 -TEST_TIMEOUT = 5 +TRAIN_TIMEOUT = 0.5 +TEST_TIMEOUT = 0.5 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 @@ -42,12 +44,25 @@ COMMON_FEDOT_PARAMS = dict( problem='classification', - logging_level=50, n_jobs=-1, seed=SEED, show_progress=False, ) +# Setup logging +time_now = datetime.now().isoformat(timespec="minutes") +time_now_for_path = time_now.replace(":", ".") +save_dir = Path(f'run_{time_now_for_path}') +save_dir.mkdir() +log_file = save_dir.joinpath('log.txt') +Log(log_file=log_file) +logging.basicConfig(filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, + ) + def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" @@ -132,6 
+147,19 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut return run_results +def extract_best_history_models(dataset_cache, history): + best_individuals = sorted(chain(*history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, dataset_cache) + best_models.append(model) + return best_models + + def main(): baseline_pipeline = PipelineBuilder().add_node('rf').build() @@ -144,93 +172,91 @@ def main(): results = [] best_models_per_dataset = {} - for name in tqdm(datasets_train, 'Train datasets'): - cache = datasets_cache[name] - data = cache.from_cache() - - fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') - results.append(run_results) - # TODO: - # x Turn the tuned pipeline into a model (evaluate its fitness on the data) - # x Evaluate historical pipelines on the data instead of using fitness - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - - # Filter out unique individuals with the best fitness - history = fedot.history - best_individuals = sorted(chain(*fedot.history.individuals), - key=lambda ind: ind.fitness, - reverse=True) - best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_models = [] - for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: - pipeline = PipelineAdapter().restore(individual.graph) - model = Model(pipeline, individual.fitness, cache) - best_models.append(model) - best_models_per_dataset[name] = best_models + progress_file = open(save_dir.joinpath('progress.txt'), 'a') + for name in tqdm(datasets_train, 'Train datasets', file=progress_file): + try: + cache = datasets_cache[name] + 
data = cache.from_cache() + + fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + results.append(run_results) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + + # Filter out unique individuals with the best fitness + history = fedot.history + best_models = extract_best_history_models(cache, history) + best_models_per_dataset[name] = best_models + except Exception: + logging.exception(f'Train dataset "{name}"') model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - for name in tqdm(datasets_test, 'Test datasets'): - cache = datasets_cache[name] - data = cache.from_cache() - - # Run pure AutoML - fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') - results.append(fedot_naive_results) - - # Run meta AutoML - # 1 - time_start = timeit.default_timer() - meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) - meta_features = meta_features.fillna(0) - meta_learning_time = timeit.default_timer() - time_start - initial_assumptions = model_advisor.predict(meta_features)[0] - assumption_pipelines = [model.predictor for model in initial_assumptions] - # 2 - fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', - initial_assumption=assumption_pipelines) - fedot_meta_results['meta_learning_time'] = meta_learning_time - results.append(fedot_meta_results) - - # Fit & evaluate simple baseline - baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) - baseline_res = get_result_data_row(dataset=data, run_label='simple 
baseline', pipeline=baseline_pipeline, - **baseline_metrics) - results.append(baseline_res) - - # Fit & evaluate initial assumptions - for i, assumption in enumerate(initial_assumptions): - pipeline = assumption.predictor - assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics) - assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', - pipeline=assumption.predictor, **assumption_metrics) - results.append(assumption_res) + for name in tqdm(datasets_test, 'Test datasets', file=progress_file): + try: + cache = datasets_cache[name] + data = cache.from_cache() + + # Run pure AutoML + fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') + results.append(fedot_naive_results) + + # Run meta AutoML + # 1 + time_start = timeit.default_timer() + meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + meta_learning_time = timeit.default_timer() - time_start + initial_assumptions = model_advisor.predict(meta_features)[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time'] = meta_learning_time + results.append(fedot_meta_results) + + # Fit & evaluate simple baseline + baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) + baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, + **baseline_metrics) + results.append(baseline_res) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, 
fedot_meta.metrics) + assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', + pipeline=pipeline, **assumption_metrics) + results.append(assumption_res) + except Exception: + logging.exception(f'Test dataset "{name}"') # Save the accumulated results - time_now = datetime.now().isoformat(timespec="minutes") - time_now_for_path = time_now.replace(":", ".") - save_dir = Path(f'run_{time_now_for_path}') - save_dir.mkdir() history_dir = save_dir.joinpath('histories') history_dir.mkdir() models_dir = save_dir.joinpath('models') for res in results: - res['run_date'] = time_now - dataset_name = res['dataset_name'] - run_label = res['run_label'] - # define saving paths - model_path = models_dir.joinpath(f'{dataset_name}_{run_label}') - history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') - # replace objects with export paths for csv - res['model_path'] = str(model_path) - res.pop('model_obj').save(res['model_path']) - res['history_path'] = str(history_path) - history_obj = res.pop('history_obj') - if history_obj is not None: - history_obj.save(res['history_path']) - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv')) + try: + res['run_date'] = time_now + dataset_name = res['dataset_name'] + run_label = res['run_label'] + # define saving paths + model_path = models_dir.joinpath(f'{dataset_name}_{run_label}') + history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') + # replace objects with export paths for csv + res['model_path'] = str(model_path) + res.pop('model_obj').save(res['model_path']) + res['history_path'] = str(history_path) + history_obj = res.pop('history_obj') + if history_obj is not None: + history_obj.save(res['history_path']) + except Exception: + logging.exception(f'Saving results "{res}"') + pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) # save experiment hyperparameters @@ -257,4 +283,7 @@ 
def main(): if __name__ == "__main__": - main() + try: + main() + except Exception: + logging.exception(f'Main level cached the error') From a796ea73caccb9eead7cfe659f387df3db41d449 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sat, 8 Apr 2023 13:16:28 +0300 Subject: [PATCH 35/60] update timeouts --- experiments/fedot_warm_start/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 3e5a2a28..3113ad97 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -29,11 +29,11 @@ # Meta-alg hyperparameters SEED = 42 # Datasets sampling -N_DATASETS = 3 -TEST_SIZE = 0.33 +N_DATASETS = None +TEST_SIZE = 0.2 # Evaluation timeouts -TRAIN_TIMEOUT = 0.5 -TEST_TIMEOUT = 0.5 +TRAIN_TIMEOUT = 15 +TEST_TIMEOUT = 15 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 From 86652043368290d95039b3ae99ccdff1744cbe18 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 18 Apr 2023 18:02:21 +0300 Subject: [PATCH 36/60] remove GOLEM from requirements.txt to inherit version required by FEDOT --- requirements.txt | Bin 460 -> 430 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad0a22332f176f2c866188116575624428ac1536..eca13d853ca1f8e55c583bd3790a78a679ffee4d 100644 GIT binary patch delta 7 OcmX@ZypDOpIz|8ti~`^Q delta 38 pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006 Date: Tue, 18 Apr 2023 18:04:02 +0300 Subject: [PATCH 37/60] clean openml cache --- .../datasets_loaders/openml_datasets_loader.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index f23510d7..7959ca61 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ 
b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -1,5 +1,7 @@ from __future__ import annotations +import shutil +from pathlib import Path from typing import List, Union import openml @@ -10,6 +12,12 @@ OpenMLDatasetID = Union[str, int] +def _clear_openml_cache(): + cache_dir = openml.config.get_cache_directory() + cache_dir = Path(cache_dir) + shutil.rmtree(cache_dir) + + class OpenMLDatasetsLoader(DatasetsLoader): def __init__(self): @@ -27,7 +35,10 @@ def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]: return datasets def load_single(self, source: OpenMLDatasetID): - return self.get_openml_dataset(source) + try: + return self.get_openml_dataset(source) + finally: + _clear_openml_cache() def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache: openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=False) From 6eddbb19dea0f3b8d16aad3a6b6af451277c9f4e Mon Sep 17 00:00:00 2001 From: morrisnein Date: Tue, 18 Apr 2023 18:04:35 +0300 Subject: [PATCH 38/60] update Dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index e17e17cd..7958082a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ WORKDIR $WORKDIR COPY . 
$WORKDIR RUN pip3 install pip && \ + pip install wheel && \ pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt ENV PYTHONPATH $WORKDIR From d8bd536935ad02c98d21a39b543c0027ad60be24 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 15:42:57 +0300 Subject: [PATCH 39/60] make experiment safer --- experiments/fedot_warm_start/run.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 3113ad97..66c80192 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -19,6 +19,7 @@ from sklearn.model_selection import train_test_split, StratifiedKFold from tqdm import tqdm +from meta_automl.data_preparation.data_manager import DataManager from meta_automl.data_preparation.dataset import DatasetCache, Dataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor @@ -52,7 +53,8 @@ # Setup logging time_now = datetime.now().isoformat(timespec="minutes") time_now_for_path = time_now.replace(":", ".") -save_dir = Path(f'run_{time_now_for_path}') +save_dir = DataManager.get_data_dir()\ + .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start') save_dir.mkdir() log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) @@ -168,17 +170,16 @@ def main(): datasets_train, datasets_test = \ train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) - data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) - results = [] best_models_per_dataset = {} progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for name in tqdm(datasets_train, 'Train datasets', file=progress_file): + for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file): try: cache = datasets_cache[name] data = 
cache.from_cache() - fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT') + timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT + fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT') results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) @@ -192,19 +193,16 @@ def main(): except Exception: logging.exception(f'Train dataset "{name}"') + data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - for name in tqdm(datasets_test, 'Test datasets', file=progress_file): + for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file): try: cache = datasets_cache[name] data = cache.from_cache() - # Run pure AutoML - fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT') - results.append(fedot_naive_results) - # Run meta AutoML # 1 time_start = timeit.default_timer() @@ -234,6 +232,7 @@ def main(): results.append(assumption_res) except Exception: logging.exception(f'Test dataset "{name}"') + progress_file.close() # Save the accumulated results history_dir = save_dir.joinpath('histories') From 36c1d0155440db71c2dcd6bd74d96aad1c87ff7e Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 16:38:47 +0300 Subject: [PATCH 40/60] add .dockerignore --- .dockerignore | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..2bfa6863 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +# Config & info files +.pep8speaks.yml +Dockerfile +LICENSE +README.md + +# Unnecessary files +examples +notebooks +test + +# User data +data From 
29b8cb9d3c50aea8e911d072f4071301a7c8d201 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 20 Apr 2023 17:39:14 +0300 Subject: [PATCH 41/60] fix save path --- experiments/fedot_warm_start/run.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 66c80192..9bf33ccb 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -4,7 +4,6 @@ import timeit from datetime import datetime from itertools import chain -from pathlib import Path from typing import Dict, List, Tuple import numpy as np @@ -53,9 +52,9 @@ # Setup logging time_now = datetime.now().isoformat(timespec="minutes") time_now_for_path = time_now.replace(":", ".") -save_dir = DataManager.get_data_dir()\ - .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start') -save_dir.mkdir() +save_dir = DataManager.get_data_dir().\ + joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') +save_dir.mkdir(parents=True) log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) logging.basicConfig(filename=log_file, From fbe04eac5c9fcfa330039340acb581fb5d767a0f Mon Sep 17 00:00:00 2001 From: max Date: Tue, 16 May 2023 01:52:03 +0300 Subject: [PATCH 42/60] Resolving conflict --- experiments/auto-sklearn_run/results.json | 45 ----------------------- experiments/fedot_warm_start/run.py | 41 +++------------------ 2 files changed, 6 insertions(+), 80 deletions(-) delete mode 100644 experiments/auto-sklearn_run/results.json diff --git a/experiments/auto-sklearn_run/results.json b/experiments/auto-sklearn_run/results.json deleted file mode 100644 index b4ce4cbf..00000000 --- a/experiments/auto-sklearn_run/results.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "ensemble": { - "2": { - "model_id": 2, - "rank": 1, - "cost": 0.02008032128514059, - "ensemble_weight": 0.1, - "balancing": "Balancing(random_state=1)", - 
"sklearn_classifier": "RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" - }, - "6": { - "model_id": 6, - "rank": 2, - "cost": 0.04216867469879515, - "ensemble_weight": 0.02, - "balancing": "Balancing(random_state=1)", - "sklearn_classifier": "RandomForestClassifier(bootstrap=False, max_features=4, min_samples_leaf=4, min_samples_split=20, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" - }, - "7": { - "model_id": 7, - "rank": 3, - "cost": 0.025100401606425682, - "ensemble_weight": 0.08, - "balancing": "Balancing(random_state=1)", - "sklearn_classifier": "HistGradientBoostingClassifier(early_stopping=True, l2_regularization=5.759216242427118e-07, learning_rate=0.14515873247977112, loss='auto', max_iter=64, max_leaf_nodes=11, min_samples_leaf=1, n_iter_no_change=18, random_state=1, validation_fraction=0.06967552984405034, warm_start=True)" - }, - "8": { - "model_id": 8, - "rank": 4, - "cost": 0.02208835341365467, - "ensemble_weight": 0.54, - "balancing": "Balancing(random_state=1, strategy='weighting')", - "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=2), learning_rate=1.7653851967971248, n_estimators=290, random_state=1)" - }, - "11": { - "model_id": 11, - "rank": 5, - "cost": 0.017068273092369468, - "ensemble_weight": 0.26, - "balancing": "Balancing(random_state=1)", - "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=5), learning_rate=0.9772078202526538, n_estimators=418, random_state=1)" - } - }, - "score": 0.9182632313000073 -} \ No newline at end of file diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 26741afd..1df6a0b6 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -4,6 +4,8 @@ import timeit from datetime import datetime from itertools import chain +from pathlib import Path + from 
typing import Dict, List, Tuple import numpy as np @@ -50,7 +52,7 @@ show_progress=False, ) -<<<<<<< HEAD + SAVE_DIR = None TIME_NOW = None TIME_NOW_FOR_PATH = None @@ -75,26 +77,6 @@ def setup_logging(): force=True, ) -======= -# Setup logging -time_now = datetime.now().isoformat(timespec="minutes") -time_now_for_path = time_now.replace(":", ".") -save_dir = DataManager.get_data_dir().\ - joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') -save_dir.mkdir(parents=True) -log_file = save_dir.joinpath('log.txt') -Log(log_file=log_file) -logging.basicConfig(filename=log_file, - filemode='a', - format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', - datefmt='%H:%M:%S', - force=True, - ) - - -def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: - """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" ->>>>>>> origin/docker_and_experiments def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" @@ -214,24 +196,17 @@ def main(): ds_ids, datasets = ds_with_ids -<<<<<<< HEAD data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names) results = [] best_models_per_dataset = {} progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a') - for name in tqdm(train_ds_names, 'Train datasets', file=progress_file): -======= - results = [] - best_models_per_dataset = {} - progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file): ->>>>>>> origin/docker_and_experiments + for name in tqdm(train_ds_names, 'FEDOT, all datasets', file=progress_file): try: cache = datasets[name] data = cache.from_cache() - timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT + timeout = TRAIN_TIMEOUT if name in train_ds_names else TEST_TIMEOUT fedot, run_results = fit_fedot(data=data, 
timeout=timeout, run_label='FEDOT') results.append(run_results) # TODO: @@ -251,11 +226,7 @@ def main(): minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) -<<<<<<< HEAD - for name in tqdm(test_ds_names, 'Test datasets', file=progress_file): -======= - for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file): ->>>>>>> origin/docker_and_experiments + for name in tqdm(test_ds_names, 'MetaFEDOT, Test datasets', file=progress_file): try: cache = datasets[name] data = cache.from_cache() From ac060eeb0185f3713400fd7f39e9c75a7cb38c95 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 17 May 2023 10:41:07 +0300 Subject: [PATCH 43/60] add logging in PymfeExtractor --- .../meta_features_extractors/pymfe_extractor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 36cb9d45..8dbc728f 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -3,6 +3,7 @@ from typing import List, Union, Dict, Any import pandas as pd +from golem.core.log import default_log from pymfe.mfe import MFE from meta_automl.data_preparation.dataset import DatasetCache @@ -18,6 +19,7 @@ def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: Dat self.extractor_params = extractor_params if extractor_params is not None else self.DEFAULT_PARAMS self._datasets_loader = datasets_loader or OpenMLDatasetsLoader() self._extractor = MFE(**self.extractor_params) + self._logger = default_log(self) @property def datasets_loader(self) -> DatasetsLoader: @@ -34,6 +36,7 @@ def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: boo if isinstance(dataset, str): dataset = DatasetCache(dataset) + self._logger.info(f'Extracting meta features of 
the dataset {dataset.name}...') if (use_cached and (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))): meta_features[dataset.name] = mfs From 7c42e79032924c9d4cb17b8dafb3c30057caf948 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Wed, 31 May 2023 17:43:09 +0300 Subject: [PATCH 44/60] add intelligent datasets train/test split --- experiments/fedot_warm_start/run.py | 95 +++++++++++-------- .../datasets_train_test_split.py | 64 +++++++++++++ 2 files changed, 117 insertions(+), 42 deletions(-) create mode 100644 meta_automl/data_preparation/datasets_train_test_split.py diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 9bf33ccb..26382ebb 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -4,23 +4,28 @@ import timeit from datetime import datetime from itertools import chain -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Sequence import numpy as np import openml import pandas as pd + from fedot.api.main import Fedot +from fedot.core.data.data import InputData from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate from fedot.core.pipelines.adapters import PipelineAdapter +from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository from fedot.core.validation.split import tabular_cv_generator from golem.core.log import Log -from sklearn.model_selection import train_test_split, StratifiedKFold +from sklearn.model_selection import StratifiedKFold from tqdm import tqdm from meta_automl.data_preparation.data_manager import DataManager from meta_automl.data_preparation.dataset import DatasetCache, Dataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.datasets_train_test_split import 
openml_datasets_train_test_split from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor @@ -29,11 +34,11 @@ # Meta-alg hyperparameters SEED = 42 # Datasets sampling -N_DATASETS = None -TEST_SIZE = 0.2 +N_DATASETS = 3 +TEST_SIZE = 0.33 # Evaluation timeouts -TRAIN_TIMEOUT = 15 -TEST_TIMEOUT = 15 +TRAIN_TIMEOUT = 1 +TEST_TIMEOUT = 1 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 @@ -41,6 +46,9 @@ N_BEST_MODELS_TO_ADVISE = 5 # Meta-features MF_EXTRACTOR_PARAMS = {'groups': 'general'} +COLLECT_METRICS = ['f1', 'roc_auc', 'accuracy', 'neg_log_loss', 'precision'] +COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) +COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' COMMON_FEDOT_PARAMS = dict( problem='classification', @@ -50,19 +58,21 @@ ) # Setup logging -time_now = datetime.now().isoformat(timespec="minutes") -time_now_for_path = time_now.replace(":", ".") -save_dir = DataManager.get_data_dir().\ +time_now = datetime.now() +time_now_iso = time_now.isoformat(timespec="minutes") +time_now_for_path = time_now_iso.replace(":", ".") +save_dir = DataManager.get_data_dir(). 
\ joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') save_dir.mkdir(parents=True) log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) -logging.basicConfig(filename=log_file, - filemode='a', - format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', - datefmt='%H:%M:%S', - force=True, - ) +logging.basicConfig( + filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, +) def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: @@ -84,18 +94,16 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): return x, y -def get_pipeline_metrics(pipeline, - input_data, - metrics_obj) -> dict: +def get_pipeline_metrics(pipeline: Pipeline, + input_data: InputData, + metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, + metric_names: Sequence[str] = COLLECT_METRICS) -> dict: """Gets quality metrics for the fitted pipeline. The function is based on `Fedot.get_metrics()` Returns: the values of quality metrics """ - metrics = metrics_obj.metric_functions - metric_names = metrics_obj.get_metric_names(metrics) - data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold) objective = MetricsObjective(metrics) @@ -103,10 +111,10 @@ def get_pipeline_metrics(pipeline, data_producer=data_producer, eval_n_jobs=-1) - metrics = obj_eval.evaluate(pipeline).values - metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)} + metric_values = obj_eval.evaluate(pipeline).values + metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)} - return metrics + return metric_values def prepare_extractor_and_assessor(datasets_train: List[str]): @@ -127,7 +135,7 @@ def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption= fedot.fit(x, y) automl_time = timeit.default_timer() - time_start - 
metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics) + metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data) pipeline = fedot.current_pipeline run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) @@ -144,6 +152,7 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut history_obj=history_obj, automl_time_sec=automl_time_sec, automl_timeout_min=automl_timeout_min, + task_type='classification', **metrics) return run_results @@ -156,7 +165,7 @@ def extract_best_history_models(dataset_cache, history): best_models = [] for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: pipeline = PipelineAdapter().restore(individual.graph) - model = Model(pipeline, individual.fitness, dataset_cache) + model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset_cache) best_models.append(model) return best_models @@ -166,10 +175,11 @@ def main(): dataset_ids, datasets_cache = prepare_data() - datasets_train, datasets_test = \ - train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED) + split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED) + datasets_train = split_datasets[split_datasets['is_train'] == 1]['dataset_name'].to_list() + datasets_test = split_datasets[~split_datasets['is_train'] == 0]['dataset_name'].to_list() - results = [] + evaluation_results = [] best_models_per_dataset = {} progress_file = open(save_dir.joinpath('progress.txt'), 'a') for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file): @@ -179,7 +189,7 @@ def main(): timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT') - results.append(run_results) + 
evaluation_results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) # x Evaluate historical pipelines on the data instead of using fitness @@ -207,28 +217,28 @@ def main(): time_start = timeit.default_timer() meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) meta_features = meta_features.fillna(0) - meta_learning_time = timeit.default_timer() - time_start + meta_learning_time_sec = timeit.default_timer() - time_start initial_assumptions = model_advisor.predict(meta_features)[0] assumption_pipelines = [model.predictor for model in initial_assumptions] # 2 fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', initial_assumption=assumption_pipelines) - fedot_meta_results['meta_learning_time'] = meta_learning_time - results.append(fedot_meta_results) + fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec + evaluation_results.append(fedot_meta_results) # Fit & evaluate simple baseline - baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics) + baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data) baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, **baseline_metrics) - results.append(baseline_res) + evaluation_results.append(baseline_res) # Fit & evaluate initial assumptions for i, assumption in enumerate(initial_assumptions): pipeline = assumption.predictor - assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, fedot_meta.metrics) + assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data) assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', pipeline=pipeline, **assumption_metrics) - results.append(assumption_res) + evaluation_results.append(assumption_res) except Exception: 
logging.exception(f'Test dataset "{name}"') progress_file.close() @@ -237,7 +247,7 @@ def main(): history_dir = save_dir.joinpath('histories') history_dir.mkdir() models_dir = save_dir.joinpath('models') - for res in results: + for res in evaluation_results: try: res['run_date'] = time_now dataset_name = res['dataset_name'] @@ -255,11 +265,11 @@ def main(): except Exception: logging.exception(f'Saving results "{res}"') - pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) + pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) # save experiment hyperparameters params = { - 'run_date': time_now, + 'run_date': time_now_iso, 'seed': SEED, 'n_datasets': N_DATASETS or len(dataset_ids), 'test_size': TEST_SIZE, @@ -283,5 +293,6 @@ def main(): if __name__ == "__main__": try: main() - except Exception: - logging.exception(f'Main level cached the error') + except Exception as e: + logging.exception('Main level caught an error.') + raise diff --git a/meta_automl/data_preparation/datasets_train_test_split.py b/meta_automl/data_preparation/datasets_train_test_split.py new file mode 100644 index 00000000..101b7ce8 --- /dev/null +++ b/meta_automl/data_preparation/datasets_train_test_split.py @@ -0,0 +1,64 @@ +import openml +import pandas as pd + +from sklearn.model_selection import train_test_split + + +def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: int = 42): + df_openml_datasets = openml.datasets.list_datasets(dataset_ids, output_format='dataframe') + df_openml_datasets_split_features = df_openml_datasets[ + ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']] + for column in df_openml_datasets_split_features.columns[1:]: + if column != 'NumberOfClasses': + median = df_openml_datasets_split_features[column].median() + df_openml_datasets_split_features[column] = \ + (df_openml_datasets_split_features[column] > median).map({False: 'small', True: 'big'}) + 
else: + median = df_openml_datasets_split_features[column][df_openml_datasets_split_features[column] != 2].median() + df_openml_datasets_split_features[column] = df_openml_datasets_split_features[column].apply( + lambda n: 'binary' if n == 2 else {False: 'small', True: 'big'}[n > median]) + df_split_categories = df_openml_datasets_split_features.copy() + df_split_categories['category'] = df_openml_datasets_split_features.apply(lambda row: '_'.join( + row[1:]), axis=1) + df_split_categories.drop(columns=['NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses'], inplace=True) + # Group single-value categories into a separate category + cat_counts = df_split_categories['category'].value_counts() + single_value_categories = cat_counts[cat_counts == 1].index + idx = df_split_categories[df_split_categories['category'].isin(single_value_categories)].index + df_split_categories.loc[idx, 'category'] = 'single_value' + df_datasets_to_split = df_split_categories[df_split_categories['category'] != 'single_value'] + df_test_only_datasets = df_split_categories[df_split_categories['category'] == 'single_value'] + if not df_datasets_to_split.empty: + df_train_datasets, df_test_datasets = train_test_split( + df_datasets_to_split, + train_size=train_size, + shuffle=True, + stratify=df_datasets_to_split['category'], + random_state=seed + ) + df_test_datasets = pd.concat([df_test_datasets, df_test_only_datasets]) + else: + df_train_datasets, df_test_datasets = train_test_split( + df_split_categories, + train_size=train_size, + shuffle=True, + random_state=seed + ) + df_train_datasets['is_train'] = 1 + df_test_datasets['is_train'] = 0 + df_split_datasets = pd.concat([df_train_datasets, df_test_datasets]).join( + df_openml_datasets_split_features.drop(columns='name')) + df_split_datasets = df_split_datasets.rename(columns={'name': 'dataset_name'}) + df_split_datasets.index.rename('dataset_id', inplace=True) + + return df_split_datasets + + +def main(): + dataset_ids = 
openml.study.get_suite(99).data + df_split_datasets = openml_datasets_train_test_split(dataset_ids) + df_split_datasets.to_csv('train_test_datasets_opencc18.csv') + + +if __name__ == '__main__': + main() From cb11a3ccbbb61074f8bc046c7a3d50ab62106792 Mon Sep 17 00:00:00 2001 From: Peter Shevcnenko <57573631+MorrisNein@users.noreply.github.com> Date: Fri, 30 Jun 2023 18:35:35 +0300 Subject: [PATCH 45/60] Refactor data storage (#15) * refactor dataset classes, use openml cache * fix example select_similar_datasets_by_knn.py * create DatasetIDType * create PredictorType * remove DataManager, refactor cache * update tests & test data * allow explicit OpenMLDataset creation from name/search * adapt examples to the last changes --- .gitignore | 2 +- .../0_loading_data/load_list_of_datasests.py | 5 +- .../extract_with_load_on_demand.py | 5 +- .../load_and_extract_features_sequentially.py | 4 +- .../select_similar_datasets_by_knn.py | 5 +- .../advise_models_from_similar_datasets.py | 11 +- examples/knowledge_base_loading.py | 6 +- experiments/fedot_warm_start/run.py | 96 +-- meta_automl/data_preparation/data_manager.py | 59 -- meta_automl/data_preparation/dataset.py | 64 -- .../data_preparation/dataset/__init__.py | 3 + .../dataset/custom_dataset.py | 30 + .../data_preparation/dataset/dataset_base.py | 40 + .../dataset/openml_dataset.py | 39 + .../datasets_loaders/__init__.py | 2 +- .../datasets_loaders/datasets_loader.py | 16 +- .../openml_datasets_loader.py | 66 +- .../data_preparation/file_system/__init__.py | 5 + .../data_preparation/file_system/cache.py | 95 +++ .../file_system/cache_properties.py | 21 + .../file_system/file_system.py | 27 + .../meta_features_extractor.py | 22 +- .../pymfe_extractor.py | 31 +- meta_automl/data_preparation/model.py | 9 +- .../models_loaders/fedot_pipelines_loader.py | 48 +- .../knowledge_base_models_loader.py | 21 +- .../model_based_similarity_assessors.py | 9 +- .../model_advisors/model_advisor.py | 7 +- requirements.txt | Bin 430 -> 
460 bytes test/conftest.py | 40 + test/constants.py | 7 +- test/data/datasets/australian.pkl | Bin 41870 -> 0 bytes test/data/datasets/monks-problems-1.pkl | Bin 16009 -> 0 bytes .../pymfe/334.pkl} | Bin .../pymfe/40981.pkl} | Bin .../org/openml/www/datasets/333/dataset.arff | 651 ++++++++++++++++ .../www/datasets/333/dataset_333.pkl.py3 | Bin 0 -> 5724 bytes .../openml/www/datasets/333/dataset_333.pq | Bin 0 -> 6016 bytes .../openml/www/datasets/333/description.xml | 33 + .../org/openml/www/datasets/333/features.xml | 84 +++ .../openml/www/datasets/333/features.xml.pkl | Bin 0 -> 509 bytes .../openml/www/datasets/40981/dataset.arff | 707 ++++++++++++++++++ .../www/datasets/40981/dataset_40981.pkl.py3 | Bin 0 -> 17678 bytes .../www/datasets/40981/dataset_40981.pq | Bin 0 -> 20170 bytes .../openml/www/datasets/40981/description.xml | 49 ++ .../openml/www/datasets/40981/features.xml | 175 +++++ .../www/datasets/40981/features.xml.pkl | Bin 0 -> 899 bytes test/data_manager.py | 9 - test/general_checks.py | 25 - test/unit/datasets/__init__.py | 0 test/unit/datasets/conftest.py | 18 + test/unit/datasets/general_checks.py | 24 + test/unit/datasets/test_custom_dataset.py | 48 ++ test/unit/datasets/test_datasets_loaders.py | 24 + test/unit/datasets/test_file_dataset.py | 48 ++ test/unit/datasets/test_openml_dataset.py | 27 + test/unit/test_dataset.py | 40 - test/unit/test_datasets_loaders.py | 50 -- test/unit/test_file_system.py | 7 + test/unit/test_meta_features_extractors.py | 47 +- 60 files changed, 2399 insertions(+), 462 deletions(-) delete mode 100644 meta_automl/data_preparation/data_manager.py delete mode 100644 meta_automl/data_preparation/dataset.py create mode 100644 meta_automl/data_preparation/dataset/__init__.py create mode 100644 meta_automl/data_preparation/dataset/custom_dataset.py create mode 100644 meta_automl/data_preparation/dataset/dataset_base.py create mode 100644 meta_automl/data_preparation/dataset/openml_dataset.py create mode 100644 
meta_automl/data_preparation/file_system/__init__.py create mode 100644 meta_automl/data_preparation/file_system/cache.py create mode 100644 meta_automl/data_preparation/file_system/cache_properties.py create mode 100644 meta_automl/data_preparation/file_system/file_system.py create mode 100644 test/conftest.py delete mode 100644 test/data/datasets/australian.pkl delete mode 100644 test/data/datasets/monks-problems-1.pkl rename test/data/{pymfe/monks-problems-2.pkl => metafeatures/pymfe/334.pkl} (100%) rename test/data/{pymfe/australian.pkl => metafeatures/pymfe/40981.pkl} (100%) create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset.arff create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/description.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/description.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl delete mode 100644 test/data_manager.py delete mode 100644 test/general_checks.py create mode 100644 test/unit/datasets/__init__.py create mode 100644 test/unit/datasets/conftest.py create mode 100644 test/unit/datasets/general_checks.py create mode 100644 test/unit/datasets/test_custom_dataset.py create mode 100644 test/unit/datasets/test_datasets_loaders.py create 
mode 100644 test/unit/datasets/test_file_dataset.py create mode 100644 test/unit/datasets/test_openml_dataset.py delete mode 100644 test/unit/test_dataset.py delete mode 100644 test/unit/test_datasets_loaders.py create mode 100644 test/unit/test_file_system.py diff --git a/.gitignore b/.gitignore index 9e584fd4..a5f9134a 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,4 @@ dmypy.json .pyre/ # User data -data/ +/data diff --git a/examples/0_loading_data/load_list_of_datasests.py b/examples/0_loading_data/load_list_of_datasests.py index c2ee1cbb..741438e1 100644 --- a/examples/0_loading_data/load_list_of_datasests.py +++ b/examples/0_loading_data/load_list_of_datasests.py @@ -6,9 +6,8 @@ def get_datasets(): 'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol' ] datasets_loader = OpenMLDatasetsLoader() - datasets = datasets_loader.load(dataset_names) - print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:') - print('\n'.join(str(d) for d in datasets)) + datasets = datasets_loader.load(dataset_names, allow_names=True) + print(f'Datasets "{", ".join(dataset_names)}" are downloaded.') return datasets diff --git a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py index 9519e6ca..ad2110a2 100644 --- a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py +++ b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py @@ -1,3 +1,5 @@ +import openml + from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor @@ -6,8 +8,9 @@ def main(): dataset_names = [ 'nomao', 'sylvine' ] + dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names] extractor = 
PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + meta_features = extractor.extract(dataset_ids) return meta_features diff --git a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py index f1d21cf4..cda8b804 100644 --- a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py +++ b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py @@ -9,8 +9,8 @@ def main(): loader = OpenMLDatasetsLoader() extractor = PymfeExtractor(extractor_params={'groups': 'general'}) - cached_datasets = loader.load(dataset_names) - meta_features = extractor.extract(cached_datasets) + datasets = loader.load(dataset_names, allow_names=True) + meta_features = extractor.extract(datasets) return meta_features diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py index b6f2bb8c..5f13201e 100644 --- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py +++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py @@ -8,9 +8,10 @@ def main(): # Define datasets. dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing'] + datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True) # Extract meta-features and load on demand. - extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features = extractor.extract(datasets) # Preprocess meta-features, as KNN does not support NaNs. 
meta_features = meta_features.dropna(axis=1, how='any') # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py index 37c3b2db..e1dc16aa 100644 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py +++ b/examples/4_advising_models/advise_models_from_similar_datasets.py @@ -2,7 +2,7 @@ from golem.core.optimisers.fitness import SingleObjFitness from sklearn.model_selection import train_test_split -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import OpenMLDataset from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model @@ -13,9 +13,10 @@ def main(): # Define datasets. dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing'] + datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True) # Extract meta-features and load on demand. - extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) + extractor = PymfeExtractor(extractor_params={'groups': 'general'}) + meta_features = extractor.extract(datasets) # Preprocess meta-features, as KNN does not support NaNs. meta_features = meta_features.dropna(axis=1, how='any') # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). 
@@ -29,8 +30,8 @@ def main(): PipelineBuilder().add_node('normalization').add_node('logit').build(), PipelineBuilder().add_node('rf').add_node('logit').build() ] - best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))] - for dataset_name, pipeline in zip(y_train, best_pipelines)] + best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))] + for dataset_id, pipeline in zip(y_train, best_pipelines)] dataset_names_to_best_pipelines = dict(zip(y_train, best_models)) advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines) diff --git a/examples/knowledge_base_loading.py b/examples/knowledge_base_loading.py index 699a547f..310b4bdf 100644 --- a/examples/knowledge_base_loading.py +++ b/examples/knowledge_base_loading.py @@ -16,12 +16,12 @@ # ===== Another way to get train models, but also group them by datasets: models_for_train = {} - for dataset_name in train_datasets['dataset_name']: + for dataset_id in train_datasets['dataset_id']: dataset_models = models_loader.load( - dataset_names=[dataset_name], # load models just for this exact dataset. + dataset_ids=[dataset_id], # load models just for this exact dataset. fitness_metric='logloss', # must correspond to a metric name in a knowledge base. 
) - models_for_train[dataset_name] = dataset_models + models_for_train[dataset_id] = dataset_models # If you need to load data to the local storage # dataset = OpenMLDatasetsLoader().load_single(dataset_name) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 26382ebb..c0461f30 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -22,8 +22,8 @@ from sklearn.model_selection import StratifiedKFold from tqdm import tqdm -from meta_automl.data_preparation.data_manager import DataManager -from meta_automl.data_preparation.dataset import DatasetCache, Dataset + +from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor @@ -37,8 +37,8 @@ N_DATASETS = 3 TEST_SIZE = 0.33 # Evaluation timeouts -TRAIN_TIMEOUT = 1 -TEST_TIMEOUT = 1 +TRAIN_TIMEOUT = 0.01 +TEST_TIMEOUT = 0.01 # Models & datasets N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 N_CLOSEST_DATASETS_TO_PROPOSE = 5 @@ -61,7 +61,7 @@ time_now = datetime.now() time_now_iso = time_now.isoformat(timespec="minutes") time_now_for_path = time_now_iso.replace(":", ".") -save_dir = DataManager.get_data_dir(). \ +save_dir = get_data_dir(). 
\ joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') save_dir.mkdir(parents=True) log_file = save_dir.joinpath('log.txt') @@ -75,18 +75,23 @@ ) -def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]: +def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data if N_DATASETS is not None: dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) - dataset_ids = list(dataset_ids) - return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} + df_split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED) + df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1] + df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0] + + datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)} + return df_datasets_train, df_datasets_test, datasets -def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): + +def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): x = data.x y = data.y if len(y.shape) == 1: @@ -127,8 +132,8 @@ def prepare_extractor_and_assessor(datasets_train: List[str]): return data_similarity_assessor, extractor -def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None): - x, y = transform_data_for_fedot(data) +def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None): + x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array')) time_start = timeit.default_timer() fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS) @@ -137,14 +142,14 @@ def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption= metrics = 
get_pipeline_metrics(fedot.current_pipeline, fedot.train_data) pipeline = fedot.current_pipeline - run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, + run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) return fedot, run_results -def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., automl_timeout_min=0., - **metrics): - run_results = dict(dataset_id=dataset.id, +def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., + automl_timeout_min=0., **metrics): + run_results = dict(dataset_id=dataset.id_, dataset_name=dataset.name, run_label=run_label, model_obj=pipeline, @@ -157,7 +162,7 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut return run_results -def extract_best_history_models(dataset_cache, history): +def extract_best_history_models(dataset, history): best_individuals = sorted(chain(*history.individuals), key=lambda ind: ind.fitness, reverse=True) @@ -165,7 +170,7 @@ def extract_best_history_models(dataset_cache, history): best_models = [] for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: pipeline = PipelineAdapter().restore(individual.graph) - model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset_cache) + model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset) best_models.append(model) return best_models @@ -173,22 +178,19 @@ def extract_best_history_models(dataset_cache, history): def main(): baseline_pipeline = PipelineBuilder().add_node('rf').build() - dataset_ids, datasets_cache = prepare_data() + df_datasets_train, df_datasets_test, datasets = prepare_data() - split_datasets = openml_datasets_train_test_split(dataset_ids, 
seed=SEED) - datasets_train = split_datasets[split_datasets['is_train'] == 1]['dataset_name'].to_list() - datasets_test = split_datasets[~split_datasets['is_train'] == 0]['dataset_name'].to_list() + dataset_ids_train = df_datasets_train.index.to_list() + dataset_ids_test = df_datasets_test.index.to_list() evaluation_results = [] best_models_per_dataset = {} progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file): + for dataset_id in tqdm(datasets.keys(), 'FEDOT, all datasets', file=progress_file): try: - cache = datasets_cache[name] - data = cache.from_cache() - - timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT - fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT') + dataset = datasets[dataset_id] + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT + fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') evaluation_results.append(run_results) # TODO: # x Turn the tuned pipeline into a model (evaluate its fitness on the data) @@ -197,38 +199,37 @@ def main(): # Filter out unique individuals with the best fitness history = fedot.history - best_models = extract_best_history_models(cache, history) - best_models_per_dataset[name] = best_models + best_models = extract_best_history_models(dataset, history) + best_models_per_dataset[dataset_id] = best_models except Exception: - logging.exception(f'Train dataset "{name}"') + logging.exception(f'Train dataset "{dataset_id}"') - data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) + data_similarity_assessor, extractor = prepare_extractor_and_assessor(dataset_ids_train) model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) - for name in 
tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file): + for dataset_id in tqdm(dataset_ids_test, 'MetaFEDOT, Test datasets', file=progress_file): try: - cache = datasets_cache[name] - data = cache.from_cache() + dataset = datasets[dataset_id] # Run meta AutoML # 1 time_start = timeit.default_timer() - meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True) meta_features = meta_features.fillna(0) meta_learning_time_sec = timeit.default_timer() - time_start initial_assumptions = model_advisor.predict(meta_features)[0] assumption_pipelines = [model.predictor for model in initial_assumptions] # 2 - fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', initial_assumption=assumption_pipelines) fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec evaluation_results.append(fedot_meta_results) # Fit & evaluate simple baseline baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data) - baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline, + baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline, **baseline_metrics) evaluation_results.append(baseline_res) @@ -236,11 +237,11 @@ def main(): for i, assumption in enumerate(initial_assumptions): pipeline = assumption.predictor assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data) - assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}', + assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}', pipeline=pipeline, **assumption_metrics) 
evaluation_results.append(assumption_res) except Exception: - logging.exception(f'Test dataset "{name}"') + logging.exception(f'Test dataset "{dataset_id}"') progress_file.close() # Save the accumulated results @@ -250,11 +251,11 @@ def main(): for res in evaluation_results: try: res['run_date'] = time_now - dataset_name = res['dataset_name'] + dataset_id = res['dataset_id'] run_label = res['run_label'] # define saving paths - model_path = models_dir.joinpath(f'{dataset_name}_{run_label}') - history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json') + model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') + history_path = history_dir.joinpath(f'{dataset_id}_{run_label}_history.json') # replace objects with export paths for csv res['model_path'] = str(model_path) res.pop('model_obj').save(res['model_path']) @@ -271,12 +272,13 @@ def main(): params = { 'run_date': time_now_iso, 'seed': SEED, - 'n_datasets': N_DATASETS or len(dataset_ids), + 'n_datasets': N_DATASETS or len(datasets), 'test_size': TEST_SIZE, - 'dataset_ids': dataset_ids, - 'dataset_names': list(datasets_cache.keys()), - 'dataset_names_train': datasets_train, - 'dataset_names_test': datasets_test, + 'dataset_ids': list(datasets.keys()), + 'dataset_ids_train': dataset_ids_train, + 'dataset_ids_test': dataset_ids_test, + 'dataset_names_train': df_datasets_train['dataset_name'].to_list(), + 'dataset_names_test': df_datasets_test['dataset_name'].to_list(), 'train_timeout': TRAIN_TIMEOUT, 'test_timeout': TEST_TIMEOUT, 'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE, diff --git a/meta_automl/data_preparation/data_manager.py b/meta_automl/data_preparation/data_manager.py deleted file mode 100644 index 0a743e28..00000000 --- a/meta_automl/data_preparation/data_manager.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pickle -from os import PathLike -from pathlib import Path -from typing import Dict, Any, Union - -PathType = 
Union[PathLike, str] -DEFAULT_CACHE_EXTENSION = '.pkl' - - -class DataManager: - - @classmethod - def get_dataset_cache_path(cls, dataset_name: str) -> Path: - return cls.get_datasets_dir().joinpath(dataset_name).with_suffix(DEFAULT_CACHE_EXTENSION) - - @classmethod - def get_datasets_dir(cls) -> Path: - datasets_dir = cls.get_data_dir().joinpath('datasets') - return cls.ensure_dir_exists(datasets_dir) - - @classmethod - def get_data_dir(cls) -> Path: - data_dir = cls.get_project_root().joinpath('data') - return cls.ensure_dir_exists(data_dir) - - @classmethod - def ensure_dir_exists(cls, dir_: Path) -> Path: - if not dir_.exists(): - dir_.mkdir() - return dir_ - - @classmethod - def get_project_root(cls) -> Path: - """Returns project root folder.""" - return Path(__file__).parents[2] - - @classmethod - def get_meta_features_cache_path(cls, dataset_name: str, source_name: str): - meta_features_dir = cls.ensure_dir_exists(cls.get_data_dir().joinpath(source_name)) - return meta_features_dir.joinpath(dataset_name).with_suffix('.pkl') - - @classmethod - def get_meta_features_dict(cls, dataset_name: str, source_name: str) -> Dict[str, Any]: - meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name) - if not meta_features_file.exists(): - return {} - with open(meta_features_file, 'rb') as f: - meta_features = pickle.load(f) - return meta_features - - @classmethod - def update_meta_features_dict(cls, dataset_name: str, source_name: str, meta_features: Dict[str, Any]): - meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name) - meta_features_old = cls.get_meta_features_dict(dataset_name, source_name) - with open(meta_features_file, 'wb') as f: - meta_features_old.update(meta_features) - pickle.dump(meta_features, f) diff --git a/meta_automl/data_preparation/dataset.py b/meta_automl/data_preparation/dataset.py deleted file mode 100644 index 23dda83c..00000000 --- a/meta_automl/data_preparation/dataset.py +++ /dev/null @@ 
-1,64 +0,0 @@ -from __future__ import annotations - -import pickle -from dataclasses import dataclass -from pathlib import Path -from typing import Union, Optional, List - -import numpy as np -import pandas as pd -import scipy as sp - -from meta_automl.data_preparation.data_manager import DataManager - - -class NoCacheError(FileNotFoundError): - pass - - -@dataclass -class DatasetCache: - name: str - _cache_path: Optional[Path] = None - _id: Optional[int] = None - - @property - def id(self): - return self._id or self.name - - @property - def cache_path(self): - return self._cache_path or DataManager.get_dataset_cache_path(self.name) - - @cache_path.setter - def cache_path(self, val): - self._cache_path = val - - def from_cache(self) -> Dataset: - if not self.cache_path.exists(): - raise NoCacheError(f'Dataset {self.name} not found!') - with open(self.cache_path, 'rb') as f: - dataset = pickle.load(f) - dataset.cache_path = self.cache_path - return dataset - - -@dataclass -class Dataset: - name: str - x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix] - y: Optional[Union[np.ndarray, pd.DataFrame]] = None - categorical_indicator: Optional[List[bool]] = None - attribute_names: Optional[List[str]] = None - cache_path: Optional[Path] = None - _id: Optional[int] = None - - def dump_to_cache(self, cache_path: Optional[Path] = None) -> DatasetCache: - cache_path = cache_path or self.cache_path - with open(cache_path, 'wb') as f: - pickle.dump(self, f) - return DatasetCache(self.name, cache_path, self.id) - - @property - def id(self): - return self._id or self.name diff --git a/meta_automl/data_preparation/dataset/__init__.py b/meta_automl/data_preparation/dataset/__init__.py new file mode 100644 index 00000000..62c0a37d --- /dev/null +++ b/meta_automl/data_preparation/dataset/__init__.py @@ -0,0 +1,3 @@ +from .dataset_base import DatasetBase, DatasetData, DatasetIDType +from .custom_dataset import DataNotFoundError, CustomDataset +from .openml_dataset import 
OpenMLDataset, OpenMLDatasetIDType diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py new file mode 100644 index 00000000..505868f6 --- /dev/null +++ b/meta_automl/data_preparation/dataset/custom_dataset.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import pickle +from pathlib import Path +from typing import Optional + +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.dataset.dataset_base import DatasetData + + + +class DataNotFoundError(FileNotFoundError): + pass + + +class CustomDataset(DatasetBase): + + def get_data(self, cache_path: Optional[Path] = None) -> DatasetData: + cache_path = cache_path or self.cache_path + if not cache_path.exists(): + raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".') + with open(cache_path, 'rb') as f: + dataset_data = pickle.load(f) + return dataset_data + + def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset: + cache_path = cache_path or self.cache_path + with open(cache_path, 'wb') as f: + pickle.dump(dataset_data, f) + return self diff --git a/meta_automl/data_preparation/dataset/dataset_base.py b/meta_automl/data_preparation/dataset/dataset_base.py new file mode 100644 index 00000000..fd84dee5 --- /dev/null +++ b/meta_automl/data_preparation/dataset/dataset_base.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from abc import abstractmethod, ABC +from dataclasses import dataclass +from pathlib import Path +from typing import Union, Optional, List, Any + +import numpy as np +import pandas as pd +import scipy as sp + +from meta_automl.data_preparation.file_system import CacheOperator, get_dataset_cache_path + +DatasetIDType = Any + + +@dataclass +class DatasetData: + x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix] + y: Optional[Union[np.ndarray, pd.DataFrame]] = None + categorical_indicator: 
Optional[List[bool]] = None + attribute_names: Optional[List[str]] = None + + +class DatasetBase(ABC, CacheOperator): + + def __init__(self, id_: DatasetIDType, name: Optional[str] = None): + self.id_ = id_ + self.name = name + + def __repr__(self): + return f'{self.__class__.__name__}(id_={self.id_}, name={self.name})' + + @abstractmethod + def get_data(self) -> DatasetData: + raise NotImplementedError() + + @property + def cache_path(self) -> Path: + return get_dataset_cache_path(self) diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py new file mode 100644 index 00000000..08fc5c1d --- /dev/null +++ b/meta_automl/data_preparation/dataset/openml_dataset.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import Union + +import openml + +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.dataset.dataset_base import DatasetData +from meta_automl.data_preparation.file_system import update_openml_cache_dir + +OpenMLDatasetIDType = int + +update_openml_cache_dir() + + +class OpenMLDataset(DatasetBase): + + def __init__(self, id_: OpenMLDatasetIDType): + if isinstance(id_, str): + raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id.' 
+ f'Otherwise, you can perform search by f{self.__class__.__name__}.from_search().') + self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + error_if_multiple=True) + id_ = self._openml_dataset.id + name = self._openml_dataset.name + super().__init__(id_, name) + + @classmethod + def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset: + openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + **get_dataset_kwargs) + return cls(openml_dataset.id) + + def get_data(self, dataset_format: str = 'dataframe') -> DatasetData: + X, y, categorical_indicator, attribute_names = self._openml_dataset.get_data( + target=self._openml_dataset.default_target_attribute, + dataset_format=dataset_format + ) + return DatasetData(X, y, categorical_indicator, attribute_names) diff --git a/meta_automl/data_preparation/datasets_loaders/__init__.py b/meta_automl/data_preparation/datasets_loaders/__init__.py index 3908c8e0..4b91c8aa 100644 --- a/meta_automl/data_preparation/datasets_loaders/__init__.py +++ b/meta_automl/data_preparation/datasets_loaders/__init__.py @@ -1,2 +1,2 @@ from .datasets_loader import DatasetsLoader -from .openml_datasets_loader import OpenMLDatasetsLoader, OpenMLDatasetID +from .openml_datasets_loader import OpenMLDatasetsLoader diff --git a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py index 8faba6d0..ab6ffa6c 100644 --- a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py @@ -1,25 +1,17 @@ from __future__ import annotations from abc import abstractmethod -from typing import List, Type +from typing import List -from meta_automl.data_preparation.data_manager import DataManager -from meta_automl.data_preparation.dataset import Dataset, DatasetCache, NoCacheError 
+from meta_automl.data_preparation.dataset import DatasetBase class DatasetsLoader: - data_manager: Type[DataManager] = DataManager @abstractmethod - def load(self, *args, **kwargs) -> List[DatasetCache]: + def load(self, *args, **kwargs) -> List[DatasetBase]: raise NotImplementedError() @abstractmethod - def load_single(self, *args, **kwargs) -> DatasetCache: + def load_single(self, *args, **kwargs) -> DatasetBase: raise NotImplementedError() - - def cache_to_memory(self, dataset: DatasetCache) -> Dataset: - try: - return dataset.from_cache() - except NoCacheError: - return self.load_single(dataset.id).from_cache() diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index 7959ca61..11294c45 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -1,57 +1,43 @@ from __future__ import annotations -import shutil -from pathlib import Path -from typing import List, Union +from typing import List, Union, Optional -import openml +from golem.core.log import default_log -from meta_automl.data_preparation.dataset import DatasetCache, Dataset +from meta_automl.data_preparation.dataset import OpenMLDataset, OpenMLDatasetIDType from meta_automl.data_preparation.datasets_loaders import DatasetsLoader -OpenMLDatasetID = Union[str, int] - - -def _clear_openml_cache(): - cache_dir = openml.config.get_cache_directory() - cache_dir = Path(cache_dir) - shutil.rmtree(cache_dir) - class OpenMLDatasetsLoader(DatasetsLoader): + def __init__(self, allow_names: bool = False): + self.dataset_ids = [] + self._allow_names = allow_names - def __init__(self): - self.dataset_sources = [] - - def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]: - self.dataset_sources = dataset_sources + def load(self, dataset_ids: List[Union[OpenMLDatasetIDType, str]], + 
allow_names: Optional[bool] = None) -> List[OpenMLDataset]: + self.dataset_ids += dataset_ids + allow_names = self._allow_names if allow_names is None else allow_names datasets = [] # TODO: Optimize like this # https://github.com/openml/automlbenchmark/commit/a09dc8aee96178dd14837d9e1cd519d1ec63f804 - for source in self.dataset_sources: - dataset = self.load_single(source) + for dataset_id in self.dataset_ids: + dataset = self.load_single(dataset_id, allow_name=allow_names) datasets.append(dataset) return datasets - def load_single(self, source: OpenMLDatasetID): - try: - return self.get_openml_dataset(source) - finally: - _clear_openml_cache() - - def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache: - openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=False) - name = openml_dataset.name.lower() - dataset_cache_path = self.data_manager.get_dataset_cache_path(name) - if dataset_cache_path.exists() and not force_download: - dataset_cache = DatasetCache(name, dataset_cache_path) + def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str], + allow_name: Optional[bool] = None) -> OpenMLDataset: + allow_name = self._allow_names if allow_name is None else allow_name + + if allow_name: + dataset = OpenMLDataset.from_search(dataset_id) else: - dataset_id = openml_dataset.id - X, y, categorical_indicator, attribute_names = openml_dataset.get_data( - target=openml_dataset.default_target_attribute, - dataset_format='array' - ) - dataset = Dataset(name, X, y, categorical_indicator, attribute_names, _id=dataset_id) - dataset_cache = dataset.dump_to_cache(dataset_cache_path) - return dataset_cache + dataset = OpenMLDataset(dataset_id) + + self.dataset_ids.append(dataset.id_) + return dataset + + @property + def _log(self): + return default_log(self) diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py new 
file mode 100644 index 00000000..a228da6e --- /dev/null +++ b/meta_automl/data_preparation/file_system/__init__.py @@ -0,0 +1,5 @@ +from meta_automl.data_preparation.file_system.file_system import PathType, get_project_root, get_data_dir +from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_dataset_cache_path, + get_dataset_cache_path_by_id, get_meta_features_cache_path, + get_local_meta_features, update_local_meta_features, + get_openml_cache_dir, update_openml_cache_dir) diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py new file mode 100644 index 00000000..99daf965 --- /dev/null +++ b/meta_automl/data_preparation/file_system/cache.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import pickle +from pathlib import Path + +from typing import Type, Any, Dict, TYPE_CHECKING + +import openml + +from meta_automl.data_preparation.file_system.cache_properties import CacheProperties, CacheType +from meta_automl.data_preparation.file_system.file_system import get_data_dir, ensure_dir_exists + +if TYPE_CHECKING: + from meta_automl.data_preparation.dataset import DatasetBase + from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor + + +class CacheOperator: + pass + + +def get_openml_cache_dir() -> Path: + return get_data_dir().joinpath('openml_cache') + + +def get_full_openml_cache_dir() -> Path: + return get_data_dir().joinpath('openml_cache/org/openml/www') + + +def update_openml_cache_dir(): + openml_cache_path = str(get_openml_cache_dir()) + openml.config.set_cache_directory(openml_cache_path) + + +def _get_cache_path(object_class: Type[CacheOperator], object_id: str, _create_parent_dir: bool = True) -> Path: + cache_properties = get_cache_properties(object_class.__name__) + directory = cache_properties.dir_ + path = cache_properties.template.format(id_=object_id) + path = directory.joinpath(path) + if _create_parent_dir: + 
ensure_dir_exists(directory) + return path + + +def get_dataset_cache_path(dataset: DatasetBase) -> Path: + class_ = dataset.__class__ + id_ = dataset.id_ + return _get_cache_path(class_, str(id_)) + + +def get_dataset_cache_path_by_id(class_: Type[DatasetBase], id_: Any) -> Path: + return _get_cache_path(class_, str(id_)) + + +def get_meta_features_cache_path(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Path: + return _get_cache_path(extractor_class, str(dataset_id)) + + +def get_local_meta_features(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Dict[str, Any]: + meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id) + if not meta_features_file.exists(): + return {} + with open(meta_features_file, 'rb') as f: + meta_features = pickle.load(f) + return meta_features + + +def update_local_meta_features(extractor_class: Type[MetaFeaturesExtractor], + dataset_id: Any, meta_features: Dict[str, Any]): + meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id) + meta_features_old = get_local_meta_features(extractor_class, dataset_id) + with open(meta_features_file, 'wb') as f: + meta_features_old.update(meta_features) + pickle.dump(meta_features_old, f) + + +def get_cache_properties(class_name: str) -> CacheProperties: + cache_properties_by_class_name = { + 'OpenMLDataset': CacheProperties( + type_=CacheType.directory, + dir_=get_full_openml_cache_dir().joinpath('datasets'), + template='{id_}'), + 'CustomDataset': CacheProperties( + type_=CacheType.file, + dir_=get_data_dir().joinpath('datasets/custom_dataset'), + template='{id_}.pkl'), + 'PymfeExtractor': CacheProperties( + type_=CacheType.file, + dir_=get_data_dir().joinpath('metafeatures/pymfe'), + template='{id_}.pkl'), + } + try: + return cache_properties_by_class_name[class_name] + except KeyError as e: + raise KeyError(f'Cache properties for the class {class_name} are not defined.').with_traceback(e.__traceback__) diff --git 
a/meta_automl/data_preparation/file_system/cache_properties.py b/meta_automl/data_preparation/file_system/cache_properties.py new file mode 100644 index 00000000..7374df08 --- /dev/null +++ b/meta_automl/data_preparation/file_system/cache_properties.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from meta_automl.data_preparation.file_system import PathType + + +class CacheType(Enum): + file = 'file' + directory = 'directory' + + +@dataclass +class CacheProperties: + type_: Optional[CacheType] = None + dir_: Optional[Path] = None + template: Optional[PathType] = None diff --git a/meta_automl/data_preparation/file_system/file_system.py b/meta_automl/data_preparation/file_system/file_system.py new file mode 100644 index 00000000..ff2c3743 --- /dev/null +++ b/meta_automl/data_preparation/file_system/file_system.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from os import PathLike +from pathlib import Path +from typing import Union + +PathType = Union[PathLike, str] + +DATA_SUBDIR = 'data' + + +def ensure_dir_exists(dir_: Path) -> Path: + if dir_.is_file(): + dir_ = dir_.parent + if not dir_.exists(): + dir_.mkdir(parents=True) + return dir_ + + +def get_project_root() -> Path: + """Returns project root folder.""" + return Path(__file__).parents[3] + + +def get_data_dir() -> Path: + data_dir = get_project_root().joinpath(DATA_SUBDIR) + return data_dir diff --git a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py index dc7ccf5a..d81e8cbd 100644 --- a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py @@ -1,28 +1,28 @@ from __future__ import annotations -from abc import 
abstractmethod -from typing import Optional, Iterable, Dict, Any, Type +from abc import abstractmethod, ABC +from typing import Optional, Iterable, Dict, Any import pandas as pd -from meta_automl.data_preparation.data_manager import DataManager +from meta_automl.data_preparation.dataset import DatasetIDType +from meta_automl.data_preparation.file_system import (CacheOperator, get_local_meta_features, + update_local_meta_features) -class MetaFeaturesExtractor: - DEFAULT_PARAMS: Optional[Dict[str, Any]] = None - SOURCE: Optional[str] = None - data_manager: Type[DataManager] = DataManager +class MetaFeaturesExtractor(ABC, CacheOperator): + default_params: Optional[Dict[str, Any]] = None @abstractmethod def extract(self, datasets) -> pd.DataFrame: raise NotImplementedError() - def _get_meta_features_cache(self, dataset_name: str, meta_feature_names: Iterable[str]): - cache = self.data_manager.get_meta_features_dict(dataset_name, self.SOURCE) + def _get_meta_features_cache(self, dataset_id: DatasetIDType, meta_feature_names: Iterable[str]): + cache = get_local_meta_features(self.__class__, str(dataset_id)) if set(meta_feature_names) ^ cache.keys(): return None else: return {mf_name: cache[mf_name] for mf_name in meta_feature_names} - def _update_meta_features_cache(self, dataset_name: str, meta_features_dict: Dict[str, Any]): - self.data_manager.update_meta_features_dict(dataset_name, self.SOURCE, meta_features_dict) + def _update_meta_features_cache(self, dataset_id: DatasetIDType, meta_features_dict: Dict[str, Any]): + update_local_meta_features(self.__class__, dataset_id, meta_features_dict) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index 8dbc728f..edfa6925 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ -6,17 +6,16 @@ from 
golem.core.log import default_log from pymfe.mfe import MFE -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor class PymfeExtractor(MetaFeaturesExtractor): - DEFAULT_PARAMS = {'groups': 'default'} - SOURCE = 'pymfe' + default_params = {'groups': 'default'} def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: DatasetsLoader = None): - self.extractor_params = extractor_params if extractor_params is not None else self.DEFAULT_PARAMS + self.extractor_params = extractor_params if extractor_params is not None else self.default_params self._datasets_loader = datasets_loader or OpenMLDatasetsLoader() self._extractor = MFE(**self.extractor_params) self._logger = default_log(self) @@ -27,21 +26,21 @@ def datasets_loader(self) -> DatasetsLoader: raise ValueError("Datasets loader not provided!") return self._datasets_loader - def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False, - use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: + def extract(self, datasets_or_ids: List[Union[DatasetBase, DatasetIDType]], + fill_input_nans: bool = False, use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame: meta_features = {} meta_feature_names = self._extractor.extract_metafeature_names() - load_dataset = self.datasets_loader.cache_to_memory - for dataset in datasets: - if isinstance(dataset, str): - dataset = DatasetCache(dataset) - self._logger.info(f'Extracting meta features of the dataset {dataset.name}...') + for dataset in datasets_or_ids: + if not isinstance(dataset, DatasetBase): + dataset = self._datasets_loader.load_single(dataset) + + self._logger.info(f'Extracting meta features of the dataset {dataset}...') if 
(use_cached and - (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))): - meta_features[dataset.name] = mfs + (mfs := self._get_meta_features_cache(dataset.id_, meta_feature_names))): + meta_features[dataset.id_] = mfs else: - loaded_dataset = load_dataset(dataset) + loaded_dataset = dataset.get_data(dataset_format='array') cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val] x = loaded_dataset.x y = loaded_dataset.y @@ -51,8 +50,8 @@ def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: boo feature_names, dataset_features = mfe.extract(out_type=tuple) mfs = dict(zip(feature_names, dataset_features)) if update_cached: - self._update_meta_features_cache(dataset.name, mfs) - meta_features[dataset.name] = mfs + self._update_meta_features_cache(dataset.id_, mfs) + meta_features[dataset.id_] = mfs meta_features = pd.DataFrame.from_dict(meta_features, orient='index') return meta_features diff --git a/meta_automl/data_preparation/model.py b/meta_automl/data_preparation/model.py index 25de781c..d437ea24 100644 --- a/meta_automl/data_preparation/model.py +++ b/meta_automl/data_preparation/model.py @@ -3,13 +3,16 @@ from golem.core.optimisers.fitness import Fitness -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.dataset import DatasetBase + + +PredictorType = Any @dataclass class Model: - predictor: Any + predictor: PredictorType fitness: Fitness fitness_metric_name: str - dataset_cache: DatasetCache + dataset: DatasetBase metadata: Dict[str, Any] = field(default_factory=dict) diff --git a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py index ae7f0b38..599056fa 100644 --- a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py +++ b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py @@ -14,8 +14,8 @@ from golem.core.log 
import default_log from tqdm import tqdm -from meta_automl.data_preparation.data_manager import PathType -from meta_automl.data_preparation.dataset import DatasetCache +from meta_automl.data_preparation.file_system import PathType +from meta_automl.data_preparation.dataset import DatasetBase from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader from meta_automl.data_preparation.model import Model from meta_automl.data_preparation.models_loaders import ModelsLoader @@ -29,10 +29,9 @@ def evaluate_classification_fedot_pipeline(pipeline, input_data): return fitness -def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pipeline], datasets_loader: DatasetsLoader, - n_best: int = 1) -> List[Model]: - loaded_dataset = datasets_loader.cache_to_memory(dataset_cache) - X, y_test = loaded_dataset.x, loaded_dataset.y +def get_n_best_fedot_performers(dataset: DatasetBase, pipelines: List[Pipeline], n_best: int = 1) -> List[Model]: + data = dataset.get_data() + X, y_test = data.x, data.y input_data = InputData(idx=np.arange(0, len(X)), features=X, target=y_test, data_type=DataTypesEnum.table, task=Task(TaskTypesEnum.classification)) fitnesses = [] @@ -41,14 +40,14 @@ def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pip for pipeline in tqdm(pipelines, desc='Evaluating pipelines'): fitness = evaluate_classification_fedot_pipeline(pipeline, input_data) fitnesses.append(fitness) - models.append(Model(pipeline, fitness, metric_name, dataset_cache)) + models.append(Model(pipeline, fitness, metric_name, dataset)) best_models = [models.pop(np.argmax(fitnesses)) for _ in range(min(n_best, len(pipelines)))] return best_models class FEDOTPipelinesLoader(ModelsLoader): - def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Literal['auto']] = 'auto', + def __init__(self, datasets_to_load: Union[List[Union[DatasetBase, str]], Literal['auto']] = 'auto', candidate_pipelines: 
Optional[List[List[Pipeline]]] = None, candidate_pipeline_paths: Optional[List[List[PathType]]] = None, launch_dir: Optional[PathType] = None, @@ -56,12 +55,12 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter self.log = default_log(self) - self.datasets_loader = datasets_loader or OpenMLDatasetsLoader() + self.datasets_loader = datasets_loader or OpenMLDatasetsLoader(allow_names=True) self.launch_dir: Path = Path(launch_dir) if isinstance(launch_dir, str) else launch_dir - self._datasets: List[DatasetCache] = (self._define_datasets() if datasets_to_load == 'auto' - else self._dataset_names_to_cache(datasets_to_load)) + self._datasets: List[DatasetBase] = (self._define_datasets() if datasets_to_load == 'auto' + else self._get_datasets_from_names(datasets_to_load)) self.candidate_pipelines = candidate_pipelines @@ -71,8 +70,8 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter def load(self, datasets: Union[List[str], Literal['auto']] = 'auto', n_best: int = 1) -> List[List[Model]]: if datasets != 'auto': - datasets = self._dataset_names_to_cache(datasets) - difference = set(d.name for d in datasets) - set(self.dataset_names) + datasets = self._get_datasets_from_names(datasets) + difference = set(d.name for d in datasets) - set(self.dataset_ids) if difference: raise ValueError(f'Results for these datasets are not available: {difference}.') else: @@ -89,10 +88,10 @@ def _define_pipeline_paths(self) -> List[List[Path]]: if not self.launch_dir: raise ValueError('Launch dir or model paths must be provided!') - dataset_names = self.dataset_names - datasets_models_paths = dict(zip(dataset_names, [[]] * len(dataset_names))) + dataset_ids = self.dataset_ids + datasets_models_paths = dict(zip(dataset_ids, [[]] * len(dataset_ids))) - for dataset_name in tqdm(dataset_names, desc='Defining model paths', unit='dataset'): + for dataset_name in tqdm(dataset_ids, desc='Defining model paths', unit='dataset'): for 
model_path in self.launch_dir.joinpath(dataset_name).glob(r'FEDOT*\*\*\launch_*.json'): datasets_models_paths[dataset_name].append(model_path) @@ -104,28 +103,27 @@ def _import_pipelines(self, candidate_pipeline_paths: List[List[PathType]]): desc='Importing pipelines', unit='dataset'): candidates_for_dataset = [Pipeline.from_serialized(str(p)) for p in paths] if not candidates_for_dataset: - self.log.warning(f'No pipelines found for the dataset "{dataset.name}".') + self.log.warning(f'No pipelines found for the dataset "{dataset}".') candidate_pipelines.append(candidates_for_dataset) self.candidate_pipelines = candidate_pipelines - def _define_datasets(self) -> List[DatasetCache]: + def _define_datasets(self) -> List[DatasetBase]: if not self.launch_dir: raise ValueError('Launch dir or datasets must be provided!') datasets = list({p.parents[2].name for p in self.launch_dir.glob(r'*\FEDOT*\*\launch_0')}) datasets.sort() - datasets = self._dataset_names_to_cache(datasets) + datasets = self._get_datasets_from_names(datasets) return datasets @property - def dataset_names(self): - return [d.name if isinstance(d, DatasetCache) else d for d in self._datasets] + def dataset_ids(self): + return [d.name if isinstance(d, DatasetBase) else d for d in self._datasets] - @staticmethod - def _dataset_names_to_cache(datasets: List[Union[str, DatasetCache]]) -> List[DatasetCache]: + def _get_datasets_from_names(self, datasets: List[Union[str, DatasetBase]]) -> List[DatasetBase]: new_list = [] for dataset in datasets: - if isinstance(dataset, str): - dataset = DatasetCache(dataset) + if not isinstance(dataset, DatasetBase): + dataset = self.datasets_loader.load_single(dataset) new_list.append(dataset) return new_list diff --git a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py index e26b896e..7c38b9d8 100644 --- 
a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py +++ b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py @@ -7,12 +7,13 @@ from fedot.core.pipelines.pipeline import Pipeline from golem.core.optimisers.fitness import SingleObjFitness -from meta_automl.data_preparation.data_manager import DataManager -from meta_automl.data_preparation.dataset import DatasetCache + +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_data_dir from meta_automl.data_preparation.model import Model from meta_automl.data_preparation.models_loaders import ModelsLoader -DEFAULT_KNOWLEDGE_BASE_PATH = DataManager.get_data_dir().joinpath('knowledge_base_0') +DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir().joinpath('knowledge_base_0') class KnowledgeBaseModelsLoader(ModelsLoader): @@ -21,21 +22,21 @@ def __init__(self, knowledge_base_path: Union[str, PathLike] = DEFAULT_KNOWLEDGE self.df_knowledge_base: Optional[pd.DataFrame] = None self.df_datasets: Optional[pd.DataFrame] = None - def load(self, dataset_names: Optional[Sequence[str]] = None, + def load(self, dataset_ids: Optional[Sequence[str]] = None, fitness_metric: str = 'f1') -> List[Model]: if self.df_knowledge_base is None: knowledge_base_split_file = self.knowledge_base_path.joinpath('knowledge_base.csv') self.df_knowledge_base = pd.read_csv(knowledge_base_split_file) - if dataset_names is None: - dataset_names = self.parse_datasets()['dataset_name'] + if dataset_ids is None: + dataset_ids = self.parse_datasets()['dataset_id'] df_knowledge_base = self.df_knowledge_base - df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_name'].isin(dataset_names)] + df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_id'].isin(dataset_ids)] cached_datasets = {} - for name in dataset_names: - cached_datasets[name] = DatasetCache(name) + for id_ in dataset_ids: + cached_datasets[id_] = OpenMLDataset(id_) 
models = [] for _, row in df_knowledge_base.iterrows(): @@ -45,7 +46,7 @@ def load(self, dataset_names: Optional[Sequence[str]] = None, metric_value = row[fitness_metric] fitness = SingleObjFitness(metric_value) metadata = dict(row) - dataset_cache = cached_datasets[row['dataset_name']] + dataset_cache = cached_datasets[row['dataset_id']] model = Model(predictor, fitness, fitness_metric, dataset_cache, metadata) models.append(model) return models diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py index 09720a1e..40008d00 100644 --- a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py +++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py @@ -1,10 +1,11 @@ from abc import ABC -from typing import Optional, Dict, Any, List, Iterable +from typing import Optional, List, Iterable import numpy as np import pandas as pd from sklearn.neighbors import NearestNeighbors +from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \ DatasetsSimilarityAssessor @@ -13,7 +14,7 @@ class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor): def __init__(self, model, n_best: int = 1): self._inner_model = model self.n_best = n_best - self._datasets: Optional[Iterable[str]] = None + self._datasets: Optional[Iterable[DatasetIDType]] = None class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor): @@ -21,7 +22,7 @@ def __init__(self, n_neighbors: int = 1, **model_params): model = NearestNeighbors(n_neighbors=n_neighbors, **model_params) super().__init__(model, n_neighbors) - def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): + def fit(self, meta_features: pd.DataFrame, datasets: Iterable[DatasetIDType]): meta_features 
= self.preprocess_meta_features(meta_features) self._datasets = np.array(datasets) self._inner_model.fit(meta_features) @@ -30,7 +31,7 @@ def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]): def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame: return meta_features.dropna(axis=1, how='any') - def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]: + def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[DatasetIDType]]: dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance) if return_distance: distances, dataset_indexes = dataset_indexes diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py index a9ca0d97..c653a173 100644 --- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py @@ -1,8 +1,9 @@ from abc import abstractmethod -from typing import List, Dict, Iterable, Optional +from typing import List, Dict, Iterable import pandas as pd +from meta_automl.data_preparation.dataset import DatasetIDType from meta_automl.data_preparation.model import Model from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor @@ -17,13 +18,13 @@ def predict(self, *args, **kwargs) -> List[List[Model]]: class SimpleSimilarityModelAdvisor(ModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor): self.similarity_assessor = fitted_similarity_assessor - self.best_models: Dict[str, List[Model]] = {} + self.best_models: Dict[DatasetIDType, List[Model]] = {} @property def datasets(self): return self.similarity_assessor.datasets - def fit(self, dataset_names_to_best_pipelines: Dict[str, List[Model]]): + def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, List[Model]]): 
self.best_models.update(dataset_names_to_best_pipelines) return self diff --git a/requirements.txt b/requirements.txt index eca13d853ca1f8e55c583bd3790a78a679ffee4d..ad0a22332f176f2c866188116575624428ac1536 100644 GIT binary patch delta 38 pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006?}>0$1-Lige-*6QG}uhMaV)3MJPf?D2k#8RTheng`#wy z&wHP*{p8_2*EZe1SA6gH_vicjJ%8TkJ?GfD=1eyI#h?ClH(aZi-nM7|uA6qC`wbfT`C#Cg~wV(czwLf0lvbyQ^UAOI7TYb&0 zyWY3|&RzR%-F5rg>Unpst-kj5yKY-~{~K<()!EqT*Hs6yZvi7AG-3y{Vs0umi0B}XZ$fX zHikugHqzGp7wOWs&9`%1q)T0!kKgu*J{(>AmXojhL^BpkPWEbfJ^Gz{9Br*1>$NDK zc-=oeYp(6ojExgrH1S)mYd5k}*J|pxwpTQ)F%!RvW+OF}ocnnvAFK z6+b){OD;SVHnoX`Yi;ast4X|OHMQT`b&oM^9b06#Js0_@57)Y}*jPTgu$tK6#MU$G zKkHtt-PYCl<$Cq=(-Y6HX-0Et7tXG>K^XtB2W7YXZ^{pm-YJD`}?VH&zZu*O zwv+$Ww|0|9us;l}+eB;&_z*Y`|3dg*!hZxV#$JY3;WxoQ3LeD%2G~aL2iE<`pTM60 zpCi5x{&!#%JcR#Y@GSUm_($NYz^6dR9KVm)gW%`D^~4Ua*9H0{#!iz=hy^umXMqJW1{cVb}1p@MG{*@N@LL9ex=9_uymj zhu~iU2eH2mT;r?24lqN1BeC7sKLu9s*^ll$*f)cdrN*_mC;IoYUmqdoTHnoj{tR}F z*T9#9Uj|p_07GmoyLhxcRoif;1X>Epeda&F!R=e~UqyCbIdFR?GA?kMa0UF^feyvNRYEA^byc^ z?w7zj!NvHW06&7g1-6&Zj_2_L@OrS7J-HWNBj-Ky-rD;=vqtyY=ZiH>9>uKNBoILy6@P~VW?VkJ^yhN>g?3!ob*NM4D z2eCcNYV^IA9|he1?lSlgVC};h+f3|@Xq=xl$MHV^d)6-n^qqVh{#U^B_*=j;y#;)R zKKFpPVE;$pzU&0=1wJ!-VE5106rZ^X_vu#bONi|R&fR-|J!5$O-Nz+xH+T;0BDMy4 z>fDbV-~d|h-7)(8Cj9#V{lxKI_g}KE|BAhg?V7!x6Yw$YYvFyM!#)AtfPE3{YnHDE z&gWt953oHiuJy;kW5iwnuFq$HIduOR)}Jvqd#T~sI@Q}XUcy|QKibLt@b$oF{~B`6 z>o0)w;!Jdpfb)Spe-b#a_rRX-dtvXB&&{jBSIN0Ij(Hxv{|ELe_TK{U^^?Fp#_k91 zgJ=Gy!8fosgKMze2hVd4ei8g9@OfhH%k}IjXSur!^u&G`_Vwl=U~kXB$FYA4{z334 z_6^{5*lVz7)${S|@SlO70sjcRNbIlRIs0b6e~SHDa4+`1hP|g>!u}!nGI$Bx&l->5 z`}*jacP&1-;41_1yYwxE^1Vz6=hd-v<8#Jsu|hG2nT=2>-A?xn*iUjO{+T zUhk1{pUrDv&y#EZ9QIydKlW*OAAAP;ugQIse%{a9z_*CGCeNfbKLGn2AIE+b?Af)) zAsBnIO3b{WH0u3 z@%sHr?!$fnytlx4xvoFNe;4pM`~&zNuoL?-;Jx1m&L>8X;r-Ba<=(%7n6G!=1G|6j z&oSV0>G|6PT$`_zEAXd*=lxmcV&1*D3%&{*!9GFmA7WpM?OZOv_I!N>TkY4V^O=4< z_&h%QH_ZJ3>e##C-mb#86MqWc0@N~I_gUbxpw|2DS@7)eoDJvL@jQnQ0M;}-OE+M@ 
z9W84c=6e9!e%pz6;056PJzL+x-T_>@`{Did^~-TLfo}uzPlBbwAA+9)K2yGT8fT3| zt@X~|nja!}xp7`&)?i;_4#LOLEMa?}9M}45;H`l3H|)P_@;S8LGw5@;6I?>x-W@%C z<~+ZTz}~|tF>?s*K9eF3}?oon#9w9cAC*smo%qi(qO#CqUssdIB)o)bTN9)o-I z);X`Ou(tQldvEVez?|<*zRr9Z-~G50_I_Pabn2boRiI;RKVNy)K7qd(=GhtQ?B&^V zzV7oja1h&P)3vETNq!0R^!ah(&f!wnGw7V`>sZ&~JJ+Z19e)RGz5DFGyBEG*4EGlM zGsK;{b2F#TI`_n9$Y)sF{r4F>jBP)ko!mFi)=p}jtFak#bUr=)r-5tP4xGzT*fC~k zjQjj_#Cgvi=I?vWar`&K?!|*>yeI0d{UUHaYFy6=?4O10<-E3mqvURY9p|m&T!-^= zoxb+IgujH{1M7x;f-eBOfpzMS!>>f|c<#*v$Ucjn@q6LNz&>DXncNBBKDZ}7%a6nR z!NtJ7KTMtX?C#=U4x9gNVl&q0{G8LXz%@FKdvA=-9>X}EIp?9?v*x*R-R_s~De9fG z>vA710>^;;9)?$e&*jen*RU6OW|r~o<5)+)Q(zOAGLE>EynEuld4JXY1~L0x0-Vzh z*m16b)w_?5|9RlPKLFGpM&o{n%fYPF9Ke1O_#Si*JO{qEKLZ~{nE{2+lR63H6`yD?uGa5ZRG6bp1ZC` z!Exg5ou4JVS89y8r}njW0y=Eh=I6*8VEehgjxpQ=_hK`?&!fGEbz*xath0~L!WL}z z;{?9X(q{Z$D;me|u`ht#Pj$Xlc{W{-xt==bBfE!J0r%tQ;fsLd`r7*#G1qw#*vIG8 zIqbx@x9r?ptLu;*SKIaY8UF?NAaES_$n)dp2YVTR1X$<$sQc=3<5o&pOYsYpz?t!r>_F-@k+qGQ?F9Ut|dlk%y`&#u`Y@dZIvE5tua5#tDi@Vjq-gn2a z&NVr{p9^}}wM@X}!10&KJC18OQtYdVe-q#RbRTEH_cPb~BJ7;4cTSGu^X(jb9Ww5E zJVR^Pj&}n8Lj2zZu7792A^2Ki_rlKU>);jm*<`Goz6rL!?=LUFJIJ|D#8)DKZ?db139Mso+swm%b)|+`>gy7*h0>@=hZRXTi@e4&W$>eD}LB`?C5Gg zcCD-JI^V|iy=qPDn#VMWh1Iv$q8@d8k=^!Pq&u5<-7ES!R@=$Tx$0Q#)P$o?zUEp} z$73fi=Sr^S!Y0gJkz7)c3v4L@tSM9?X!`ex>jS{ zs~t1Cd-5s9!YUMY@gRXItCqT7PT|HC}bD_G?>B{BT`& zE_PjC+h^0G)$4n0RNLkk`E5Ks>$tX;xo~tXuPeS&+t#&y+vo4&e>c5T6VCYh^;O4X z%U(4v%B?G3e(&;LE_Zc)VF-L^^4|WKd-jWdUX9~or^YH=VG5rF4~sU zOHQtiCB7(EYu7uBuQ7dU{i0Yj8ME%2STq~;7}KV%&ee9CPp;*Rn|SIHU*xOzYITX# zb=rNu{^uZh0;a|9VE=3o%?>{YgE?_+xd(n`o)+7@wn*KOBi<96H%K3Q%*c7E*H>%4 z2gaf`?l`j8T#tH&QM8O;0jan$tK`m(ubU2jLs zoKUN;#;fm7@5`w?xhe5k$ysBr2iEna*7*sw=Aw~1FfBZTt;u@K2OXF8&=buP(4PqA zK7}_E^D^!gp7rateh+eG9P5NV?WL_&#I|k*)EMi)j-sJ!+QI{9BQl|YK#GO!!(GduERIxHRSs;KHL>pYmM5}3`}fbQt}iGL23 z-kdOQ&5YO-bl9?*4yZA&ogUgfm;&>|+PifdI#s7;4(zGUtCvA?#_eNV?Vv|uTzdwF z3)Y?jwO-nDP}jybuU-uEr^cQ=erJ8bGHefh^BuP98FbjaF)z7jrv3U}ysVCN?a$Dt zSG!~_Y+k!BzOh*qH}7T4T+WpK)L5>5z1JM}l7+q1+230IK?C>1XGJ4-g1GloSTiMN 
zjXC`uHaCOk1?G)+aO%u?4NSc2PunwWVa!@SFiU`f}`9lkd2UdCAeui=CM90O=`RV(KPEV_rWwtI@SZwoS zwlQ;}126Lpm6$xKY*H`FwfWT5HM3H0ZS3g6UD4#;CO#uRDY?Yt#AYQoFC4#bYC5hu z7GM1YB-ddZi(j+xuE`M@w6ghHKW0xFP3zky*Q`I?Gq$#LY{ zw6W(+-gt6y?%7OxfwtpP>!n7mFc$8(^v4j5Ij>ryKk2_SR%?=1YkXSt#%IMgu3h`n z##a+eirsM~*79rvO>)*ufPRpeER4%uwXYov$&!!8*rb1XjecV4v}Yq4b7CxKEEv<& zF>NndObW-J6~AU}uf+QP<+YRIxA|5Vzh!$Hn}G>Pp71H#Tpic1W3{cW_VY{_uhDVU zerr$Ad)4t-KZJ(<+M>?4_H;y(noF)+Kl>;7OzRx|{e_Fbgqnl!0q_KP82I}Z{{F>r zuoHA>w5Qf$`+FM3Ciqi$2K>F1&k&zr_wY6_C-#sr#(M_73ivxJ9hw91*T9#7v7Xop z`eA->hyMUPg|+Qx|D7e*6T1Q5-^noN@0QGn`Fk7o@b^aM_^#0!`759!rth4V@nvI| zVXN_XU+g8O#E+wO9sXX&Py^4Z9@hAKC_ORrbGXw6d#_yYp=MsdiX`drpaezQktO9k>_2ht02mCE(ih=futJ#_!+> za1G`zfnAf@wPHKg7qFef9Id(&!1X%c9k4YMa&!Bk@8Cy??If??<2&EG!R8Y4p4iL0 zYq|$41NYxOoKiCf_E-V;lXD*(&zhy;JGT0d!QLnJ`mSjT>pOpIPQuq4r>}jU1)JDs zb7~*P*LIw)#EpFkI0t)8sde7xlYfx7XLe4k1IMtRBj?)eZ=VTq>%NM87~TZWiHW}O zVPY2p`x@_H_jZbH%(cv6?Vgx>4Citw@OgT%M5%Wy&;3e?yGQms0L;06-Xr^3+kwNyf1%`^=MikjGxu%S`OO)}Irs3r@C06i zy>DB<9N&KS?y((D+p&A%(-K?8?y$4R@>9UG&=a4Pv2re__>SwIn*Rj0YxpL-1SZ6s z^91Y!9sbkC8T%nvzX$q`?f$qg?%BM=H{*9;GckJ_x7Vk!t#geTLqGfP+%k^$)Vc{! 
z>l~NN3u1@iIq)^Yy_tiZ=I9rgn*pEC;T)EHNBve}jyuP;w`{%h)^<$K?>+eHdSYv^ zz3nw4m%bBn?&%!1#}wO`^Xg!0JdegY^26GR9VK=EOo@Actes%1UBbQqc5F3|g9$l% zsdHV%J&!Me8-ckV&39D_1*uLg=!p_0`9RD!Bdf^=83(0v_Uj)u+1}3G}*ptBc zoY+tUPeBKK4rlb)1e!G!!0?EP9I-s2yIy_e4Iv*0pdo%7j`J;Oc@K7_5t_>{U&VQ(hxJe^U#C>+7)X>pX_N!tcSoC5PhFHS8Jlk-ciJbBRY2 z|7^JK)7IDiM!Jln-mCT3qwDeH(yz{?w$`+E`Hso)z-P#a<0VmV}}!qHr(cGzt*HadAYrki?-%y z7irQj`EYu~kFM=^rhN427f!!$bn(--jgR?l%|^A!r6%(UC!cW=ula0p=@U+U#!Y;T z>;83JYLkzj9%I_%!qJ4&BYw+u-dwK4>%Pg!ZM^kcP4eON($7`fi(++st4}WR`1Y#p z_~ANN+igBMIk|Ofy?4w&T9aPkS|3~X8mn8+oZg4l zAFIh&buRr=-*T&sKE2}CT=!_}<42P|HMe$KGgh0Lb{?rszV;KVYuZ@+nze(C=F-+g z+m6@jYQNQ{M>MT(Jh?Hh^Xc8{HtN~t#(Jb*axEuc&!x6AM(XNZTbo?sbxrKLU+n1X z`ovO~+?ZeYsdaTcb~w6r9P`Q5+S*P{&9Pfu+bemwtx0YyKGvs>jn!m~K=(F7C&|MIEh7D=jzzG*cmUJ zo*65CdbS*09p5OHT-&$yYkQ-4)ni7NSUXPLv$mrT*L8KQtr^p)Pds*A7rW++5w7dg zqvqJPE_OJ*!p4&eC)f6?^L1UVPfcR2uI_OrZQY~Q*15W8#%p7>K6ZLWJI0CCHK|(^ z+o&eFmg}Ay1%n?h^*z2A`cvEX9Q@?r z4mi%F#LVjt@e-TEqA&ibo+Z}Co%1P8J+IO?xoPRwHNE}&>S$)C@v+ODtUX(XKTe+5u%=j_>?ZqdmmXtyFQ>x1;i-AGl~| zu(~=|_j26i=9O)1$ThXC#@_W@)2rqweTQE3oMG3wm(7iw|2?J7*Y=_whpDme6sYNo zUDrO~Jc%1KFSMtH6R%l2e%2_bPx7*{K_dfWM~k)|<7GAEC!&lqr>?h_*uW)jeyP%R zrGM(=Hl{E7!nHPbH1Xv*`iZEqhJhT(z3_W=^c&`fKdm(OvM%E~o_1ep;wNTJ>{+9mP;Wl{g}lS7RjB_BS3~c*(w1t$ykpFMBieMCUvvaBAjNZN{x*UCGa2 zU~je`^J<|kV^U(T&s02>)Q12{a?H8liZYk*2Q0PJYuzWxyq+k z^mDWsGi<&2DO}g4X5gZ!W2q0uE{PORp! 
zd6=)`wLW&8&pgkRt97w!UR0lWI5qLdI2ze2{Cyr8a(DoBCE$YZv9t zq-n>ApMLGw$&bY|R$Uu?a$|MpqDg&fT0i52ldt{MCKhhvZEdTIzSgy|_^Ayi-qzN6 z?K+=a8*er7hhlz=>1$1$PrQxQadXKV_ew0>#^TFf(WOuP_3Zl3#5U5@dhOK2uJd)S z&ZmF0t?pd3ZLjnUCm&8cyiwd-^07Da?V0>o|2j8j8w=8_*444-!pWU!+%{j=x4HN= z*LrQQnq#kL7O$t)#cugbJ?i|JZOon<^^GR=;nc(*)3v?hr*_P*`_(<`n#9{!;%CDf zY0P=0|GKySvyQcP{BU~Ie%)&7g`MyJ zt-1Vt##&TsJbyl@*Re=nYyLj=M*TO+jg5J(`q3W!cizXd?K`F$i)GwJxx{6!=vyAU zU#;f5_0OjFMsph*H~R3H=3HanO|L~YWAU+AJ;yWUYwiCl`@5Orxz?OXpL}@HUYsev zsBTgGY|inb+C}mI-E&+t&PMTe4*w_CS&!e2mt4jj%a6rU7mg-<5^p)WHeUN@qp9_= zt@Db1uAJKNxz>;A?d3I=Yh!1tYwK$NY%JqNW`+3;dVgHU9TlVjD{kLiQp9B~2Z;ZFX>ijqU@>k*G=!{ilxjaHw%k6KWjma_}gyr{58gb)Up{o%?~`7h#>SW*gAoYrGfg?B!Y<&$-)U3vB!uVEvbY|NjJAVaM@a4C5BowuklKDy-J;+i?$ve6iIz zhTkXS_q~~WlGvf5v%mAY5ooLb2C&9|=kEPA_WDxmf9v-=J_XzBer(6i+T4%ybR6e% zJ=hKQ0q5*`jBA?{es9zYJ++<7^T7VA@K&JK`M6gnfjvCi&P{t*Gwi<$b$r)rOq@XH z+WoFFbBP}(Zg1x<_Ltm|!r9A@;a^x{+Y8&{VeChN`!V!{A1Jo-GA4J7V}HMMWeP{{ z9vgT5CxGi;2KxE_AM@&-gf9b*r|;Q4ik(LWB{^NiWC7Wcrh)i_S(B0I)Ope@{o=Yi+z3E1z;ajo{weQ-_DyZ@gf_p>GD zSech&`tSc8)7l;w^Sg?y_nAG3uiklF0BYoRL4$b4(C%{*MYZlJcd40XGw)K85 zo&8=3Td&sp>^ij7JO!L<)|PSX;n=yy?up++W!=sazXWz}*MQFebL!n|Yv-)%QP}T~ z@;tciCEyreg13RYi@z6kOy}#o)YwCI&d0Fr^<0_1<2v4}={+Tuc{{%2%t2!E4fxiZ zcP?GYJH{7_opEL7GU!=@{Trh$_)}nxZEh2AJ?cG2 zvhjn!dAp{IfMaJ2=j*)nv$wuRz7E*Snq}ZT?}hE_y3+3@Y}ae;=YYM;T?nkze+0J1 z@5S;v3^U$QVp~D>>)Y7&5yqX<7WDQzS^OvHr+y7=1Fp~B+I~l@zWrY){4i`E;e7la zG;P=IxwJODt;^gpC$-wv`X1`%jP#Qa5tn^k@Y#7DY!*fLQEWeBso7rG@uvOy_abIK z>G=h6+V1%uqMtG+zaR4;{v6wJj)5D1=O}f)4*Q*b`diEwd(Jq~$llX!C1)?cH%(0k zPLT7p&GYedMPtpB+-Het=Ui!DW(;^%^?iQi4)!yu`=B5AU57UT_x^$=*D;2=d0}nm z_Ze^;=x?U)my5kz#!7GJEjsFNWQ?64xv=L@+wYT;{a!cc_)gf*@NWcf2hm?u>`$Xv z#=aZ&&n6vX_}&wpY)tkw!|xZ>-cdMt&p}U}-y7(;GcN9^bkV+-m_0pvv(np~?+x;5 zsdq2+f4%hZGl{Q%#@vskW}l|?I$p)@FWOHQ&V9B%xQLwdh#&TO*ay5X`%2#L&oqA< z+%cAO&|ic7{Jf+1#(ljoH$!)6vGx5d;p?XNC->vGu!r+TUQI`htmXR1=w5V(!R^?|JyX^C z`ueHj58u5=pEna*p;zvwb$waSk!p--ZIk~teU?hEDgHIs?ydFTDx5jUzCOyc(%aX- 
ztl@a+y?vz4d-pW)oR6@*9;o!o=zI;Ud-ROeff?|AXxDRceJ6;g=U1^^L(iUESM8xS ze*bi8H=)s<)6dVg$IATO6VF*+#@UIT8rKo~+S02pv3D2$O|WzNHQ2u!uupnf`xasy zG2ffgOLneVL*o8;lyV6pXmE!!N)3J{ch@I&#!-u zi7tFO@frE}vaeP0m6iVYv0rwU82islt$*&AG(9dM_Xos}lwS7B>zVg)j{a8ieW~l3 z`g%UvN2yVhIb2PxXW$;M8&_=zfo!n&)pIau=6IX}N^Ronba z_)iu6oZ9rhp4?{QzYH&_uX27azQq_la>hJ!K9ip9kAtq#tMkts2da2q@=p^pU%&2V z4o}eYb<}x(jh)1oXQlRG;vX)#t+0Q_KURGIj5uhEpPsVouJ>eEAMw47*7c|EFk@R6 z|8151)7ZPw=CgojUtK@UcWwXL&8z3Ve{Jt;_I}^h-3Qk8{=f&<-nF*n>hrGs6N%V7E3<4cHFwI&BYI=S88gmbBWh=vE@Zr)FZXY z#}C)>je18PPF*Kpj;{3+52wEN6AQPwIv!gN)FTTjoc?LN&eeY6W4tJq_~{YWt;EC0>$fnMSafwPx$s!7u4`@Ua-~ONVe`qw zPEE_HskyE_Q+@KOKb!v1WSq09-AI!@nM=5?i5+e=iOV(AdA0dPu|a$WeKg^^cVaEq zn%J#I-&~#F$WE@UY5Ugs+OBo!Wh{2BjV%wKhhKZkuKj!7d&ix(-m+`oO}E~@8{U7% zoom0pw&l=$_Z>QP2>8GI_N@NEuKoM(y!Gb0_V2mr_FcE_d7qlq^RIa08b5yCAHU#_ zt^RnEKYr04Z}!LK{`jS})mL2c#^vEBHP{sKlPZ30UE8zznp<|=vUkr-E4%jZUAu1e S6*t|wd+plSPVc(;4gU)*%LZrw diff --git a/test/data/datasets/monks-problems-1.pkl b/test/data/datasets/monks-problems-1.pkl deleted file mode 100644 index 58061c482d71f3bd84e61d8c42cd1b7eb45666e7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16009 zcmeHOO>fgc5Ow+isDR3WN*pg0@04fKdmG;)< zfA9}sXXEE=oORgBHQ01VTEE>lZ)V5qaoUTIH-3D6?nquj?dO-DXOt-aglrzbPPI;%b7&1UoUFJbc1a3bD5Ik>w&yT|HXk6$=W z6^7%Avn|`G-s83vm*@nl@ZQ-P%lrFSy5GaH{XW}zR{za3_$|DM_y0os)S%{7l05fw zuKQS3gyJ);=UjWcVgub!_O0U7^sLIN(>@d3b1ExW#imheE^A+y)@A!gd6@rTYWkeY zs{7yiM|aMqZJ~<=XOaWjKVjZndE&ETbv5g>COVylI1|f?1#glgj|Te`g?VZ-i7l#b z9;ox1e!nUT}m9|I5X}@Gtl%mWD9+Rb0}}q*`sVZ>&*sCV5i6 zgm;AlL($Mw&5PO+EyV}}uWD6Z>ZNk7)pD^$f-9~Vl9*kvSK`xXq!K((lGzvJ&BCSq zkQbPpT6(hbg17nSy@4}P(VmYzpH;7z+AF%b?H}7emF^!`R4)egHV|`{e6cNYu5gG! 
z8u9_bscYg?4f)8_oQYn;LrWSNE6|}4TRKmY%bn}N0TGjxN2ZSB5kr_5<^m;;wGbq^ z#T-VQx8czB}q|&%ZhntJ`d?uY76LyR-bA(dog@YmQSBH)b+s z_dmOmKCflv(D6Es_9s5ex2itGZsUbNDVh;_T9hb4l}9 zTmNED;aq^1<$Q3TRYg3nG~#~JJl584`yV|}MU?;d>~y@ye;t57G zq7r-vO{M*oh2O#-;{8X!%T0 zv!okLV%{CZY|NE>ewU@}Gm?+g)$i3KwISQyco-{rr3@cMDUTx-CfzZM*pSDGk}qtB zlVIhOLk^5MYRu+R`T^I8&CK3 zXE4sfbVNshV0tkRCQ*{cS&-s6I|Nj66+SbJ0O;J-f!eWrg@@U2E!R0v;$7LHBcO@> zMoBTG}m$96}5M_6G4}>}P`kk73!3pRU=OVJCW8jho%1ooMY-05?9}DFK_r@H8<> zyf5w1rs1pDtg^KQ%)EQeJ^-5Ts1%s_2zc+y=lpC0Srb7jc9L)uV4k`yNCpde@7nkj z7sX5Bl1Y5ArZZVw7VF{)B%^p+yd&PjbP^loWV(*&A-2R#aSIbe+}1fEK7?ElAI%~9 zu@7%f2 zD3sVH8jDX05{oZ@ulCZ6h1?JYV;Az3PG>@rAGT1K!`D@{u^t+4g`_>y(3VvwYJyp? zD%D`cu!=IH0#FkSqAJy3#juJpqXJM945BL4V8yVCGNS@e6AYp%)nLW2iZY`DP!kNI zD$VDs?K(^I6|XA=Ul)GE+Hk3`EV^0X24P4~t;L+fG3qstWH=%rp+mhkMj*-v&Rymb zz%_^fgq~Spu+Swo)L}y%HmyC^pOrYwXRPD4mu76c%2;V_(S`Y`_X69tY~Y21{BwX? zj^%&=ZyN*$m5xQvmgP9aThJhckO0dKK?gqU7EGXqIEO5@S`NHm0-b>o=wKO#JR>h* zMO^#5e+J?(pS5kbvovc@a4!Hh!V4G!Z|WeYaYVn`q{SqsAwEfiy^-`Z;kBA($%po( zvH91g_KiMsn09vlJo^!t&)1H-^55}QTGy4>`q!H*QAhKKD2u}$daUbVOX!$AZPW%m_m}DNr}{Lgw=@QHM$9LtqYd;~1k~#INEv@w@m#{3-t8k5%^{MmfBy literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq b/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq new file mode 100644 index 0000000000000000000000000000000000000000..34aeff80b70eaa803ee1c0a02805fe86430545a7 GIT binary patch literal 6016 zcmd5=U1%HG6`mP;9mmbq+r^{GU^cGU3~A!n8vS_fg*?p2)?Xw=maX4zab`w8mNFV? zrIBSh@os3LloFPOX17oZH7$j#NoiP0-dchm_MwmcflyfbL-wVxB~K;ngXuYUMzSo~ z>$DELJBhFFJ@?*ozVn@P@3}ZC#CUd)eTB7!*dccCWr`Y|qbZ8wv2kGIWGQx5tn-2I^*U`_+rKWWQ{H22{P+}t;B=1*F2jXi3|M}^aQ{BV-9L2V9zxy7Cn z8yBz)e*Cc450}C7`}Ez8A3y!%-Cs7}`fK>D$A7x`_~$o%BEIvt$>~?4clhtSKK=94 z$I;KCdkm#9;KgUq@BE{<-7`I2xD7Av4Onh7+};THV1#%Ql?#-p;@bA8t`1Iqy|N1vO4`^y)bccK++^x|bc(Vq)A2F8S)7*bv? 
zCQtg{*$LA??K?D^>Vd~wX*&?_+i1fZqX&kq)M;UO=+-2CT8`ZPemI>?}Bt4UfR zXPcU|X;L`{0oT>Pro_sN*i& z3}CBztBM9~4XvnDvId4mx!USXROC`l(GgoYr%PE$mtgq~TTX-JMXi8)<15}vbGEH6 zFb7fCNS#I%2Y~F0d$1^yo-0&pMMY8xCkOBpHGQrRjn*iyws#Q-T!$kf;^&X(S)%4X91#&ee?+cl;Ar-6 zL`3}j5givbQd9a7b;Mr`B62U)a*|%Dk?TS&$~CFh!rP+Sl4`X|6CtkWYV~48L+R|a zeVVty8MT=JyXN?5$F$S7MOn`XvDnq*>qf=4K4NF^+b9GCL0Ezx{H_RsOArf!R}cz< zr$A()cwoMX!o|+KzW{4-3?I=#=0RsufPL^Am556s%)uB|24KDTTA-{pGQNZ><%q1y z{)8@T;o{YzDCb(inv`^350ol}fEHYn%ULy0Y!(7#_nMrH!gree%ttr|P z$b_ajUROMU*}y`akKi6HV$Y-moZrYN?Dc>rSdtwZyqt9B(MBeIu&GVBf^`NoT#-!Q~0A zW&N>McsYRB%2F~>&v@q>=_Icb9?M>P)*sj)b~O<=1>X31doCrK^O3czFQLiKU^VUV z)=j?q@B?S|s~%!Q&j9X>&udR3FMhzHC3J{oM^7e&@f0>OE}$2<4yqZ94dBcs6Z84y zCi>iqv1QoKR5Qtq>nTS&CPEQ-vMZ%Pk;K{Oh1~IOc5@53?U`hDU4a;+lTES?F)4nv zJ4RJmTXG%2Kjamk@6u!B#zGi51-&s8TtFw&LjTw^C8aXpySyaFH4iUUs!YrEoEi^$<&= z-@K2ZY>HSq#ezo=JWZ^n38Sn++4R)0bSAMeifnvleP#v*q8P)XODe12!|6^b2PsbZ z46bpex&GSB3@OixIAhqyj`?|_oq2o{2m+2BD;H;G=3ubs#X@gZ`ZRV8tJuT$f*=@o zG?`k)rv=J{7-`8o>5)cEoA?2)@jk$@-nPFKkK?i~kjDFh?8PaO)Vm1rF9enlpCEQU zLdZ(eTq&9O1rdma7tR8X7fpQ6z#pldcHCMAjNSaf6X(Epv5g-dIXE$gV}8ZN*X9rX zC6A&b_zUB^3rK7VfY&bw6>7NlALj?Ev`?M8kO z|5|7Vf)qw*kcaRNG`}U{l*ezv->3TjQ-6U!X-461@Bal0yk|%N literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/333/description.xml b/test/data/openml_cache/org/openml/www/datasets/333/description.xml new file mode 100644 index 00000000..4c00296e --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/333/description.xml @@ -0,0 +1,33 @@ + + 333 + monks-problems-1 + 1 + **Author**: Sebastian Thrun (Carnegie Mellon University) +**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/MONK's+Problems) - October 1992 +**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) + +**The Monk's Problems: Problem 1** +Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. 
After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems. + +The target concept associated with the 1st Monk's problem is the binary outcome of the logical formula: +MONK-1: (a1 == a2) or (a5 == 1) + +In this dataset, the original train and test sets were merged to allow other sampling procedures. However, the original train-test splits can be found as one of the OpenML tasks. + +### Attribute information: +* attr1: 1, 2, 3 +* attr2: 1, 2, 3 +* attr3: 1, 2 +* attr4: 1, 2, 3 +* attr5: 1, 2, 3, 4 +* attr6: 1, 2 + +### Relevant papers +The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991. 
+ 1 + ARFF + Sebastian Thrun 1992-10-01 2014-08-26T17:11:18 + English Public https://api.openml.org/data/v1/download/52236/monks-problems-1.arff + http://openml1.win.tue.nl/dataset333/dataset_333.pq 52236 class https://archive.ics.uci.edu/ml/citation_policy.html artificialmythbusting_1OpenML100study_1study_123study_135study_14study_144study_15study_20study_34study_41study_50study_52study_7uci public https://archive.ics.uci.edu/ml/datasets/MONK's+Problems https://link.springer.com/article/10.1023/A:1022622132310 http://openml1.win.tue.nl/dataset333/dataset_333.pq active + 2020-11-20 18:58:56 6cd008dccee6a34420c091dfe7cdb457 + diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml b/test/data/openml_cache/org/openml/www/datasets/333/features.xml new file mode 100644 index 00000000..6cca4738 --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/333/features.xml @@ -0,0 +1,84 @@ + + + 0 + class + nominal + 0 + 1 + true + false + false + 0 + + + 1 + attr1 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 2 + attr2 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 3 + attr3 + nominal + 1 + 2 + false + false + false + 0 + + + 4 + attr4 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 5 + attr5 + nominal + 1 + 2 + 3 + 4 + false + false + false + 0 + + + 6 + attr6 + nominal + 1 + 2 + false + false + false + 0 + + diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl new file mode 100644 index 0000000000000000000000000000000000000000..03189bf8369f06b88fd0cd4bd39fe6a701908ab6 GIT binary patch literal 509 zcmZ|Ky-ve05C`zMiBnZbMGR%)0br@<2Rs2{38+tiB9|H*DR!&aMlc}3ejCS&@&p_q zrXGT)lka~2v){7MFP60D6;F!>UXMugpe1BZXu}Xd34CwXhap^b=OPmmS>=%~n@N)Cl9QDxXB{rNujnVCM*Ih7u;m z>$Pl3U0GY1$1qbQeqAt?{%%w`)kEOIxn_oHv=3JYl%~eO=9ITUsSaB?h~yy1d7o4h tbsSV0sWhlXzX~RcOcuNb#U9{!_q~e?QA3nFo2gr0eBrh?bh)obI92_rzg^3el$sj-&J3z3V#b8^;LXOv1c1!{! 
zO330ArzMF4UDDQVX;a$igy}HTb~=kPO=r@Xw!7hL`0r>7SeHk68wOw0^VPYjOb8pa1F22T%9=N1?I?PHTeqn*0H_!ere zW|jW2k#w%1sI$0G-ak7rH8M0hHr$^kC6r#MnwTA*dPaML&Ee^hp~Z!={>=2`czQOI z8J;G{%a>MZ?^PK;64$+Bx-JZIzl-brTvu@2r0b;_UFZ9CeLbh^<&3V4N#-Babs?+k zORQITis@`GAJ=^&sq6SrT}K9VoloexFu{DrH}>kj!FKbkx5WK{A(nqc*ZC7%AJBCL z`yFR~)0CzcX1VTV`Egw@vEO;AS+Br$3w>HH!um$H?i(q_>EV7v*GoM9IQy|Q%yNfx zE#o(2{3n^uabIS?j8RRmVE-#P&I8;p`thKa8{jzR*`KA`8M|2(M zxI}o|%ir_0`ucs8;mb@ig{tbgkNBPrZ`@b*8-1YO6Cva zJi_QT-$Ch^Vld@aa)<$0ZF zd&@kIysST&hmY&{MFe-B?n^w3xb8Q}`iScO6!&imy;{?kc;C2vy)AQo4MWQnad{od zeqrp={J6{?j^B0mPvV^C{o)AYM%b>*BjbL?;q@;2j1gn|4{$B{{W#Bu0Dql{bDz05LOnMC|gpB7POvZgGEx)EZ(c-|D)kBFSNJZ{<7^D>V)P6HBm&RcO^uw8@a z$ujF1TiC9QpYv>Rej_~JWFE;rT#&plPLZF`cm4HGfI+#rt4h_I3962*-7q;~@KEfyW{Hv?1pc&%6A?8h=^F#qk;te73jDwVX3c z(wFhE{|29PMg@qS`aBc7KAuOG>CaeZ=r3cPPE@x0H=`62rz-)|H+ z5BYZOPw_rW{|nA-Ix7E@><}1K<+p4j4%0H;=D>6mRVl-vhEDl6Py8_PjdbU zU-Fn|oJ$-p!M!Q*H zK<)?Pd~V3Rk#R=Y&UGHIoPPzmpXPNf_j}?`QI7K@^Fj9ch~$@XjCSo;UiS02?i;cX z%6`J@Xn^Nkp4WGAzVJHvNce2WILPBpbIs>y{t?|5zn54(&ikp%qX_HenQnAwy|~1k z$0_@2ocE6i=Xrqlt+?Rx{E+ixiTySBoGq|_%Phad_T_$RfcJe_r*Xdjii@8~?Vs!q z1)fh8oX5EAgIlzm%JE;APa~x&gj?2Cw^MP^1?>yU;{N&|ahZpWIhm zmwC$f;c*`S0P78~Kl1!hJZFz{et3RM9t%9)soU03p2uyly-Uo$&h;|exyk)F?<0oX zm$qwra-WlD`%4@@qgnIie!Yq3vmxW+eL?oWo6MK|%Kjz$UxEFRaTGYN5vkMKo*{kq zqagX>d>-L>XdGdArWZIK#eG-u#_L-4Ke<1M^Lf%FcZa(+ypVH1^3lyW59xY2t?T?gT}xc#TwWHxc|Vi+R^a=IDUN?} zU*Ypw=AXpf*va^8zqoFXYI=e5V94_=pKpRIaNxP_{Q(Nl!Atus^B}#X3Gc19=Dsj8s^%Ee6?B2 zV^Dvr`$}dL{MPrLtiJcL_cK=-c9@se_j-T38aA|C{LS{y&h>8eSzp+6bw0Dc;V0c| zuLOEiH!8oMTz{haQf2B&?IS;}{oqdy_PzuSXM=O#Uf=XY`}V-2DR=v;o-a1M9{$e7 z&OY~}r)$GE&U~}Iw{i8Z)DIqk>UW>_`P;8FZ@xbT*_Uf8D=Np|+dKHcc-e-zvv&>c ziKZ^kKa^_N^B3-S+jHUfK4^bo{Ckz|_gfk-{>BHjS9aC(g^%uE`qtjo?(2>1$Hz9= z_SLL^efyv8-&)xm-gy;j($62>UzJKVgmxw}Uu~OuaeDPqxa%_wqh4QC=GOw>Id%0@ z<0n4bIJc)EGtpb#yna*W#Hv)om!Ed89nFv2fAB`{Tw6!w;T@^*OJ%E@);D}mwmrkzC+M*J@t_bH4W1x8xeW5{K@-)LWY#$o!uBm&ez9?cJ&d zySCEz!&GyBPq_C>jXwmjH1>i)pyA2;>J8!nc$r&hK9;WN-Y 
zkPJV!sbSsKJ!fkgs&5?X-LUYp-0t1`ezG_I)P*bU8|L2V`t=YJxSRW)SWJv##bsDrJY)RYI9N z2sMvc;FzvfyT&o=wkE882iAtIiH)vsfAwM0TirU>-3TWtJ%>H8t|kl?xL{~Y%bEl5 zf&*u)xglr`+i7r{d#!M}ZzAki_nWT9@Ye2$L)|cZ#tCmYv8K7(<$&1}1grHSFVux$ zsO`Ru9@`0)chodPZB4i5k;6exV|8#C&c*EJ+)Ge>6l(Htbfi3JhhOYWIcDGby5<^J zaJ*`h<5dE6fB__g;7oU~7A`dAbICuDer>{pGIQy@|%yM(}y6 z;MrLCcyrbHpwnvYbsajq&GCTM2noO4SY|fYG>>&#A*b6_-MYr=-S|wcD_jvYC!ou; zpsYW4w*y-u-dZR41RM@mKn+wwSDoVuZ}e5Ts@5H7oC*TKfq@fs0Wj-poQ+N};)K_? z)mVq1E_9;K)#?O~xEf7JxjbQbG+}oKT`Aw@n#7O`f&+@S}^yxR_j&F~t#tko&2P91gCZueQM*Vmk^D}z0uaIyyM z$GyROf>tXyj_X0#8royS2WfzF$Ta;f5s%MAFmFK7 zUFB{>0wke0>6sN+P{XVJL3<{+sgItl!5S}3sB+sT!UEi7lKa@Q@DEE5c*r)G9+wHL zP3?dOaUH9I1kg#%CmGXHZERrA!GROeLjsmVh2^%&h=eYaq)EedQPU4)YMdJ0Pr$g= zz;dfdA&>>8&omtc>{GHP1d;Bc(YX9j?m|^G1!dZ8l(*DKE65f6AUs)Aoq)UA4FNKa zm$F54Y+2yLt`Ge|H(dc2vr6Ja8Z2GHS`*Yjes;tv}@69J_k`~ zs}0t{6L7r#@Rqp+=&MLJ^hCdrhI5fUO9076xHNefr6BZ$bRrF>BK4aACbyHxx6_yV z0)PE@npH4J06jX|GYWC?HR_3c;>ak3B5h~N%i7*aMM9anp3vKm zCS%d#kqep8ybT*-^U1`!(3U*%b`0(tj>3zbkbJc-f&q$LYmTl8MZBTWcV5CT}c zA@OD?3hDm7WM}M|fi_5DC@)S%E<_TqCyzf2>1bceY;q!cZ)8WV?VLnEwm)i6q2k0fx?uN25KNr!tx3r|MsZXw_&m zIaW8CK`&#W=;WuPyF*)Ncb`{15IXn+|{#b_!?ZOMNaaqL|OOa>yn$Q;B`p0w?%7u;6i5zb3gy(s_1>y zdtZ;WMW2tP`$KK_L_^17Z+tcckJd-vd^%kZUvE1HcSjC2CS#4SBty@Jj$%}=K<8-e z9E3370Fmeyzl28i!HzStTQ0%Fm*E3AXaB(!qz%J;|Filx+Sl#q+l7O=`fb?umuk^g z{U+@Ao?~vv^?VDL(n<6R>f#q*J$Syb1=YU$51lyYaFyVC-Ekrd-Y039K`v+h?K0MI z4BmqqTmER_SAGQWAv|&Yp+)l-1KA2{r@G>KbsLZt0pN@O%liQQGW>5d2Z!!zhFm|m?*(|_^o<|EpVyHuH{QoRvtq~K zBntnxul{TZHkg;O`P3fxR67Bz#?R;X9{fK5(*X02!}sAk<(;&7tO7i&!9HIL_Rbuf zCD|nI2cxU;gj8$su8EW6$zhz3!$`19_oRtJtInK+@m%4j*!*pPP2hZT4V-VbU^kuv z06)RKB{{>Gg7+2s%00XlybE#JALpO7$E9p1X+SsY~iG;!espr z;M)Boa1VxVFgmG`lpA7CMR+;)3ZY}q+YS$;LpEk?DULnyu7#<(u?|G9Ur(cnwZ^( zOQXK^)~L7PGr%w$}V^rs7)0FscOU4ZY zCzOGIbWsP}bZH86iKFC*E=`h9eWfEJqOHWVtWpqIwnUm@P)n5n5QgqbLyIU3rAd9V zttB-QsnEX_x(2~gaY$C;o$sMmDZ@fb8Dv{_H_q#MwjH&g{dq7_&!9mtrq1wCv;A zCuosnPw4fP9k^{>DPM7jG-!3H;Yw4eDoPTkB9pXOu~ZxREU#cyFetDZOzIWG1VQ?$ 
zRDgm;mUNr6=%bODuPIuQ^pt>xQW7ktRmq+f;Qk7@B8Y~_5SuRT8@r*qN`wj?nv&9X zSzMS{%SEz9McpD66A6n3KrV_+)n!tVT!KiFBwNI|rLA~^t|YafwxAW2j5e)Glpv#; zsT*2^RYgmL$T@|_?vXHGLEoizEBE)NL%?HPjv2ZLFd$(@WW`iI4Ntql&pqWy3f$)@Dd%}6H*DC!>sjJ_)1l8 zThuH`20}8Z_CxiBLZS+!q)-GT**|M9mS>qPy^q|VO7E?q7Ze(=n9rn zW9=eeVXok)uBK^mQfEayj-momOQp4Wq%e_v5M4#rDjMO=>&EJE{le=exN_#ux>NH+x97PAYdck!S}sxKbmqJ!|{83_~OR)8rU}6_@du10zR+T2VP~z?els94!*y| zEjD1=ZpU-YH( z$OxE*<@Q=8zU{RQ%e3t>v=45#gC61#9Y6YpU^pnZ-(z~+mL2dqhJo${@ZAB1!jCUJ ze14R4ye=O`%x^mm4%o5+7W#+~9(2)gY(RsK&+GF#0h>mRLXOwtc1Xl<+a3eIpmbX< z3?P|yc^seNHV_=O0)Qy!yzR9u!-XNS1BM^JGr+O9alk$tyq5&5fa%9^2fPSJ>GPTa z{1(CXf#EYuOoYd0_&t~>+mC_vIc~#r+kT7^cKzss+w1XJ9*?IC1@UE#NeT2}hEWYa zZjbH5So<*$m~sPOBzfGnVPJ3!jF0UHpX0IIKFogr=Y`MoTLH`-0tOJ<@t8Pg+%z%# zsDcv%5XQ!D0Rkot5TOy#i(kVS^xcMyv2c8VPWgP67vmeiZ-Oup_)U#6M$TK9dJLl9 z^amVF$(y)j)fw zj^A6Mw97C^3(b-^36cm-5d4r+h(ZL-_ z&<76|iW!$A(kl9#~DE-ktlp<4#K+}S{I!Bs;1VTg^!o!l>ATj(<#1x$>i{g$J zby|s_oON4wUO{$;`xjn8^rqrp_fkP0`f_a3sj3gyl|xvcv~>I;jZRv+ZaH?Tr#?~^ zrL-*aR@A9Qm&Q;mCC)U$p??~Jj44@N71gLgJT0xHNUliIo|24$lR}{=DP1Top-@BX z=ni+aJwnz($|7SUP5mRYY?q~op*~nJu&#@i$PfJy3}s!zkQ8ZBrSz1nbf{Kz(jX4k zN`=%kSL^5wt0)OlQKm^i8(|Hq6oCzsB*7^Q#Nw`MQb!yhdXdQJq)0?n>q6Qjs{XJL znvzgBD@5XvQAXh=_U^oD9DEt~i@sF&*RLAw7WuC(<+Zq8q)`BPs5}@oA{tQ1$h58s zg&HF<@zR6mih$_SWW`n>B&T@9&|Iph!(xO217E8l0qCZ zS9DA3F(6AQwo)Q7YAAcuCTHkRQxq2Qi6SH=ru)h*_KXcFDHb7A)hx;>igHGA7*;_m zS?#CtLfO<(Vw)97ix~K!iYy9A)yU8^&0GQ2vYJbpN}VA}#wh9vnV|@91(g+vq!|iI z^^|YgCxj$aGN3;!iyTR2fZINEColNCIG!MAp3`Y_XwKlo1kI=_wCTs))ii2vobLAT^&fQ9+4o z6;e|?g{Z)a-e_U6MG{o#3u{GDFig}q#1SFgP;@O%3}s9;*r*~AU$Mx95@9)RvnEv~ z5XwDmk<{66u~~$oFNW^)5-!+LUb<>#7U|oYfB74pa=95>Fcb7rAVDt$67*6afo~AZ z1inHr6Zj6nOyEldGl6dr%mlthuoCnh0ZY`u5|yw-Ei6$D%h;`N6!zcx{j_|yJUuvZ zn!a2*-%J`}re}g?UI6Z^k4llTJGm}$`__Cf_!&CU9Z1o-Qu==9c)sJ6j<9{AP z-$2WMB+@@VJTs`il0JdI04# Vv7d?qR2-xtMa3a3hF@K5`2Vhn;nV;C literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq 
b/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq new file mode 100644 index 0000000000000000000000000000000000000000..fe69f0e09f0ae755e168d8f9ea690b4396f21fd7 GIT binary patch literal 20170 zcmc(n31CxI+V}6tO+u13Bx##ArG=y^O#>xu(!DH_q)iuS+NLz!*t%~tZRy^!G-cnZ zD2f{vkVOPVR0Kq=Ac!amu8cTNXU1W4MxFZV`1-x$`@P@)+!Ur@8OIObndtA{d(J(} zbDnd~bDncwIqd>@99Oe74$80p2NWR&j^3|8U7G}5?qTn1{yBBoE8$RAiNmh;_a`qGAr*@26u1Q)jjmw^T(GfNBkOtM(c zFw!Q*Elp(lrE-2rxmd$D&!GwDqsBC(lS_~AjZG+Spbaqv^py+I{fYw$ku--HkT&rL z=85%;Mr!6WPNZWpXqtRn7!6HmC_!Cret)G{#dps_45Jug7NFum={i2M2oI=&ksH#P z5=g*I7i*XjgP!R(=@~n+Fcne}MAbA)<%o4Fb{IM#ayq}HPORs9#8#vYj$ZEvq^~Bst~mf zqB0dT;85AkP;w#KjJ%FPLpq-+7GGgBiX0{dRWU9z-`IwnI@I5U`z5J}&-jT|OuuU# zGiWH`GYkSodmo>PM|!h#9dm^Co2!t?LEmYh5GF}%4hjD=o1@ZWl_Pf+|X&-T`|eV^$nsW@0cflTlzWq%uQU%w`qi z-uyihT!CO3zNSHJM6-^ABPe?Sa?o!5j6dMS3t9yOEtQ~R_u)e3613vBmmon2(`PWF zUviis1L{V}Irag>>-mgc+{9;8sP7Ud>h}=82F34S+$NPGT_I;CicuTH<4Hy${g@zHK;66hqpGkn``yoodOT-M}rADep15mDX5Gpxw8TzrG~B#EoI*a38v+57p5wLcc+o{U~J+gSQ_IDY5r6=FOiW zUWJl}Ff1=|%{6b+i4yrt^rS*Wu48gcdI)Alzqz34*1i58{Px}lVrJ|$v2B0Z{Q6nY3XfWGQ?4MJfk!L);^5o0Ezd2XqO zOJznAk6^f(4^so6;+IsTd)&y>#0?*)9-SU5))``x{)$VXo| z(BHdq3{R8NXY92KyaJ3BmyYx%kCCYp-J*~({(2u>R)6}gGQmdw$OQxIOqO%r{MfoH z$9t1)W5-WvpEr2<%xf}@WtDeG?yI7;q3PT7MW-KIe@}SM?(_$&dpB&4C_Gy8^^w)H z113eMw0`{`0^ts$=+#Xj&sdWmP*ac-EN`&ijZcd0}-#eJak)p%R4PrdKzvhGder>zzi zMOWtBXFBsv{f=jTO7wRz zt9%o-1V7*Gc=M6aoJ{4()<2~Ve*N^8U#|}%JD!aYTpYix7`uU-JGz@3(mSu7#$6|2y>u-f{x9_ZEOUC_gY)&uGc6O>nWfNcDrptcIU1;ZD z$ZI>W?wh{chc9e5P5B@DyPxgfJ3Hdkg8LT#;kOskcbF8ffBK5+qcabD?sM5Y$G6g7 zcsFxplYhyl_H~If{_w>g*FNypBmSM=JvOjnTlRL+Q}$niSbb{Uk#X&%|GDXM>aKA- zY23=b1DR{YyMEx$c`seMCrzrgmmleUYFxy*N74_re*WB|`Ff-4;MGdy<$z3}+a41!^p>NYZox}3nLFcl=2j*3-ihJUT-f=tXY>CllwjEZ? 
z+cj966Y&h-JlmF)Dt?4$kf6YW) z-#FblHuz^uwoyH-<5J~mX{Pc?mh-Sa7JfUIJ%EarfLQQ2SOfe(2Uq|?z%oz;U~m{4 zXax(wM$iZ*gKZ!Z2*51x6tIGEpdUnlH^H;uWv~il0P4Yp0r<4cSnx2Q`bYq;f?QAx zvOpzx8Uz9b*bams4Zv10AAo9b3CsiD09KQ6g8RXX-~^}xlRy!81w?~BFdh^DF?b0~ z0H?r%Kn)~-TCbhpF|ZC?1VKOwUI6pK9Pk!+2c(0!zy#p^Fl&JbJPJ00=Ybt`f+s)} z@CWhW4A>8PfEFZy8n7D3feR#qQZNm80bfuKW&jtH=N1}1_%zzpsI=fIO70PF*2K|a_8G~hUR4Ge(u;3&8kECq?+GUx_}Kq#03P6Iv| z1Kt2#pad)jo51@Z1uORU*<%n?6Hu z*G#iys;?!?nw(-w6=>3Qvh;uqValx8nRZcDwj|j%C&@82H$@#BImbIWPhgEyPxW%< zdxaMircBWIl2D<>m>gXcP@FqgZ48(#MaRm*z<-A%noD=Gq3t`hnAWz{Dn2dwT{6UkGp&C-Odt6krtRjYkBtlcQt zw0XB4sZbF zEM25q!KTUfJ)^65wob?{-0J&mLPJ9KbCP4vYvwHG9cQOTo)DhMQst`Za>>aT0Bgh^s0&KKvrM7Io?t?RkS24QYI2;0}@_*DWIz7QU5WHmv#HdK-Vi?Sv|RRLD4A( zd}S>G>Ky^6&uDnBdaKTgd>_qfCn3IqU}Z|$x!0DQIB`Cw>A>q3rUs-5-`#{kW82W}ej2p#_?p15Bl6Pb6it(&n`vUz> z2I{_w*KL?Y_x>YC{;2EI1f@0;=NI&VLzYM2dJ=_Lk}>06%i2j0z-iNvUg39nC* zp%cTih^G`d^YUyCfzr?ENbxh9on6d5FnVC?Kd`ibKih z2GT~Jo5Xsw_b1l>J65+YMRy^YJ2~N+I;!68BT6^PR0eaJn=SWfP5{LTm>3WX;y^q| z0EtAvPvQ{+V-yG(lR#*U=22b8aMxHU`b~lI1HmndPVy-F8t>amGOTHSPfK#!Zqwlp z6AJe@O=nv=K3pqY{o%Z2`#ZWqTQ-E%tk8ZmdBENIc*{!fkdDcsIg2+`?oBvjGKC9P;k12TN;Qq(E=Fe|gyL5fKqqAgnzaf5=uAqC>{*r-!gCF*;?anP? 
zTNB=S`mO>X>?EX_J0!)$YUI z&Fccn3aUTe-CfvSGHIPL+|j>m+-v)%b}5#%PH~2e>FDlHs7&aX@^JxOT6AAzj~o?o z!>4Iu{Ev>&eUza)na+(0u3ttS6*ziX2A4=Hz$??mdlyv@Z%B4zZ8A@%udQAG= zsf(d6cAYt8pKJ8K>fQN2=UqWBEDlPlzj`h3$ZM`^`!;_Q|K`ybk+<{X7sK2|2GA?w;Z}_IX}@w z{UYT-QKoVokL#sA{XaHm6rfI57NCw>G*|)}K^>s;S_XWg-VmDgvwo+WFbTrpw!CLp%Vkbe098V zy+A9NB-65!Sy_Z&N>~^%s3O@YUo9&zs#(2EXd(i=EP@fKqxEczK^@Bz8lvTKvUs^E z!7G4OMkdB3vMParRkK1iX(~KV_AWg;Q=O2+nheCu`pPV7gT@+b(5sVW!TnrvVk%+8x@^hqGzYp#?RJk>)1Gh4=Z5hY+!_5Xy93W zgoUaZ0gJRO%g(4*Mz~m6gFw&IYisp}{0xCQg^j3IMhH!MZKE(i#)b(K4Q!K$cxi=U zR&RkQNyuvJMe`R=W7W-xq$M^?&jz#_>|U%spiLl=S@ovu*{sZDGesEGL=+}h$-ScB zIJUE127@V4*dY`~*yQ$%T0QTuR$gwHa0*>OsqOk)+6(ZVzUK$L6Q$ke3!EB!ZK}Yucjq#EYTWd zhUp~Aq>qw^NrbaIt?Cu|f|U_bY;(0iCtyiOymqNpZDRF$p@L;GD=2BY)Io@AhBGP7~$%+k7P}9V%fV0sTtW zbNsF+y1y~3urFHEtNG0TZ$)W4kEH~E^~LJ(GY%Cl*}C=e?)Be(+UmFA`I*6AE|{}& zP2durvc(TIEU&G3;*;0EiF(5n+C6_T^SArHyY%VKqdPySDOjIhefj*pV??*9apYo= z=xN@QJpbc7-8z@k@7a zPxUUW=>Pudy;p|DUb|Z^J6DpVj@c8(KQs38t$PBe&8atSOKH@sdhnvx=h6$4CN-;f zXTEYe`h3co(+|H|Czs?;c1&OdAwu0i^T?JZd0KXa=YNu?JK3aL)ASQ9J5H1@!GDe9 zR>65ga|ybF+FUv=sPuHr;RCuZ()EJ}=z2oeH)?z7+C$e1y8h60kFF`y&eL^_+EKce z(RGoog|Mez2$kOnZ+3F{G|LTpdW`+{x$CZupoej5gqnU;u81jjLz;eOm6d4vi71e9 zM++G<&KyJ{hP}jSNWZ^cBVf!WYf)GYfUNkGpc2w`;|#Ovt=ci}cl&TskD%=UpQ%aL z&?HzX2=f?iC$rMc!@?+`ZI$OR?gQTAg!mw)VpR;va4#OPU~S8?lxUH7a5{6olf_w= znOpBfQkp5mvzA>VBDY1(lppvlJ%BRG=|U+{iTH!4jmlDrEY=b|fkxdVCwZL)vTQ+> z{Q~zM>TNi@na`NP0i1%6OHk|(bV|&qNCVZ+b_(b+Om&Z-s>B`7)QI?F%te*7Tlfs$ z&q3~mEaqO`z~o6R!+LKvn5 zCA*85XBAjsjnw7oC7aM>y1Wt;J)kwZd2WW!!+>y`35FobL4PABqx)d#-)#tMF&RB$ zag*5pVXVE?aqm0Ouo~B&GAIKi*`Ej0{>XC z#YVsfTWn_vcE`bl&3BF?GtJQ`CSOmKd6n3L+wz@xWCo)o)e{{~6%pj1SQUxq+nhvh zR^%y_NP@EQPo5KsIlt;=#SEXHA7lNWWp(fL65YMMgj3D!_t94*%HK&cl^=NA@k4@d zpWtC6a;{|=?igj9S)rzhx_8voy%*5=ya>1eb%m)rNnK9rzS8MNU1#dD+Cd$l?q3C< z?kZI%HBAoz9l%WG%vA*L)WxK3XeO8lI>07CO`?r%Sc0)3zn>;x8|1=Qu- z4(mK;3S-q)Z0|ARX{QIamy~g4LiN zB!X}-AB+KyfLd@DpxdDFU_EF8@t_(A!9H*ga00q52nHd*3c|o{kPPTnA{X2Z9t6w4 
z0T2jgfi<8T+z)QERiG-lv9yhH-pt^{nZc1WD`CC7#nQ1vM&zu*$CvM?RQLz5bjQF3 zu|7C=hF7l<)01a*Y?wk99;D~7lNjM-mQ_TsQw$+U#mXajQ9Rb45gXxLn&9xV(Y%sU%@8xhz+)^5BAXTEqlhA8b|y)-vsvOOThb&Sevl6vl+ByT zvN>i4k7d0vT!gHKRj|2pSV)u?E@2a>tU*pDzd*vu8K1(;BA=PrjNCFITpnGlAVC_& zW|7RD$L5fDMbv!OM^OOFv|wQnTjI+qO2;x#5K~qjR3T?7<&v3I6$EmVYLhQp6Rrqi zP?fxvWi9sE{&hY?!j`aX(V}`boUjTPBWX}HvePCJxlv5`4Q$h5Y!le@oG7zAh~+IB ztMDbwEk3LQs%$mDazwQ$H0>-eXl$}w(GetNIs?Xbb@P@iWydOz&!#YhN7Jn*36AwJv2DjxO;dL*2+`O4Ax@jLlJ3j!Z)BR)a> z3~!O#XHJlO7umf>&K8Rq_Awq$%#O`DEM<__d0fqc=c^=eykoA?2Nvs?i$Hpy) z+7JA6k39@)fBnE2%i!>d^ErS2kNkCS^%32L)tt5Gj~M-jI#2klHM2yZ#KQT|WuvoTl@89-ua$ z3Oojmg2~_tpti#YoB}5RwQCaqT_f_rVLX%mTh37AyplKrg5UAAnZ?AIL!|Xat{u zz2JRt-A2#A^H$&w9s$&rrh~6QGgt?T!3*GdAOkE=f@MGtV!#q$1B-wO&^03yxWE8- z3%mqEz!ney=$dd5c!N*CbKo`b5O^6J0NcT@z*z7gSPlxnZ^4J)8Ylw^zzwp%MsPQn z4pxA6paW8H4(tOR;2p3AtOiejAg}>+g9`8m&;&jL`@!$P9uNvt;1D) zfkc^82;=^2v#Th<&}uUL_+Af z86k8b#5(AwWN~grNZ>dU6imh=euA2?n(=fY3?sx_N5TowPnt|3$P^r*5Q49;Cfq`3 zS~fa{#Kw_$ghN9^5)yGISntR*;q)1I%_PK3EY{?d5Szdlo0>+_kxQI0Yc?U7#JA5L zl9f$J4sj48$t830NT?IJz4HkvAcaIniz><@!lCKKb7SX``D8&zC|OuiNb~zyx zmD6UDDm@isTJ`uEQp?sYLY8`0LnG0Ki1FC8m^Awl%h2=|1nAdULOKcQ>LyFb(q%?M zmXjVrd{Cvkmw2xTB`cAPtRmTDbqmQSYiOust&VIM%3ViDybKY`L$VizAi9UIBkMyq zY{cUxLN=4}TecE+>2lJC9E99Wwvl_ty*=CSTfUr-`$Kjh*+$x)2WC9@5P5h?NC=nKdGrX@Hi14e8NqZLqPK(5^|V?RXj?_YI!;c6kfH38FPwUj@C_k^y!7$_dF3=YLrmiJ*$6LxCoL&&8N9l30NC*;tjcL~}1 z-uvW(kPjQgW&?G0vCT#OSp)semj)lC zXJwXNq3%}$Yp`Ma^o5&ubHiW1JlovC6GV6Dgl9K*yYn($;1BXRoV~P^*HpSnl@l1H zcNmZUQF<7Y84M*VC0sC?QS!Klm@xm$L%t#(`pT0s_IgTBFz)7H{PmzGd%};hCtgqB z$)0pQNYm4UiCZ$zt^D;QGzY@b*TchEMv6Eb75n2-uFHu(y$=iy{6TgHf0X~f{1<7z zz#k3V(U(-f7kskEV@v_VHo2#KkJ5@7xiFOg<@T6@nZK$r) zPSI!^N?nzRXlSdiN2sN>w7C^QBlq0cgvaamwRc&Q` z#loB8Jc2E3?P{VGYTFuW%Nr{^GPJpxx^BE^EUT+5Z>6%iDqBk{N?S{je2KQQ0m*9{ zs%c(lUaBEcyKICoH%p*?hg+|;M2{j17k7&+RrQUft?{uxmSe<=5h|fw zc5_8Bqg2Gj-BMB8^@gD>BNTrl`X5%p$#;upxwu;@xvrx-&}__4X*O<@26J)0yey{J zpAsg16k)iyUtXBRpOPhE6j`{qTV%P><^Mk;ftwq*j07(3mP&qsk?@mqENPSmb8-I$ 
zSxm-Jgz-e)BF!DC-!!U?<05aV=3mhPrqQgNC-Rq<<|hqh&qyhXvnL}@Lzc0)&_dsLR^=O8GLmQ2 zl|^@&$_nDDXdT6cHTC6PNT1pYd461ev8%qmEG@UH%$3?&JO}C0;!I^}9kiT|BIGYB z%&MwLukYaACC@UIx$04V?sa)hWrfzb!kFB+@-!;9rM4`3*5cxVETgAAruvGsRFsih zL)(gW*O$4n+Kbbm7b;Woto(diQEPVctfX6H@22SsVp?i%$dx?HYM+xq<#Lr4S7&=sp(8rWBNxVmsiQJ~WZQ~c z%A!+^S#=JsjIx4!V^Kjabz#1C6&KL?W;K>K&xpm{88@JiFAGdM;hQ9wf_OsLOnd_t4 z{^56{>D@HKF8$bkHsm*zH8^5DLLJMCS!wO!ZW{foxH?bj}Qp6A_YW8jxxLv5b*#xG5qdd+n=|e)5E?g z#wh%gq^@Cqu%)zMR##EstT@h(Yp}-QxvMP3W6vvG`CaAF_3dS~)K|=i&aAVv+2_o{ zxQ1VRV~!Qqmp5cJl|{#L{-=FTY*%KTy|&6>Nu#d#^>ld5q2YFm-C{|vPQjxQ?z<(1 zy6+hl>biGOH{MFanbmOPtu55o$A8v`oYd)0Zp6h>MK9Pn;D;9$%c7j_s3^J}z#k^k za5lYIxlIE1o(3(nPK(uQ&vROw?J=J878{M^DyDb5PmIcmisBv}bU$G=(r{YNaC%1u zSE#3I?vYDM+hDQTIj8I%!f><*Q0OGpyr0|uvoZdTrY39 zKQdBKe$=92d56c7mD4wOxv>>TUXOexPhpL8(xUKci?xI9(}#b#+}$&juE$wYs&Cdu zN)Zj`V9t!3k6czy&Jpv`qi=dBfZuF7UokikS$_RI0MVZLIYLw;N**ykEiE)`yiuP= z|DH)OVt#ts*Ui;rDWdZoazLkCeOv%~M|-O+$jbGD!*hs0lh{%`^OuYNk^FXAwuP#U z%a1$vOQ!h|KzoX)9_=)Ye!ec$Khqzbo}wB(ljhd`@XU9VkEsM}>ghx-$MAgTn!w4+ usMJlR&5PSATMgw+O$P2BkIqv1|;@L3t((Es^=nXUNyogDmM?EW8!bfpmh literal 0 HcmV?d00001 diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/description.xml b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml new file mode 100644 index 00000000..70843ade --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml @@ -0,0 +1,49 @@ + + 40981 + Australian + 4 + **Author**: Confidential. Donated by Ross Quinlan +**Source**: [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html), [UCI](https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)) - 1987 +**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) + +**Important note:** This dataset is derived from [credit-approval](https://www.openml.org/d/29), even though both datasets exist individually on UCI. 
In this version, missing values were filled in (not clear how) and a duplicate feature was removed. + +**Australian Credit Approval**. This is the famous Australian Credit Approval dataset, originating from the StatLog project. It concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data. + +This dataset was retrieved 2014-11-14 from the UCI site and converted to the ARFF format. + +__Major changes w.r.t. version 3: dataset from UCI that matches description and data types__ + + +### Feature information + +There are 6 numerical and 8 categorical attributes, all normalized to [-1,1]. The original formatting was as follows: + +A1: 0,1 CATEGORICAL (formerly: a,b) +A2: continuous. +A3: continuous. +A4: 1,2,3 CATEGORICAL (formerly: p,g,gg) +A5: 1, 2,3,4,5, 6,7,8,9,10,11,12,13,14 CATEGORICAL (formerly: ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x) +A6: 1, 2,3, 4,5,6,7,8,9 CATEGORICAL (formerly: ff,dd,j,bb,v,n,o,h,z) +A7: continuous. +A8: 1, 0 CATEGORICAL (formerly: t, f) +A9: 1, 0 CATEGORICAL (formerly: t, f) +A10: continuous. +A11: 1, 0 CATEGORICAL (formerly t, f) +A12: 1, 2, 3 CATEGORICAL (formerly: s, g, p) +A13: continuous. +A14: continuous. +A15: 1,2 class attribute (formerly: +,-) + +### Relevant Papers + +Ross Quinlan. "Simplifying decision trees", Int J Man-Machine Studies 27, Dec 1987, pp. 221-234. + +Ross Quinlan. 
"C4.5: Programs for Machine Learning", Morgan Kaufmann, Oct 1992 + 2 + ARFF + 2017-12-04T22:15:38 + Public https://api.openml.org/data/v1/download/18151910/Australian.arff + http://openml1.win.tue.nl/dataset40981/dataset_40981.pq 18151910 A15 4 derivedOpenML100study_135study_144study_218study_98 public https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval) http://openml1.win.tue.nl/dataset40981/dataset_40981.pq active + 2018-10-04 07:20:02 920e2419a28215109651fcc5cbd1662e + diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml new file mode 100644 index 00000000..ba431ff5 --- /dev/null +++ b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml @@ -0,0 +1,175 @@ + + + 0 + A1 + nominal + 0 + 1 + false + false + false + 0 + + + 1 + A2 + numeric + false + false + false + 0 + + + 2 + A3 + numeric + false + false + false + 0 + + + 3 + A4 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 4 + A5 + nominal + 1 + 10 + 11 + 12 + 13 + 14 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + false + false + false + 0 + + + 5 + A6 + nominal + 1 + 2 + 3 + 4 + 5 + 7 + 8 + 9 + false + false + false + 0 + + + 6 + A7 + numeric + false + false + false + 0 + + + 7 + A8 + nominal + 0 + 1 + false + false + false + 0 + + + 8 + A9 + nominal + 0 + 1 + false + false + false + 0 + + + 9 + A10 + numeric + false + false + false + 0 + + + 10 + A11 + nominal + 0 + 1 + false + false + false + 0 + + + 11 + A12 + nominal + 1 + 2 + 3 + false + false + false + 0 + + + 12 + A13 + numeric + false + false + false + 0 + + + 13 + A14 + numeric + false + false + false + 0 + + + 14 + A15 + nominal + 0 + 1 + true + false + false + 0 + + diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a865af56b27c6a868c9e6806a8fc73bb21606bef 
GIT binary patch literal 899 zcmaKqJx{|h5QZBkZJMSHVqgPiilD;ud#r?5%EvE2QA!Pt)VNiXpfVuA?hW&Qx!4I9 zkv7srmi@f6!?>e-FpEH&<|a zo$q~$_J;s5uZC<+HUg4ZEQkO`6D+usW%85P6>QjS8HFrZ2=3)8|0P(YBo+^#H!cJ> zups<^rO}M6{V0s%kUba7w35!I1~ru(t-;y`k4r%v$ec!G9X<-`-qAftB8^0n>~0}V zjWi|MQ)FH(b)h`!apMvS>WIF*@qZF z3?W7k;~$Tz=24X%gEEgS6>ZZ~dPdJ_rzo(diEC0k+|mJCBW+2JN^DxxD^k}rL0t;Q zy9JIWa6n*ApeyAM*U-?0Lh Path: - return cls.get_project_root().joinpath('test/data') diff --git a/test/general_checks.py b/test/general_checks.py deleted file mode 100644 index a1d8610d..00000000 --- a/test/general_checks.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from typing import Union - -from meta_automl.data_preparation.dataset import Dataset, DatasetCache -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -def assert_file_unmodified_during_test(path: Path, test_start_timestamp: float): - assert path.stat().st_mtime < test_start_timestamp, f'The file should not be modified during the test: ' \ - f'"{path.relative_to(TestDataManager.get_project_root())}".' - - -def assert_cache_file_exists(path: Path): - assert path.exists(), 'Cache not found at the path: ' \ - f'"{path.relative_to(TestDataManager.get_project_root())}".' 
- - -def check_dataset_and_cache(dataset_or_cache: Union[Dataset, DatasetCache], desired_name: str, desired_path: Path, - test_start_time: float): - assert dataset_or_cache.name == desired_name - assert dataset_or_cache.cache_path == desired_path - assert_cache_file_exists(desired_path) - if desired_name in CACHED_DATASETS: - assert_file_unmodified_during_test(desired_path, test_start_time) diff --git a/test/unit/datasets/__init__.py b/test/unit/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/unit/datasets/conftest.py b/test/unit/datasets/conftest.py new file mode 100644 index 00000000..bd43ec3e --- /dev/null +++ b/test/unit/datasets/conftest.py @@ -0,0 +1,18 @@ +import shutil + +import pytest + +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id +from test.constants import OPENML_CACHED_DATASETS, OPENML_DATASET_IDS_TO_LOAD + + +@pytest.fixture +def openml_dataset_ids(): + ids = OPENML_DATASET_IDS_TO_LOAD + yield ids + for dataset_id in ids: + if dataset_id in OPENML_CACHED_DATASETS: + continue + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + shutil.rmtree(cache_path, ignore_errors=True) diff --git a/test/unit/datasets/general_checks.py b/test/unit/datasets/general_checks.py new file mode 100644 index 00000000..5e2f446d --- /dev/null +++ b/test/unit/datasets/general_checks.py @@ -0,0 +1,24 @@ +from pathlib import Path + +import test.constants +from meta_automl.data_preparation.dataset import DatasetBase +from meta_automl.data_preparation.file_system import get_project_root +from meta_automl.data_preparation.file_system import get_dataset_cache_path + + +def assert_file_unmodified_during_test(path: Path): + failure_message = ('The file should not be modified during the test: ' + f'"{path.relative_to(get_project_root())}".') + assert path.stat().st_mtime < test.constants.TEST_START_TIMESTAMP, failure_message + + 
+def assert_cache_file_exists(path: Path): + assert path.exists(), 'Cache not found at the path: ' \ + f'"{path.relative_to(get_project_root())}".' + + +def check_dataset_cache(dataset: DatasetBase): + cache_path = get_dataset_cache_path(dataset) + assert_cache_file_exists(cache_path) + if dataset.id_ in test.constants.OPENML_CACHED_DATASETS: + assert_file_unmodified_during_test(cache_path) diff --git a/test/unit/datasets/test_custom_dataset.py b/test/unit/datasets/test_custom_dataset.py new file mode 100644 index 00000000..5f34b194 --- /dev/null +++ b/test/unit/datasets/test_custom_dataset.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest + +from meta_automl.data_preparation.dataset import DataNotFoundError, CustomDataset, DatasetData +from test.unit.datasets.general_checks import assert_cache_file_exists + + +@pytest.fixture(scope='module') +def new_dataset_data(): + dataset_data = DatasetData( + x=np.array([['a', 'b'], ['b', 'a']]), + y=np.array([5, 10]), + categorical_indicator=[True, True], + attribute_names=['foo', 'bar'] + ) + return dataset_data + + +@pytest.fixture(scope='module') +def new_dataset(new_dataset_data): + dataset = CustomDataset(42) + dataset.dump_data(new_dataset_data) + yield dataset + dataset.cache_path.unlink() + + +def test_error_on_missing_dataset_cache(): + with pytest.raises(DataNotFoundError): + CustomDataset('random_missing_dataset').get_data() + + +def test_custom_dataset_dumping(new_dataset): + # Act + cache_path = new_dataset.cache_path + # Assert + assert_cache_file_exists(cache_path) + + +def test_custom_dataset_data_loading(new_dataset_data, new_dataset): + # Act + correct_data = new_dataset_data + dataset = new_dataset + data = dataset.get_data() + # Assert + assert np.all(np.equal(data.x, correct_data.x)) + assert np.all(np.equal(data.y, correct_data.y)) + assert data.categorical_indicator == correct_data.categorical_indicator + assert data.attribute_names == correct_data.attribute_names diff --git 
a/test/unit/datasets/test_datasets_loaders.py b/test/unit/datasets/test_datasets_loaders.py new file mode 100644 index 00000000..0fd1ce17 --- /dev/null +++ b/test/unit/datasets/test_datasets_loaders.py @@ -0,0 +1,24 @@ +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from test.unit.datasets.general_checks import check_dataset_cache + + +def test_group_load_new_datasets(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + datasets = loader.load(openml_dataset_ids) + assert loader.dataset_ids == openml_dataset_ids + for dataset_id, dataset in zip(openml_dataset_ids, datasets): + check_dataset_cache(dataset) + + +def test_load_single(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + for dataset_id in openml_dataset_ids: + dataset = loader.load_single(dataset_id) + check_dataset_cache(dataset) + + +def test_load_new_datasets_on_demand(openml_dataset_ids): + loader = OpenMLDatasetsLoader() + for dataset_id in openml_dataset_ids: + dataset = loader.load_single(dataset_id) + check_dataset_cache(dataset) diff --git a/test/unit/datasets/test_file_dataset.py b/test/unit/datasets/test_file_dataset.py new file mode 100644 index 00000000..125cb641 --- /dev/null +++ b/test/unit/datasets/test_file_dataset.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest + +from meta_automl.data_preparation.dataset import CacheNotFoundError, FileDataset, DatasetData +from test.unit.datasets.general_checks import assert_cache_file_exists + + +@pytest.fixture(scope='module') +def new_dataset_data(): + dataset_data = DatasetData( + x=np.array([['a', 'b'], ['b', 'a']]), + y=np.array([5, 10]), + categorical_indicator=[True, True], + attribute_names=['foo', 'bar'] + ) + return dataset_data + + +@pytest.fixture(scope='module') +def new_dataset(new_dataset_data): + dataset = FileDataset(42) + dataset.dump_data(new_dataset_data) + yield dataset + dataset.cache_path.unlink() + + +def test_error_on_missing_dataset_cache(): + with 
pytest.raises(CacheNotFoundError): + FileDataset('random_missing_dataset').get_data() + + +def test_file_dataset_dumping(new_dataset): + # Act + cache_path = new_dataset.cache_path + # Assert + assert_cache_file_exists(cache_path) + + +def test_file_dataset_data_loading(new_dataset_data, new_dataset): + # Act + correct_data = new_dataset_data + dataset = new_dataset + data = dataset.get_data() + # Assert + assert np.all(np.equal(data.x, correct_data.x)) + assert np.all(np.equal(data.y, correct_data.y)) + assert data.categorical_indicator == correct_data.categorical_indicator + assert data.attribute_names == correct_data.attribute_names diff --git a/test/unit/datasets/test_openml_dataset.py b/test/unit/datasets/test_openml_dataset.py new file mode 100644 index 00000000..81042648 --- /dev/null +++ b/test/unit/datasets/test_openml_dataset.py @@ -0,0 +1,27 @@ +from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id +from test.constants import OPENML_CACHED_DATASETS +from test.unit.datasets.general_checks import check_dataset_cache + + +def test_openml_dataset_creation(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + dataset = OpenMLDataset(dataset_id) + + assert dataset.id_ == dataset_id + + +def test_openml_dataset_is_cached_cached(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + + is_exist = dataset_id in OPENML_CACHED_DATASETS + assert is_exist == cache_path.exists() + + +def test_openml_dataset_data_loading(openml_dataset_ids): + for dataset_id in openml_dataset_ids: + dataset = OpenMLDataset(dataset_id) + dataset_data = dataset.get_data() + assert isinstance(dataset_data, DatasetData) + check_dataset_cache(dataset) diff --git a/test/unit/test_dataset.py b/test/unit/test_dataset.py deleted file mode 100644 index 3ac46d6d..00000000 --- a/test/unit/test_dataset.py +++ 
/dev/null @@ -1,40 +0,0 @@ -import numpy as np -import pytest - -from meta_automl.data_preparation.dataset import DatasetCache, NoCacheError -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -@pytest.fixture -def dumped_cache_path(): - path = TestDataManager.get_dataset_cache_path('data_dumped') - yield path - path.unlink() - - -def test_dataset_caching(dumped_cache_path): - dataset_name = CACHED_DATASETS[0] - - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) - - dataset_cache = DatasetCache(dataset_name, cache_path) - dataset = dataset_cache.from_cache() - dumped_cache = dataset.dump_to_cache(dumped_cache_path) - reloaded_dataset = dumped_cache.from_cache() - # Check data integrity. - assert dataset.name == dataset_name - assert reloaded_dataset.name == dataset_name - assert dataset.id == reloaded_dataset.id - assert np.all(np.equal(dataset.x, reloaded_dataset.x)) - assert np.all(np.equal(dataset.y, reloaded_dataset.y)) - # Check caching integrity. 
- assert dataset_cache.cache_path == cache_path - assert dataset.cache_path == cache_path - assert dumped_cache.cache_path == dumped_cache_path - assert reloaded_dataset.cache_path == dumped_cache_path - - -def test_error_on_missing_dataset_cache(): - with pytest.raises(NoCacheError): - DatasetCache('random_missing_cache').from_cache() diff --git a/test/unit/test_datasets_loaders.py b/test/unit/test_datasets_loaders.py deleted file mode 100644 index 1596e312..00000000 --- a/test/unit/test_datasets_loaders.py +++ /dev/null @@ -1,50 +0,0 @@ -import time - -import pytest - -from meta_automl.data_preparation.dataset import DatasetCache -from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader -from test.general_checks import check_dataset_and_cache -from test.constants import CACHED_DATASETS -from test.data_manager import TestDataManager - - -@pytest.fixture -def dataset_names(): - dataset_names = ['australian', 'blood-transfusion-service-center'] - yield dataset_names - for dataset_name in dataset_names: - if dataset_name not in CACHED_DATASETS: - TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True) - - -def test_group_load_new_datasets(dataset_names): - test_start_time = time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - - datasets = loader.load(dataset_names) - - assert loader.dataset_sources == dataset_names - - for dataset_name, dataset_cache in zip(dataset_names, datasets): - check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time) - - -def test_load_single(dataset_names): - test_start_time = time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - for dataset_name in dataset_names: - dataset_cache = loader.load_single(dataset_name) - check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time) - - -def test_load_new_datasets_on_demand(dataset_names): - test_start_time = 
time.time() - loader = OpenMLDatasetsLoader() - loader.data_manager = TestDataManager - for dataset_name in dataset_names: - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) - dataset = loader.cache_to_memory(DatasetCache(dataset_name, cache_path)) - check_dataset_and_cache(dataset, dataset_name, cache_path, test_start_time) diff --git a/test/unit/test_file_system.py b/test/unit/test_file_system.py new file mode 100644 index 00000000..dba55923 --- /dev/null +++ b/test/unit/test_file_system.py @@ -0,0 +1,7 @@ +import pytest +from pathlib import Path + +from meta_automl.data_preparation.file_system import get_data_dir, get_project_root + +# def test_root_dir(): +# assert get_project_root() == diff --git a/test/unit/test_meta_features_extractors.py b/test/unit/test_meta_features_extractors.py index c5625f53..bd9b925b 100644 --- a/test/unit/test_meta_features_extractors.py +++ b/test/unit/test_meta_features_extractors.py @@ -1,38 +1,37 @@ -import time +import shutil import pytest +from meta_automl.data_preparation.dataset import OpenMLDataset +from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id, get_meta_features_cache_path from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from test.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists -from test.data_manager import TestDataManager -from test.constants import CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES +from test.unit.datasets.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists +from test.constants import OPENML_DATASET_IDS_TO_LOAD, OPENML_CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES @pytest.fixture -def dataset_names(): - dataset_names = ['australian', 'monks-problems-1', 'monks-problems-2', 'blood-transfusion-service-center'] - yield dataset_names - for dataset_name in dataset_names: - if dataset_name not in CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES: - 
TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True) - if dataset_name not in DATASETS_WITH_CACHED_META_FEATURES: - TestDataManager.get_meta_features_cache_path(dataset_name, PymfeExtractor.SOURCE).unlink(missing_ok=True) +def dataset_ids(): + dataset_ids = set(OPENML_CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES + OPENML_DATASET_IDS_TO_LOAD) + yield dataset_ids + for dataset_id in dataset_ids: + if dataset_id not in OPENML_CACHED_DATASETS: + dataset_cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) + shutil.rmtree(dataset_cache_path) + if dataset_id not in DATASETS_WITH_CACHED_META_FEATURES: + mf_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id) + mf_cache_path.unlink(missing_ok=True) -def test_meta_features_extraction(dataset_names): - test_start_time = time.time() +def test_meta_features_extraction(dataset_ids): extractor = PymfeExtractor(extractor_params={'groups': 'general'}) - extractor.data_manager = TestDataManager - extractor.datasets_loader.data_manager = TestDataManager - meta_features = extractor.extract(dataset_names) - assert list(meta_features.index) == dataset_names - for dataset_name in dataset_names: - meta_features_cache_path = TestDataManager.get_meta_features_cache_path( - dataset_name, extractor.SOURCE) + meta_features = extractor.extract(dataset_ids) + assert set(meta_features.index) == dataset_ids + for dataset_id in dataset_ids: + meta_features_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id) assert_cache_file_exists(meta_features_cache_path) - if dataset_name in DATASETS_WITH_CACHED_META_FEATURES: - assert_file_unmodified_during_test(meta_features_cache_path, test_start_time) + if dataset_id in DATASETS_WITH_CACHED_META_FEATURES: + assert_file_unmodified_during_test(meta_features_cache_path) else: - cache_path = TestDataManager.get_dataset_cache_path(dataset_name) + cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id) 
assert_cache_file_exists(cache_path) From 0b9ed49fcc200a8ecc2bc658ff285a7fb367f57c Mon Sep 17 00:00:00 2001 From: max Date: Mon, 3 Jul 2023 20:44:53 +0300 Subject: [PATCH 46/60] Auto-sklearn baseline in a progress --- .../__init__.py | 0 experiments/auto-sklearn/experimental_data.csv | 17 +++++++++++++++++ .../openml_suite.py | 0 3 files changed, 17 insertions(+) rename experiments/{auto-sklearn_run => auto-sklearn}/__init__.py (100%) create mode 100644 experiments/auto-sklearn/experimental_data.csv rename experiments/{auto-sklearn_run => auto-sklearn}/openml_suite.py (100%) diff --git a/experiments/auto-sklearn_run/__init__.py b/experiments/auto-sklearn/__init__.py similarity index 100% rename from experiments/auto-sklearn_run/__init__.py rename to experiments/auto-sklearn/__init__.py diff --git a/experiments/auto-sklearn/experimental_data.csv b/experiments/auto-sklearn/experimental_data.csv new file mode 100644 index 00000000..18dbec46 --- /dev/null +++ b/experiments/auto-sklearn/experimental_data.csv @@ -0,0 +1,17 @@ +dataset_id,dataset_name,run_label,roc_auc,f1,accuracy,logloss,precision,fit_time,inference_time,model_str +1590,adult,Auto-sklearn,-0.798,-0.923,-0.878,4.391,-0.897,596.2,0.1,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1.7108930238344161e-10, learning_rate=0.010827728124541558, loss='auto', max_iter=512, max_leaf_nodes=25, min_samples_leaf=4, n_iter_no_change=19, random_state=1, validation_fraction=0.1759114608225653, warm_start=True)" +1461,bank-marketing,Auto-sklearn,-0.695,-0.507,-0.9,3.587,-0.63,595.0,0.6,"RandomForestClassifier(max_features=4, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" +1464,blood-transfusion-service-center,Auto-sklearn,-0.627,-0.415,-0.793,7.449,-0.688,593.6,0.0,"MLPClassifier(alpha=6.875656304664039e-05, beta_1=0.999, beta_2=0.9, hidden_layer_sizes=(224,), learning_rate_init=0.00011403871479850849, max_iter=256, n_iter_no_change=32, random_state=1, validation_fraction=0.0, 
verbose=0, warm_start=True)" +1489,phoneme,Auto-sklearn,-0.881,-0.82,-0.891,3.934,-0.787,593.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=0.00030344870480744136, learning_rate=0.18318625129457267, loss='auto', max_iter=512, max_leaf_nodes=366, min_samples_leaf=2, n_iter_no_change=11, random_state=1, validation_fraction=None, warm_start=True)" +40975,car,Auto-sklearn,-0.999,-0.949,-0.98,0.114,-0.93,598.1,0.0,"RandomForestClassifier(bootstrap=False, max_features=2, min_samples_split=3, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" +40996,fashion-mnist,Auto-sklearn,-0.968,-0.866,-0.866,1.945,-0.866,596.9,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')" +41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.974,-0.82,-0.866,0.275,-0.826,596.7,0.2,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=9.674948183980905e-09, learning_rate=0.014247987845444413, loss='auto', max_iter=512, max_leaf_nodes=55, min_samples_leaf=164, n_iter_no_change=1, random_state=1, validation_fraction=0.11770489601182355, warm_start=True)" +54,vehicle,Auto-sklearn,-0.965,-0.847,-0.859,0.896,-0.848,594.9,0.0,LinearDiscriminantAnalysis(tol=0.06932929810851429)roc_auc,f1,accuracy,logloss,precision,dataset_id,dataset_name,run_label,fit_time,inference_time,model_str +1590,adult,Hist gradient boosting classifier,-0.806,-0.924,-0.88,4.313,-0.903,0.4,0.0, +1461,bank-marketing,Hist gradient boosting classifier,-0.718,-0.544,-0.904,3.452,-0.639,0.7,0.0, +1464,blood-transfusion-service-center,Hist gradient boosting classifier,-0.565,-0.328,-0.7,10.813,-0.367,0.1,0.0, +1489,phoneme,Hist gradient boosting classifier,-0.876,-0.82,-0.894,3.834,-0.806,0.2,0.0, +40975,car,Hist gradient boosting classifier,-1.0,-0.94,-0.986,0.025,-0.93,0.7,0.0, +40996,fashion-mnist,Hist gradient boosting classifier,-0.993,-0.904,-0.904,0.267,-0.903,113.4,0.9, +41027,jungle_chess_2pcs_raw_endgame_complete,Hist gradient boosting 
classifier,-0.976,-0.825,-0.87,0.272,-0.833,1.4,0.1, +54,vehicle,Hist gradient boosting classifier,-0.928,-0.763,-0.782,0.778,-0.768,1.2,0.0, diff --git a/experiments/auto-sklearn_run/openml_suite.py b/experiments/auto-sklearn/openml_suite.py similarity index 100% rename from experiments/auto-sklearn_run/openml_suite.py rename to experiments/auto-sklearn/openml_suite.py From 42e343ba956f0a73c46cbd9689fa552a468837f3 Mon Sep 17 00:00:00 2001 From: max Date: Mon, 3 Jul 2023 20:46:59 +0300 Subject: [PATCH 47/60] WIP: auto-sklearn baseline --- experiments/__init__.py | 0 experiments/auto-sklearn/openml_suite.py | 147 ++++++++++++++++------- experiments/fedot_warm_start/run.py | 30 +++-- 3 files changed, 120 insertions(+), 57 deletions(-) create mode 100644 experiments/__init__.py diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/auto-sklearn/openml_suite.py b/experiments/auto-sklearn/openml_suite.py index 588d3b93..4a655653 100644 --- a/experiments/auto-sklearn/openml_suite.py +++ b/experiments/auto-sklearn/openml_suite.py @@ -1,57 +1,73 @@ +import csv +import os import pickle import re +import time import numpy as np import json import autosklearn.classification +import autosklearn.ensembles from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice -from experiments.fedot_warm_start.run import prepare_data +# from experiments.fedot_warm_start.run import fetch_openml_data, mock_data_fetching from sklearn import model_selection, metrics +from sklearn import ensemble from sklearn.base import ClassifierMixin -class AutoSklearnEncoder(json.JSONEncoder): - def default(self, 
o): - if isinstance(o, ClassifierChoice): - return repr(o.choice.estimator) - # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)): - # return None - if isinstance(o, ClassifierMixin): - return re.sub(r'\s{2,}', ' ', repr(o)) - elif isinstance(o, Balancing): - return repr(o) - elif isinstance(o, np.integer): - return int(o) - elif isinstance(o, np.floating): - return float(o) +# class AutoSklearnEncoder(json.JSONEncoder): +# def default(self, o): +# # if isinstance(o, dict): +# # return json.dumps(o) +# if isinstance(o, ClassifierChoice): +# return repr(o.choice.estimator) +# # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)): +# # return None +# elif isinstance(o, ClassifierMixin): +# return re.sub(r'\s{2,}', ' ', repr(o)) +# # elif isinstance(o, Balancing): +# # return repr(o) +# elif isinstance(o, np.integer): +# return int(o) +# elif isinstance(o, np.floating): +# return float(o) -class AutoSklearnValidator: +class AutoSklearnBaseline: def __init__(self): pass @staticmethod def main(): - ds_with_ids, ds_names = prepare_data() - train_ds_names, test_ds_names = ds_names + openml_data = None + # dataset_names = [dataset.name for dataset in openml_data] - ds_ids, datasets = ds_with_ids + # train_data_names, test_data_names = model_selection.train_test_split( + # [dataset.name for dataset in openml_data], + # test_size=0.2, + # random_state=42 + # ) + # train_ds_names, test_ds_names = ds_names - for ds_name in train_ds_names: - # if train_ds_names[0] is not None: - print("Sanity check") - dataset = datasets[ds_name].from_cache() + # ds_ids, datasets = ds_with_ids - # cannot wait longer because of the slow data fetching, issue#9 - estimator = autosklearn.classification.AutoSklearnClassifier( - time_left_for_this_task=60 - ) + # for ds_name in train_ds_names: + + for iteration, dataset in enumerate(openml_data): + print(f"Fetched data name: {dataset.name}") + dataset = dataset.from_cache() + + # estimator = 
autosklearn.classification.AutoSklearnClassifier( + # ensemble_class=autosklearn.ensembles.SingleBest, + # time_left_for_this_task=600 + # ) + estimator = ensemble.HistGradientBoostingClassifier() X_train, X_test, y_train, y_test = model_selection.train_test_split( dataset.x, @@ -60,32 +76,75 @@ def main(): random_state=42 ) + fitting_start_time = time.time() pipeline = estimator.fit(X_train, y_train) + fitting_end_time = time.time() - fitting_start_time + # print(f"Fitting time is {fitting_end_time}sec") + inference_start_time = time.time() predictions = estimator.predict(X_test) - - quality_estimation = metrics.roc_auc_score(y_test, predictions) - - results = { - 'ensemble': pipeline.show_models(), - 'score': quality_estimation + inference_end_time = time.time() - inference_start_time + + prediction_probabilities = estimator.predict_proba(X_test) + + is_multi_classification_problem = True if len(set(predictions)) > 2 else False + # print(f"Inference time is {inference_end_time}sec") + # roc_auc_score = metrics.roc_auc_score(y_test, predictions) + + # autosklearn_ensemble = pipeline.show_models() + # formatted_ensemble = { + # model_id: { + # 'rank': autosklearn_ensemble[model_id].get('rank'), + # 'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"), + # 'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'), + # 'model': autosklearn_ensemble[model_id].get('sklearn_classifier') + # } for model_id in autosklearn_ensemble.keys() + # } + + # best_single_model = list(pipeline.show_models().values())[0].get('sklearn_classifier') + best_single_model = repr(pipeline) + # encoded_ensemble = str(formatted_ensemble).encode('base64') + + # print(f"y_test is {predictions}") + + general_run_info = { + # 'id': iteration + 1, + 'dataset_id': dataset.id, + 'dataset_name': dataset.name, + 'run_label': 'Hist gradient boosting classifier' + } + average = 'macro' if is_multi_classification_problem else 'binary' + model_dependent_run_info = { + 
'roc_auc': -1 * float(f"{metrics.roc_auc_score(y_test, prediction_probabilities if is_multi_classification_problem else predictions, multi_class='ovr'):.3f}"), + 'f1': -1 * float(f"{metrics.f1_score(y_test, predictions, average=average):.3f}"), + 'accuracy': -1 * float(f"{metrics.accuracy_score(y_test, predictions):.3f}"), + 'logloss': float(f"{metrics.log_loss(y_test, prediction_probabilities if is_multi_classification_problem else predictions):.3f}"), + 'precision': -1 * float(f"{metrics.precision_score(y_test, predictions, average=average):.3f}"), + 'fit_time': float(f'{fitting_end_time:.1f}'), + 'inference_time': float(f'{inference_end_time:.1f}'), + # 'model_str': re.sub(r'\s{2,}', ' ', repr(best_single_model)) + 'model_str': None } + results = {**general_run_info, **model_dependent_run_info} + + # for key in autosklearn_ensemble.keys(): + # ensemble_model = autosklearn_ensemble[key] + # formatted_ensemble = results['ensemble'] + # for model_id in formatted_ensemble.keys(): + # formatted_ensemble[model_id] = ensemble_model.get("rank", None) # pickle.dump(pipeline.show_models(), open("results.pickle", "wb")) # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier"))) - with open("results.json", "w") as file: - json.dump( - results, - file, - cls=AutoSklearnEncoder, - indent=2 - ) - -if __name__ == '__main__': - AutoSklearnValidator.main() - + # knowledge_base_path = os.path.dirname('knowledge_base_0') + with open('experimental_data.csv', 'a', newline='') as file: + writer = csv.writer(file, delimiter=',') + # if iteration == 0: + # writer.writerow(results.keys()) + writer.writerow(results.values()) +if __name__ == '__main__': + AutoSklearnBaseline.main() diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 1df6a0b6..60cbdcb3 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -1,6 +1,7 @@ import functools import json import logging +import 
os import timeit from datetime import datetime from itertools import chain @@ -57,7 +58,7 @@ TIME_NOW = None TIME_NOW_FOR_PATH = None -DEBUG = False +DEBUG = True def setup_logging(): @@ -66,7 +67,9 @@ def setup_logging(): global TIME_NOW_FOR_PATH TIME_NOW_FOR_PATH = time_now_for_path = time_now.replace(":", ".") global SAVE_DIR - SAVE_DIR = save_dir = Path(f'run_{time_now_for_path}') + SAVE_DIR = save_dir = Path(__file__).parent\ + .resolve()\ + .joinpath(f'run_{time_now_for_path}') save_dir.mkdir() log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) @@ -78,7 +81,7 @@ def setup_logging(): ) -def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]: +def fetch_openml_data() -> List[DatasetCache]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data if N_DATASETS is not None: @@ -86,8 +89,11 @@ def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]: dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) dataset_ids = list(dataset_ids) - datasets = {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)} - return dataset_ids, datasets + data = [cache for cache in OpenMLDatasetsLoader().load(dataset_ids)] + return data + +def mock_data_fetching() -> List[DatasetCache]: + return [cache for cache in OpenMLDatasetsLoader().load([1590, 1461, 1464, 1489, 40975, 40996, 41027, 54])] def transform_data_for_fedot(data: Dataset) -> (np.array, np.array): @@ -176,28 +182,26 @@ def extract_best_history_models(dataset_cache, history): return best_models -def prepare_data() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]: - dataset_ids, datasets = fetch_openml_data() +def ds_train_test_split() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]: + openml_data = fetch_openml_data() train_data_names, test_data_names = train_test_split( - list(datasets.keys()), + [dataset.name for 
dataset in openml_data], test_size=TEST_SIZE, random_state=SEED ) - return (dataset_ids, datasets), (train_data_names, test_data_names) + return train_data_names, test_data_names def main(): baseline_pipeline = PipelineBuilder().add_node('rf').build() - ds_with_ids, dataset_names = prepare_data() + ds_with_ids, dataset_names = ds_train_test_split() train_ds_names, test_ds_names = dataset_names ds_ids, datasets = ds_with_ids - data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names) - results = [] best_models_per_dataset = {} progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a') @@ -221,7 +225,7 @@ def main(): except Exception: logging.exception(f'Train dataset "{name}"') - data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train) + data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names) model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) model_advisor.fit(best_models_per_dataset) From 26d57b848468276a5ff1ef625aa6f75244d99cf8 Mon Sep 17 00:00:00 2001 From: max Date: Wed, 5 Jul 2023 19:44:58 +0300 Subject: [PATCH 48/60] Implemented Auto-sklearn baseline. 
--- ...dvise_models_from_similar_datasets.py.orig | 47 ------ experiments/auto-sklearn/baseline.py | 159 ++++++++++++++++++ .../auto-sklearn/experimental_data.csv | 74 ++++++-- experiments/auto-sklearn/openml_suite.py | 150 ----------------- 4 files changed, 216 insertions(+), 214 deletions(-) delete mode 100644 examples/4_advising_models/advise_models_from_similar_datasets.py.orig create mode 100644 experiments/auto-sklearn/baseline.py delete mode 100644 experiments/auto-sklearn/openml_suite.py diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py.orig b/examples/4_advising_models/advise_models_from_similar_datasets.py.orig deleted file mode 100644 index c8f50581..00000000 --- a/examples/4_advising_models/advise_models_from_similar_datasets.py.orig +++ /dev/null @@ -1,47 +0,0 @@ -from fedot.core.pipelines.pipeline_builder import PipelineBuilder -from golem.core.optimisers.fitness import SingleObjFitness -from sklearn.model_selection import train_test_split - -from meta_automl.data_preparation.dataset import DatasetCache -from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader -from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor -from meta_automl.data_preparation.model import Model -from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor -from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor - - -def main(): - # Define datasets. - dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing'] - # Extract meta-features and load on demand. - extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader()) - meta_features = extractor.extract(dataset_names) - # Preprocess meta-features, as KNN does not support NaNs. - meta_features = meta_features.dropna(axis=1, how='any') - # Split datasets to train (preprocessing) and test (actual meta-algorithm objects). 
- x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42) - y_train = x_train.index -<<<<<<< HEAD - assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2) -||||||| 9541bf7 - assessor = KNNSimilarityAssessor({'n_neighbors': 2}, n_best=2) -======= - assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2) ->>>>>>> e140a34de32bf20396693e888560bcc51fb5539e - assessor.fit(x_train, y_train) - # Define best models for datasets. - best_pipelines = [ - PipelineBuilder().add_node('scaling').add_node('rf').build(), - PipelineBuilder().add_node('normalization').add_node('logit').build(), - PipelineBuilder().add_node('rf').add_node('logit').build() - ] - best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))] - for dataset_name, pipeline in zip(y_train, best_pipelines)] - - dataset_names_to_best_pipelines = dict(zip(y_train, best_models)) - advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines) - return advisor.predict(x_test) - - -if __name__ == '__main__': - result = main() diff --git a/experiments/auto-sklearn/baseline.py b/experiments/auto-sklearn/baseline.py new file mode 100644 index 00000000..da1c21a4 --- /dev/null +++ b/experiments/auto-sklearn/baseline.py @@ -0,0 +1,159 @@ +import csv +import time + +from typing import Any, Tuple, Dict + +import numpy as np +import logging + +import autosklearn.classification +import autosklearn.ensembles + +from sklearn import model_selection, metrics + +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.models_loaders import KnowledgeBaseModelsLoader +from autosklearn.classification import AutoSklearnClassifier + + + +class AutoSklearnBaseline: + def __init__(self, estimator_ensemble_type, time_limit): + self.estimator = AutoSklearnClassifier( + ensemble_class=estimator_ensemble_type, + time_left_for_this_task=time_limit, + ) + 
self.knowledge_base_loader = KnowledgeBaseModelsLoader() + + def make_quality_metric_estimates(self, y, predictions, prediction_proba, is_multi_label): + """ Compute roc_auc, f1, accuracy, log_loss and precision scores. """ + results = { + 'roc_auc': -1 * float( + "{:.3f}".format( + metrics.roc_auc_score( + y, + prediction_proba if is_multi_label else predictions, + multi_class='ovr' + ) + ) + ), + 'f1': -1 * float( + "{:.3f}".format( + metrics.f1_score( + y, + predictions, + average='macro' if is_multi_label else 'binary' + ) + ) + ), + 'accuracy': -1 * float( + "{:.3f}".format( + metrics.accuracy_score( + y, + predictions + ) + ) + ), + 'logloss': float( + "{:.3f}".format( + metrics.log_loss( + y, + prediction_proba if is_multi_label else predictions + ) + ) + ), + 'precision': -1 * float( + "{:.3f}".format( + metrics.precision_score( + y, + predictions, + average='macro' if is_multi_label else 'binary', + labels=np.unique(predictions) + ) + ) + ) + } + return results + + def run(self): + """ Fit auto-sklearn meta-optimizer to knowledge base datasets and output a single best model. 
""" + dataset_ids_to_load = [ + dataset_id for dataset_id in self.knowledge_base_loader + .parse_datasets('test') + .loc[:, 'dataset_id'] + ] + dataset_ids_to_load = [dataset_ids_to_load[dataset_ids_to_load.index(41166)]] + + loaded_datasets = OpenMLDatasetsLoader().load(dataset_ids_to_load) + + for iteration, dataset in enumerate(loaded_datasets): + logging.log(logging.INFO, f"Loaded dataset name: {dataset.name}") + dataset = dataset.from_cache() + + X_train, X_test, y_train, y_test = model_selection.train_test_split( + dataset.x, + dataset.y, + test_size=0.2, + random_state=42, + stratify=dataset.y + ) + + fitting_start_time = time.time() + ensemble = self.estimator.fit(X_train, y_train) + fitting_time = time.time() - fitting_start_time + logging.log(logging.INFO, f"Fitting time is {fitting_time}sec") + + inference_start_time = time.time() + predicted_results = self.estimator.predict(X_test) + inference_time = time.time() - inference_start_time + logging.log(logging.INFO, f"Inference time is {inference_time}sec") + + predicted_probabilities = self.estimator.predict_proba(X_test) + + best_single_model = list(ensemble.show_models().values())[0].get('sklearn_classifier') + + # autosklearn_ensemble = pipeline.show_models() + # formatted_ensemble = { + # model_id: { + # 'rank': autosklearn_ensemble[model_id].get('rank'), + # 'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"), + # 'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'), + # 'model': autosklearn_ensemble[model_id].get('sklearn_classifier') + # } for model_id in autosklearn_ensemble.keys() + # } + + general_run_info = { + 'dataset_id': dataset.id, + 'dataset_name': dataset.name, + 'run_label': 'Auto-sklearn', + } + + is_multilabel_classification = True if len(set(predicted_results)) > 2 else False + quality_metric_estimates = self.make_quality_metric_estimates( + y_test, + predicted_results, + predicted_probabilities, + is_multilabel_classification + ) + + 
model_dependent_run_info = { + 'fit_time': float(f'{fitting_time:.1f}'), + 'inference_time': float(f'{inference_time:.1f}'), + 'model_str': repr(best_single_model) + } + + results = {**general_run_info, **quality_metric_estimates, **model_dependent_run_info} + + # for key in autosklearn_ensemble.keys(): + # ensemble_model = autosklearn_ensemble[key] + # formatted_ensemble = results['ensemble'] + # for model_id in formatted_ensemble.keys(): + # formatted_ensemble[model_id] = ensemble_model.get("rank", None) + + with open('experimental_data.csv', 'a', newline='') as file: + writer = csv.writer(file, delimiter=',') + writer.writerow(results.values()) + + +if __name__ == '__main__': + AutoSklearnBaseline(autosklearn.ensembles.SingleBest, 600).run() diff --git a/experiments/auto-sklearn/experimental_data.csv b/experiments/auto-sklearn/experimental_data.csv index 18dbec46..7a3f3cfa 100644 --- a/experiments/auto-sklearn/experimental_data.csv +++ b/experiments/auto-sklearn/experimental_data.csv @@ -1,17 +1,57 @@ -dataset_id,dataset_name,run_label,roc_auc,f1,accuracy,logloss,precision,fit_time,inference_time,model_str -1590,adult,Auto-sklearn,-0.798,-0.923,-0.878,4.391,-0.897,596.2,0.1,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1.7108930238344161e-10, learning_rate=0.010827728124541558, loss='auto', max_iter=512, max_leaf_nodes=25, min_samples_leaf=4, n_iter_no_change=19, random_state=1, validation_fraction=0.1759114608225653, warm_start=True)" -1461,bank-marketing,Auto-sklearn,-0.695,-0.507,-0.9,3.587,-0.63,595.0,0.6,"RandomForestClassifier(max_features=4, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" -1464,blood-transfusion-service-center,Auto-sklearn,-0.627,-0.415,-0.793,7.449,-0.688,593.6,0.0,"MLPClassifier(alpha=6.875656304664039e-05, beta_1=0.999, beta_2=0.9, hidden_layer_sizes=(224,), learning_rate_init=0.00011403871479850849, max_iter=256, n_iter_no_change=32, random_state=1, validation_fraction=0.0, verbose=0, 
warm_start=True)" -1489,phoneme,Auto-sklearn,-0.881,-0.82,-0.891,3.934,-0.787,593.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=0.00030344870480744136, learning_rate=0.18318625129457267, loss='auto', max_iter=512, max_leaf_nodes=366, min_samples_leaf=2, n_iter_no_change=11, random_state=1, validation_fraction=None, warm_start=True)" -40975,car,Auto-sklearn,-0.999,-0.949,-0.98,0.114,-0.93,598.1,0.0,"RandomForestClassifier(bootstrap=False, max_features=2, min_samples_split=3, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)" -40996,fashion-mnist,Auto-sklearn,-0.968,-0.866,-0.866,1.945,-0.866,596.9,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')" -41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.974,-0.82,-0.866,0.275,-0.826,596.7,0.2,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=9.674948183980905e-09, learning_rate=0.014247987845444413, loss='auto', max_iter=512, max_leaf_nodes=55, min_samples_leaf=164, n_iter_no_change=1, random_state=1, validation_fraction=0.11770489601182355, warm_start=True)" -54,vehicle,Auto-sklearn,-0.965,-0.847,-0.859,0.896,-0.848,594.9,0.0,LinearDiscriminantAnalysis(tol=0.06932929810851429)roc_auc,f1,accuracy,logloss,precision,dataset_id,dataset_name,run_label,fit_time,inference_time,model_str -1590,adult,Hist gradient boosting classifier,-0.806,-0.924,-0.88,4.313,-0.903,0.4,0.0, -1461,bank-marketing,Hist gradient boosting classifier,-0.718,-0.544,-0.904,3.452,-0.639,0.7,0.0, -1464,blood-transfusion-service-center,Hist gradient boosting classifier,-0.565,-0.328,-0.7,10.813,-0.367,0.1,0.0, -1489,phoneme,Hist gradient boosting classifier,-0.876,-0.82,-0.894,3.834,-0.806,0.2,0.0, -40975,car,Hist gradient boosting classifier,-1.0,-0.94,-0.986,0.025,-0.93,0.7,0.0, -40996,fashion-mnist,Hist gradient boosting classifier,-0.993,-0.904,-0.904,0.267,-0.903,113.4,0.9, -41027,jungle_chess_2pcs_raw_endgame_complete,Hist gradient boosting 
classifier,-0.976,-0.825,-0.87,0.272,-0.833,1.4,0.1, -54,vehicle,Hist gradient boosting classifier,-0.928,-0.763,-0.782,0.778,-0.768,1.2,0.0, +1461,bank-marketing,Auto-sklearn,-0.711,-0.535,-0.907,3.34,-0.648,598.0,0.1,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.7108930238344161e-10, + learning_rate=0.010827728124541558, loss='auto', + max_iter=512, max_leaf_nodes=25, + min_samples_leaf=4, n_iter_no_change=19, + random_state=1, + validation_fraction=0.1759114608225653, + warm_start=True)" +179,adult,Auto-sklearn,-0.774,-0.91,-0.859,5.077,-0.885,595.3,0.1,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.7108930238344161e-10, + learning_rate=0.010827728124541558, loss='auto', + max_iter=512, max_leaf_nodes=25, + min_samples_leaf=4, n_iter_no_change=19, + random_state=1, + validation_fraction=0.1759114608225653, + warm_start=True)" +1464,blood-transfusion-service-center,Auto-sklearn,-0.669,-0.5,-0.8,7.209,-0.625,597.6,0.0,"PassiveAggressiveClassifier(C=0.253246830865058, average=True, max_iter=16, + random_state=1, tol=0.01676578241454229, + warm_start=True)" +991,car,Auto-sklearn,-1.0,-1.0,-1.0,0.0,-1.0,596.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.9280388598217333e-10, + learning_rate=0.24233932723531437, loss='auto', + max_iter=128, max_leaf_nodes=35, + min_samples_leaf=17, n_iter_no_change=1, + random_state=1, validation_fraction=None, + warm_start=True)" +1489,phoneme,Auto-sklearn,-0.848,-0.797,-0.887,4.068,-0.845,600.4,0.1,"AdaBoostClassifier(algorithm='SAMME', + base_estimator=DecisionTreeClassifier(max_depth=10), + learning_rate=1.1377640450285444, n_estimators=352, + random_state=1)" +41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.975,-0.816,-0.865,0.271,-0.824,595.1,0.2,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=9.674948183980905e-09, + learning_rate=0.014247987845444413, loss='auto', + max_iter=512, max_leaf_nodes=55, + 
min_samples_leaf=164, n_iter_no_change=1, + random_state=1, + validation_fraction=0.11770489601182355, + warm_start=True)" +41166,volkert,Auto-sklearn,-0.874,-0.586,-0.644,1.829,-0.587,595.8,0.3,"LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr', + tol=0.018821286956948503)" +54,vehicle,Auto-sklearn,-0.964,-0.86,-0.859,0.408,-0.861,595.5,0.0,"MLPClassifier(activation='tanh', alpha=0.0002060405669905105, beta_1=0.999, + beta_2=0.9, hidden_layer_sizes=(87, 87, 87), + learning_rate_init=0.00040205833939989724, max_iter=256, + n_iter_no_change=32, random_state=1, validation_fraction=0.0, + verbose=0, warm_start=True)" +40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,296.1,1.2,"KNeighborsClassifier(n_neighbors=4, weights='distance')" +40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,595.5,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')" +42344,sf-police-incidents,Auto-sklearn,-0.574,-0.589,-0.574,15.367,-0.569,594.8,0.5,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=3.609412172481434e-10, + learning_rate=0.05972079854295879, loss='auto', + max_iter=512, max_leaf_nodes=4, + min_samples_leaf=2, n_iter_no_change=14, + random_state=1, validation_fraction=None, + warm_start=True)" +1240,airlinescodrnaadult,Auto-sklearn,-0.62,-0.683,-0.631,13.306,-0.658,594.3,0.1,"SGDClassifier(alpha=1.6992296128865824e-07, average=True, eta0=0.01, loss='log', + max_iter=512, penalty='l1', random_state=1, + tol=1.535384699341134e-05, warm_start=True)" \ No newline at end of file diff --git a/experiments/auto-sklearn/openml_suite.py b/experiments/auto-sklearn/openml_suite.py deleted file mode 100644 index 4a655653..00000000 --- a/experiments/auto-sklearn/openml_suite.py +++ /dev/null @@ -1,150 +0,0 @@ -import csv -import os -import pickle -import re -import time - -import numpy as np -import json - -import autosklearn.classification -import autosklearn.ensembles -from 
autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing -from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice -from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice -from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice - -# from experiments.fedot_warm_start.run import fetch_openml_data, mock_data_fetching -from sklearn import model_selection, metrics -from sklearn import ensemble -from sklearn.base import ClassifierMixin - - -# class AutoSklearnEncoder(json.JSONEncoder): -# def default(self, o): -# # if isinstance(o, dict): -# # return json.dumps(o) -# if isinstance(o, ClassifierChoice): -# return repr(o.choice.estimator) -# # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)): -# # return None -# elif isinstance(o, ClassifierMixin): -# return re.sub(r'\s{2,}', ' ', repr(o)) -# # elif isinstance(o, Balancing): -# # return repr(o) -# elif isinstance(o, np.integer): -# return int(o) -# elif isinstance(o, np.floating): -# return float(o) - - -class AutoSklearnBaseline: - - def __init__(self): - pass - - @staticmethod - def main(): - openml_data = None - # dataset_names = [dataset.name for dataset in openml_data] - - # train_data_names, test_data_names = model_selection.train_test_split( - # [dataset.name for dataset in openml_data], - # test_size=0.2, - # random_state=42 - # ) - # train_ds_names, test_ds_names = ds_names - - # ds_ids, datasets = ds_with_ids - - # for ds_name in train_ds_names: - - for iteration, dataset in enumerate(openml_data): - print(f"Fetched data name: {dataset.name}") - dataset = dataset.from_cache() - - # estimator = autosklearn.classification.AutoSklearnClassifier( - # ensemble_class=autosklearn.ensembles.SingleBest, - # time_left_for_this_task=600 - # ) - estimator = ensemble.HistGradientBoostingClassifier() - - X_train, X_test, y_train, y_test = 
model_selection.train_test_split( - dataset.x, - dataset.y, - test_size=0.2, - random_state=42 - ) - - fitting_start_time = time.time() - pipeline = estimator.fit(X_train, y_train) - fitting_end_time = time.time() - fitting_start_time - # print(f"Fitting time is {fitting_end_time}sec") - - inference_start_time = time.time() - predictions = estimator.predict(X_test) - inference_end_time = time.time() - inference_start_time - - prediction_probabilities = estimator.predict_proba(X_test) - - is_multi_classification_problem = True if len(set(predictions)) > 2 else False - # print(f"Inference time is {inference_end_time}sec") - # roc_auc_score = metrics.roc_auc_score(y_test, predictions) - - # autosklearn_ensemble = pipeline.show_models() - # formatted_ensemble = { - # model_id: { - # 'rank': autosklearn_ensemble[model_id].get('rank'), - # 'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"), - # 'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'), - # 'model': autosklearn_ensemble[model_id].get('sklearn_classifier') - # } for model_id in autosklearn_ensemble.keys() - # } - - # best_single_model = list(pipeline.show_models().values())[0].get('sklearn_classifier') - best_single_model = repr(pipeline) - # encoded_ensemble = str(formatted_ensemble).encode('base64') - - # print(f"y_test is {predictions}") - - general_run_info = { - # 'id': iteration + 1, - 'dataset_id': dataset.id, - 'dataset_name': dataset.name, - 'run_label': 'Hist gradient boosting classifier' - } - average = 'macro' if is_multi_classification_problem else 'binary' - model_dependent_run_info = { - 'roc_auc': -1 * float(f"{metrics.roc_auc_score(y_test, prediction_probabilities if is_multi_classification_problem else predictions, multi_class='ovr'):.3f}"), - 'f1': -1 * float(f"{metrics.f1_score(y_test, predictions, average=average):.3f}"), - 'accuracy': -1 * float(f"{metrics.accuracy_score(y_test, predictions):.3f}"), - 'logloss': float(f"{metrics.log_loss(y_test, 
prediction_probabilities if is_multi_classification_problem else predictions):.3f}"), - 'precision': -1 * float(f"{metrics.precision_score(y_test, predictions, average=average):.3f}"), - 'fit_time': float(f'{fitting_end_time:.1f}'), - 'inference_time': float(f'{inference_end_time:.1f}'), - # 'model_str': re.sub(r'\s{2,}', ' ', repr(best_single_model)) - 'model_str': None - } - results = {**general_run_info, **model_dependent_run_info} - - # for key in autosklearn_ensemble.keys(): - # ensemble_model = autosklearn_ensemble[key] - # formatted_ensemble = results['ensemble'] - # for model_id in formatted_ensemble.keys(): - # formatted_ensemble[model_id] = ensemble_model.get("rank", None) - - # pickle.dump(pipeline.show_models(), open("results.pickle", "wb")) - - # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier"))) - - # knowledge_base_path = os.path.dirname('knowledge_base_0') - - with open('experimental_data.csv', 'a', newline='') as file: - writer = csv.writer(file, delimiter=',') - # if iteration == 0: - # writer.writerow(results.keys()) - writer.writerow(results.values()) - -if __name__ == '__main__': - AutoSklearnBaseline.main() - From 5c106587dc56555272654e9561c2906b3bc9de88 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 6 Jul 2023 18:07:27 +0300 Subject: [PATCH 49/60] fix inner components --- .../datasets_loaders/openml_datasets_loader.py | 12 ++++++------ .../model_advisors/diverse_fedot_pipeline_advisor.py | 4 ++-- .../meta_algorithm/model_advisors/model_advisor.py | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index 11294c45..f7fbfb80 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -1,6 +1,6 @@ from __future__ 
import annotations -from typing import List, Union, Optional +from typing import List, Union, Optional, Sequence from golem.core.log import default_log @@ -10,18 +10,17 @@ class OpenMLDatasetsLoader(DatasetsLoader): def __init__(self, allow_names: bool = False): - self.dataset_ids = [] + self.dataset_ids = set() self._allow_names = allow_names - def load(self, dataset_ids: List[Union[OpenMLDatasetIDType, str]], + def load(self, dataset_ids: Sequence[Union[OpenMLDatasetIDType, str]], allow_names: Optional[bool] = None) -> List[OpenMLDataset]: - self.dataset_ids += dataset_ids allow_names = self._allow_names if allow_names is None else allow_names datasets = [] # TODO: Optimize like this # https://github.com/openml/automlbenchmark/commit/a09dc8aee96178dd14837d9e1cd519d1ec63f804 - for dataset_id in self.dataset_ids: + for dataset_id in dataset_ids: dataset = self.load_single(dataset_id, allow_name=allow_names) datasets.append(dataset) return datasets @@ -35,7 +34,8 @@ def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str], else: dataset = OpenMLDataset(dataset_id) - self.dataset_ids.append(dataset.id_) + self.dataset_ids.add(dataset.id_) + return dataset @property diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py index 6f7e4a66..21879365 100644 --- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py @@ -12,8 +12,8 @@ class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor, n_best_to_advise: Optional[int] = None, - minimal_distance: int = 1, - distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between): + minimal_distance: float = 1, + distance_func: Callable[[Pipeline, Pipeline], float] = get_distance_between): 
super().__init__(fitted_similarity_assessor) self.minimal_distance = minimal_distance self.n_best_to_advise = n_best_to_advise diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py index c653a173..163dbe2f 100644 --- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py +++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import List, Dict, Iterable +from typing import List, Dict, Iterable, Sequence import pandas as pd @@ -18,13 +18,13 @@ def predict(self, *args, **kwargs) -> List[List[Model]]: class SimpleSimilarityModelAdvisor(ModelAdvisor): def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor): self.similarity_assessor = fitted_similarity_assessor - self.best_models: Dict[DatasetIDType, List[Model]] = {} + self.best_models: Dict[DatasetIDType, Sequence[Model]] = {} @property def datasets(self): return self.similarity_assessor.datasets - def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, List[Model]]): + def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, Sequence[Model]]): self.best_models.update(dataset_names_to_best_pipelines) return self @@ -35,8 +35,8 @@ def predict(self, meta_features: pd.DataFrame) -> List[List[Model]]: advised_pipelines.append(self._predict_single(similar_datasets)) return advised_pipelines - def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]: + def _predict_single(self, similar_dataset_ids: Iterable[DatasetIDType]) -> List[Model]: dataset_pipelines = [] - for dataset_name in similar_dataset_names: - dataset_pipelines += self.best_models.get(dataset_name) + for dataset_id in similar_dataset_ids: + dataset_pipelines += list(self.best_models.get(dataset_id)) return dataset_pipelines From e2c1b890c3de9c6425500f5ec48c53399c9a86ef Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 6 Jul 2023 18:10:17 +0300 
Subject: [PATCH 50/60] separate framework cache from other data --- .dockerignore | 2 +- .gitignore | 2 +- .../data_preparation/file_system/__init__.py | 2 +- .../data_preparation/file_system/cache.py | 12 +++-- .../knowledge_base_models_loader.py | 2 +- .../{ => cache}/metafeatures/pymfe/334.pkl | Bin .../{ => cache}/metafeatures/pymfe/40981.pkl | Bin .../org/openml/www/datasets/333/dataset.arff | 0 .../www/datasets/333/dataset_333.pkl.py3 | Bin .../openml/www/datasets/333/dataset_333.pq | Bin .../openml/www/datasets/333/description.xml | 0 .../org/openml/www/datasets/333/features.xml | 0 .../openml/www/datasets/333/features.xml.pkl | Bin .../openml/www/datasets/40981/dataset.arff | 0 .../www/datasets/40981/dataset_40981.pkl.py3 | Bin .../www/datasets/40981/dataset_40981.pq | Bin .../openml/www/datasets/40981/description.xml | 0 .../openml/www/datasets/40981/features.xml | 0 .../www/datasets/40981/features.xml.pkl | Bin test/unit/datasets/test_datasets_loaders.py | 2 +- test/unit/datasets/test_file_dataset.py | 48 ------------------ 21 files changed, 13 insertions(+), 57 deletions(-) rename test/data/{ => cache}/metafeatures/pymfe/334.pkl (100%) rename test/data/{ => cache}/metafeatures/pymfe/40981.pkl (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset.arff (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset_333.pq (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/description.xml (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/features.xml (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/features.xml.pkl (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/dataset.arff (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 (100%) rename test/data/{ 
=> cache}/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/description.xml (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/features.xml (100%) rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/features.xml.pkl (100%) delete mode 100644 test/unit/datasets/test_file_dataset.py diff --git a/.dockerignore b/.dockerignore index 2bfa6863..66731471 100644 --- a/.dockerignore +++ b/.dockerignore @@ -10,4 +10,4 @@ notebooks test # User data -data +data/cache diff --git a/.gitignore b/.gitignore index a5f9134a..bf5dbd4b 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,4 @@ dmypy.json .pyre/ # User data -/data +/data/cache diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py index a228da6e..c9f8393a 100644 --- a/meta_automl/data_preparation/file_system/__init__.py +++ b/meta_automl/data_preparation/file_system/__init__.py @@ -1,5 +1,5 @@ from meta_automl.data_preparation.file_system.file_system import PathType, get_project_root, get_data_dir -from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_dataset_cache_path, +from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_cache_dir, get_dataset_cache_path, get_dataset_cache_path_by_id, get_meta_features_cache_path, get_local_meta_features, update_local_meta_features, get_openml_cache_dir, update_openml_cache_dir) diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py index 99daf965..04a904b7 100644 --- a/meta_automl/data_preparation/file_system/cache.py +++ b/meta_automl/data_preparation/file_system/cache.py @@ -19,12 +19,16 @@ class CacheOperator: pass +def get_cache_dir() -> Path: + return ensure_dir_exists(get_data_dir().joinpath('cache')) + + def get_openml_cache_dir() -> Path: - return 
get_data_dir().joinpath('openml_cache') + return get_cache_dir().joinpath('openml_cache') def get_full_openml_cache_dir() -> Path: - return get_data_dir().joinpath('openml_cache/org/openml/www') + return get_cache_dir().joinpath('openml_cache/org/openml/www') def update_openml_cache_dir(): @@ -82,11 +86,11 @@ def get_cache_properties(class_name: str) -> CacheProperties: template='{id_}'), 'CustomDataset': CacheProperties( type_=CacheType.file, - dir_=get_data_dir().joinpath('datasets/custom_dataset'), + dir_=get_cache_dir().joinpath('datasets/custom_dataset'), template='{id_}.pkl'), 'PymfeExtractor': CacheProperties( type_=CacheType.file, - dir_=get_data_dir().joinpath('metafeatures/pymfe'), + dir_=get_cache_dir().joinpath('metafeatures/pymfe'), template='{id_}.pkl'), } try: diff --git a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py index 7c38b9d8..df8a0f70 100644 --- a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py +++ b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py @@ -13,7 +13,7 @@ from meta_automl.data_preparation.model import Model from meta_automl.data_preparation.models_loaders import ModelsLoader -DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir().joinpath('knowledge_base_0') +DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir() / 'knowledge_base_0' class KnowledgeBaseModelsLoader(ModelsLoader): diff --git a/test/data/metafeatures/pymfe/334.pkl b/test/data/cache/metafeatures/pymfe/334.pkl similarity index 100% rename from test/data/metafeatures/pymfe/334.pkl rename to test/data/cache/metafeatures/pymfe/334.pkl diff --git a/test/data/metafeatures/pymfe/40981.pkl b/test/data/cache/metafeatures/pymfe/40981.pkl similarity index 100% rename from test/data/metafeatures/pymfe/40981.pkl rename to test/data/cache/metafeatures/pymfe/40981.pkl diff --git 
a/test/data/openml_cache/org/openml/www/datasets/333/dataset.arff b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset.arff similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/dataset.arff rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset.arff diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pq similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pq diff --git a/test/data/openml_cache/org/openml/www/datasets/333/description.xml b/test/data/cache/openml_cache/org/openml/www/datasets/333/description.xml similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/description.xml rename to test/data/cache/openml_cache/org/openml/www/datasets/333/description.xml diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml b/test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/features.xml rename to test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl b/test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml.pkl similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl rename to 
test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml.pkl diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset.arff similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset.arff diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/description.xml b/test/data/cache/openml_cache/org/openml/www/datasets/40981/description.xml similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/40981/description.xml rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/description.xml diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml b/test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml similarity index 100% rename from test/data/openml_cache/org/openml/www/datasets/40981/features.xml rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl b/test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml.pkl similarity index 100% 
rename from test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml.pkl diff --git a/test/unit/datasets/test_datasets_loaders.py b/test/unit/datasets/test_datasets_loaders.py index 0fd1ce17..f49e1989 100644 --- a/test/unit/datasets/test_datasets_loaders.py +++ b/test/unit/datasets/test_datasets_loaders.py @@ -5,7 +5,7 @@ def test_group_load_new_datasets(openml_dataset_ids): loader = OpenMLDatasetsLoader() datasets = loader.load(openml_dataset_ids) - assert loader.dataset_ids == openml_dataset_ids + assert loader.dataset_ids == set(openml_dataset_ids) for dataset_id, dataset in zip(openml_dataset_ids, datasets): check_dataset_cache(dataset) diff --git a/test/unit/datasets/test_file_dataset.py b/test/unit/datasets/test_file_dataset.py deleted file mode 100644 index 125cb641..00000000 --- a/test/unit/datasets/test_file_dataset.py +++ /dev/null @@ -1,48 +0,0 @@ -import numpy as np -import pytest - -from meta_automl.data_preparation.dataset import CacheNotFoundError, FileDataset, DatasetData -from test.unit.datasets.general_checks import assert_cache_file_exists - - -@pytest.fixture(scope='module') -def new_dataset_data(): - dataset_data = DatasetData( - x=np.array([['a', 'b'], ['b', 'a']]), - y=np.array([5, 10]), - categorical_indicator=[True, True], - attribute_names=['foo', 'bar'] - ) - return dataset_data - - -@pytest.fixture(scope='module') -def new_dataset(new_dataset_data): - dataset = FileDataset(42) - dataset.dump_data(new_dataset_data) - yield dataset - dataset.cache_path.unlink() - - -def test_error_on_missing_dataset_cache(): - with pytest.raises(CacheNotFoundError): - FileDataset('random_missing_dataset').get_data() - - -def test_file_dataset_dumping(new_dataset): - # Act - cache_path = new_dataset.cache_path - # Assert - assert_cache_file_exists(cache_path) - - -def test_file_dataset_data_loading(new_dataset_data, new_dataset): - # Act - correct_data = 
new_dataset_data - dataset = new_dataset - data = dataset.get_data() - # Assert - assert np.all(np.equal(data.x, correct_data.x)) - assert np.all(np.equal(data.y, correct_data.y)) - assert data.categorical_indicator == correct_data.categorical_indicator - assert data.attribute_names == correct_data.attribute_names From 20fb4391438e7bd1366a4456d645c654696c2eb7 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 6 Jul 2023 18:11:42 +0300 Subject: [PATCH 51/60] use yaml config for the experiment --- experiments/fedot_warm_start/config.yaml | 26 ++++++++++++++ experiments/fedot_warm_start/run.py | 44 +++++++++++++++--------- 2 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 experiments/fedot_warm_start/config.yaml diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml new file mode 100644 index 00000000..bcab1083 --- /dev/null +++ b/experiments/fedot_warm_start/config.yaml @@ -0,0 +1,26 @@ +--- +seed: 42 +#data_settings: +n_datasets: null # null for all available datasets +test_size: 0.25 +train_timeout: 15 +test_timeout: 15 +#meta_learning_params: +n_best_dataset_models_to_memorize: 10 +n_closest_datasets_to_propose: 5 +minimal_distance_between_advised_models: 1 +n_best_models_to_advise: 5 +mf_extractor_params: + groups: general +#evaluation_params: +collect_metrics: + - f1 + - roc_auc + - accuracy + - neg_log_loss + - precision +common_fedot_params: + problem: classification + n_jobs: -1 + show_progress: false +baseline_model: 'xgboost' diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index c0461f30..9920d4d5 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -2,6 +2,10 @@ import json import logging import timeit +from pathlib import Path + +import yaml + from datetime import datetime from itertools import chain from typing import Dict, List, Tuple, Sequence @@ -31,24 +35,32 @@ from 
meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor -# Meta-alg hyperparameters -SEED = 42 -# Datasets sampling -N_DATASETS = 3 -TEST_SIZE = 0.33 -# Evaluation timeouts -TRAIN_TIMEOUT = 0.01 -TEST_TIMEOUT = 0.01 -# Models & datasets -N_BEST_DATASET_MODELS_TO_MEMORIZE = 10 -N_CLOSEST_DATASETS_TO_PROPOSE = 5 -MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1 -N_BEST_MODELS_TO_ADVISE = 5 -# Meta-features -MF_EXTRACTOR_PARAMS = {'groups': 'general'} -COLLECT_METRICS = ['f1', 'roc_auc', 'accuracy', 'neg_log_loss', 'precision'] + +CONFIG_PATH = 'config.yaml' + + +with open(CONFIG_PATH, 'r') as config_file: + config = yaml.load(config_file, yaml.Loader) + +# Load constants +SEED = config['seed'] +N_DATASETS = config['n_datasets'] +TEST_SIZE = config['test_size'] +TRAIN_TIMEOUT = config['train_timeout'] +TEST_TIMEOUT = config['test_timeout'] +N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize'] +N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose'] +MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models'] +N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise'] +MF_EXTRACTOR_PARAMS = config['mf_extractor_params'] +COLLECT_METRICS = config['collect_metrics'] +COMMON_FEDOT_PARAMS = config['common_fedot_params'] +BASELINE_MODEL = config['baseline_model'] + +# Postprocess constants COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS)) COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' +COMMON_FEDOT_PARAMS['seed'] = SEED COMMON_FEDOT_PARAMS = dict( problem='classification', From d4d50ce8e4a12eaff673120cb30f31674c3a0b13 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Thu, 6 Jul 2023 18:11:57 +0300 Subject: [PATCH 52/60] refactor run.py --- experiments/fedot_warm_start/run.py | 187 +++++++++++++++------------- 1 file changed, 97 
insertions(+), 90 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 9920d4d5..20b3eee4 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -8,7 +8,7 @@ from datetime import datetime from itertools import chain -from typing import Dict, List, Tuple, Sequence +from typing import Dict, List, Tuple, Sequence, Any import numpy as np import openml @@ -23,6 +23,7 @@ from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository from fedot.core.validation.split import tabular_cv_generator from golem.core.log import Log +from golem.core.optimisers.fitness import SingleObjFitness from sklearn.model_selection import StratifiedKFold from tqdm import tqdm @@ -30,6 +31,7 @@ from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split +from meta_automl.data_preparation.file_system import get_cache_dir from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor from meta_automl.data_preparation.model import Model from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor @@ -62,32 +64,34 @@ COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss' COMMON_FEDOT_PARAMS['seed'] = SEED -COMMON_FEDOT_PARAMS = dict( - problem='classification', - n_jobs=-1, - seed=SEED, - show_progress=False, -) - -# Setup logging -time_now = datetime.now() -time_now_iso = time_now.isoformat(timespec="minutes") -time_now_for_path = time_now_iso.replace(":", ".") -save_dir = get_data_dir(). 
\ - joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') -save_dir.mkdir(parents=True) -log_file = save_dir.joinpath('log.txt') -Log(log_file=log_file) -logging.basicConfig( - filename=log_file, - filemode='a', - format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', - datefmt='%H:%M:%S', - force=True, -) - - -def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]: + +def setup_logging(save_dir): + log_file = save_dir.joinpath('log.txt') + Log(log_file=log_file) + logging.basicConfig( + filename=log_file, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', + datefmt='%H:%M:%S', + force=True, + ) + + +def get_formatted_time() -> (datetime, str, str): + time_now = datetime.now() + time_now_iso = time_now.isoformat(timespec="minutes") + time_now_for_path = time_now_iso.replace(":", ".") + return time_now, time_now_iso, time_now_for_path + + +def get_save_dir(time_now_for_path) -> Path: + save_dir = get_cache_dir(). 
\ + joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}') + save_dir.mkdir(parents=True) + return save_dir + + +def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]: """Returns dictionary with dataset names and cached datasets downloaded from OpenML.""" dataset_ids = openml.study.get_suite(99).data @@ -103,18 +107,10 @@ def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset] return df_datasets_train, df_datasets_test, datasets -def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): - x = data.x - y = data.y - if len(y.shape) == 1: - y = y.reshape(-1, 1) - return x, y - - -def get_pipeline_metrics(pipeline: Pipeline, - input_data: InputData, - metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, - metric_names: Sequence[str] = COLLECT_METRICS) -> dict: +def evaluate_pipeline(pipeline: Pipeline, + input_data: InputData, + metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM, + metric_names: Sequence[str] = COLLECT_METRICS) -> Dict[str, float]: """Gets quality metrics for the fitted pipeline. 
The function is based on `Fedot.get_metrics()` @@ -134,14 +130,30 @@ def get_pipeline_metrics(pipeline: Pipeline, return metric_values -def prepare_extractor_and_assessor(datasets_train: List[str]): +def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \ + -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor): + dataset_ids = list(best_models_per_dataset_id.keys()) + # Meta Features extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS) - meta_features_train = extractor.extract(datasets_train, fill_input_nans=True) + meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True) meta_features_train = meta_features_train.fillna(0) + # Datasets similarity data_similarity_assessor = KNeighborsBasedSimilarityAssessor( - n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE)) - data_similarity_assessor.fit(meta_features_train, datasets_train) - return data_similarity_assessor, extractor + n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE)) + data_similarity_assessor.fit(meta_features_train, dataset_ids) + # Model advisor + model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, + minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) + model_advisor.fit(best_models_per_dataset_id) + return extractor, model_advisor + + +def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): + x = data.x + y = data.y + if len(y.shape) == 1: + y = y.reshape(-1, 1) + return x, y def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None): @@ -152,7 +164,7 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as fedot.fit(x, y) automl_time = timeit.default_timer() - time_start - metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data) + metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data) pipeline = 
fedot.current_pipeline run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) @@ -160,7 +172,7 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., - automl_timeout_min=0., **metrics): + automl_timeout_min=0., **metrics) -> Dict[str, Any]: run_results = dict(dataset_id=dataset.id_, dataset_name=dataset.name, run_label=run_label, @@ -174,23 +186,32 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor return run_results -def extract_best_history_models(dataset, history): - best_individuals = sorted(chain(*history.individuals), - key=lambda ind: ind.fitness, - reverse=True) - best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) - best_models = [] - for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: - pipeline = PipelineAdapter().restore(individual.graph) - model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset) - best_models.append(model) +def extract_best_models_from_history(dataset, history) -> List[Model]: + if history.individuals: + best_individuals = sorted(chain(*history.individuals), + key=lambda ind: ind.fitness, + reverse=True) + best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values()) + best_models = [] + for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]: + pipeline = PipelineAdapter().restore(individual.graph) + model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset) + best_models.append(model) + else: + pipeline = PipelineAdapter().restore(history.tuning_result) + best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)] + 
return best_models def main(): - baseline_pipeline = PipelineBuilder().add_node('rf').build() + time_now, time_now_iso, time_now_for_path = get_formatted_time() + save_dir = get_save_dir(time_now_for_path) + setup_logging(save_dir) + + baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() - df_datasets_train, df_datasets_test, datasets = prepare_data() + df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets() dataset_ids_train = df_datasets_train.index.to_list() dataset_ids_test = df_datasets_test.index.to_list() @@ -198,9 +219,8 @@ def main(): evaluation_results = [] best_models_per_dataset = {} progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for dataset_id in tqdm(datasets.keys(), 'FEDOT, all datasets', file=progress_file): + for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): try: - dataset = datasets[dataset_id] timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') evaluation_results.append(run_results) @@ -211,24 +231,20 @@ def main(): # Filter out unique individuals with the best fitness history = fedot.history - best_models = extract_best_history_models(dataset, history) + best_models = extract_best_models_from_history(dataset, history) best_models_per_dataset[dataset_id] = best_models except Exception: logging.exception(f'Train dataset "{dataset_id}"') - data_similarity_assessor, extractor = prepare_extractor_and_assessor(dataset_ids_train) - model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE, - minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS) - model_advisor.fit(best_models_per_dataset) + mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset) - for dataset_id in tqdm(dataset_ids_test, 'MetaFEDOT, Test datasets', file=progress_file): + datasets_dict_test = 
dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items())) + for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): try: - dataset = datasets[dataset_id] - # Run meta AutoML # 1 time_start = timeit.default_timer() - meta_features = extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = mf_extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True) meta_features = meta_features.fillna(0) meta_learning_time_sec = timeit.default_timer() - time_start initial_assumptions = model_advisor.predict(meta_features)[0] @@ -240,7 +256,7 @@ def main(): evaluation_results.append(fedot_meta_results) # Fit & evaluate simple baseline - baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data) + baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data) baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline, **baseline_metrics) evaluation_results.append(baseline_res) @@ -248,7 +264,7 @@ def main(): # Fit & evaluate initial assumptions for i, assumption in enumerate(initial_assumptions): pipeline = assumption.predictor - assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data) + assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data) assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}', pipeline=pipeline, **assumption_metrics) evaluation_results.append(assumption_res) @@ -281,25 +297,16 @@ def main(): pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) # save experiment hyperparameters - params = { - 'run_date': time_now_iso, - 'seed': SEED, - 'n_datasets': N_DATASETS or len(datasets), - 'test_size': TEST_SIZE, - 'dataset_ids': list(datasets.keys()), - 'dataset_ids_train': dataset_ids_train, - 'dataset_ids_test': 
dataset_ids_test, - 'dataset_names_train': df_datasets_train['dataset_name'].to_list(), - 'dataset_names_test': df_datasets_test['dataset_name'].to_list(), - 'train_timeout': TRAIN_TIMEOUT, - 'test_timeout': TEST_TIMEOUT, - 'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE, - 'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE, - 'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS, - 'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE, - 'common_fedot_params': COMMON_FEDOT_PARAMS, - 'baseline_pipeline': baseline_pipeline.descriptive_id, - } + params = dict( + run_date=time_now_iso, + input_config=config, + dataset_ids=list(datasets_dict.keys()), + dataset_ids_train=dataset_ids_train, + dataset_names_train=df_datasets_train['dataset_name'].to_list(), + dataset_ids_test=dataset_ids_test, + dataset_names_test=df_datasets_test['dataset_name'].to_list(), + baseline_pipeline=baseline_pipeline.descriptive_id, + ) with open(save_dir.joinpath('parameters.json'), 'w') as params_file: json.dump(params, params_file, indent=2) From e581c9e564a729972efd6b22ffa0d06155db15e0 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Fri, 7 Jul 2023 18:16:36 +0300 Subject: [PATCH 53/60] update requirements --- .../data_preparation/dataset/openml_dataset.py | 2 ++ .../data_preparation/file_system/__init__.py | 2 +- .../data_preparation/file_system/cache.py | 12 ++++-------- requirements.txt | Bin 460 -> 310 bytes 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py index 08fc5c1d..72fbb1f8 100644 --- a/meta_automl/data_preparation/dataset/openml_dataset.py +++ b/meta_automl/data_preparation/dataset/openml_dataset.py @@ -20,6 +20,7 @@ def __init__(self, id_: OpenMLDatasetIDType): raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id.' 
f'Otherwise, you can perform search by f{self.__class__.__name__}.from_search().') self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + download_features_meta_data=False, error_if_multiple=True) id_ = self._openml_dataset.id name = self._openml_dataset.name @@ -28,6 +29,7 @@ def __init__(self, id_: OpenMLDatasetIDType): @classmethod def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset: openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False, + download_features_meta_data=False, **get_dataset_kwargs) return cls(openml_dataset.id) diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py index c9f8393a..1d52c516 100644 --- a/meta_automl/data_preparation/file_system/__init__.py +++ b/meta_automl/data_preparation/file_system/__init__.py @@ -2,4 +2,4 @@ from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_cache_dir, get_dataset_cache_path, get_dataset_cache_path_by_id, get_meta_features_cache_path, get_local_meta_features, update_local_meta_features, - get_openml_cache_dir, update_openml_cache_dir) + update_openml_cache_dir) diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py index 04a904b7..0b021abe 100644 --- a/meta_automl/data_preparation/file_system/cache.py +++ b/meta_automl/data_preparation/file_system/cache.py @@ -24,16 +24,12 @@ def get_cache_dir() -> Path: def get_openml_cache_dir() -> Path: - return get_cache_dir().joinpath('openml_cache') - - -def get_full_openml_cache_dir() -> Path: - return get_cache_dir().joinpath('openml_cache/org/openml/www') + return Path(openml.config.get_cache_directory()) def update_openml_cache_dir(): - openml_cache_path = str(get_openml_cache_dir()) - openml.config.set_cache_directory(openml_cache_path) + openml_cache_path = 
get_cache_dir().joinpath('openml_cache') + openml.config.set_root_cache_directory(str(openml_cache_path)) def _get_cache_path(object_class: Type[CacheOperator], object_id: str, _create_parent_dir: bool = True) -> Path: @@ -82,7 +78,7 @@ def get_cache_properties(class_name: str) -> CacheProperties: cache_properties_by_class_name = { 'OpenMLDataset': CacheProperties( type_=CacheType.directory, - dir_=get_full_openml_cache_dir().joinpath('datasets'), + dir_=get_openml_cache_dir().joinpath('datasets'), template='{id_}'), 'CustomDataset': CacheProperties( type_=CacheType.file, diff --git a/requirements.txt b/requirements.txt index ad0a22332f176f2c866188116575624428ac1536..2337a1746f2da881a344c83f36cd8b55cba6ffff 100644 GIT binary patch delta 76 zcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MV#A5+Wf@H-o|Og?FBKV$Cv!8JvYG%T V3?{oWs)gr{(6EVL8~BqT1-ts7qeWcUdTMHv=-dG))uHfScDcJ6o2oHNt! zudVh8y)p>NDE0PxV)g|4L)zRyHz35pBz3NR*nrH|gYN`?MoO!LW z3YFTRC?J)Sg6afbGAa#t)~qUYfoDw56tdDi>n&rBKG0K>uctem@Qx&HAaj(Nd)REW zh6H*d8I2F%Io=sx#?NFsvD=c98ZVsOXoJ%HI&JCux5|bD0bO!hIq7^aoN=#%-{vFU r0f$eVZW;e~{>`8NrZktN57I@~nYZ0XmoD*_>`rRmzwNQ}wP*MNE&M|H From 2f8b409881fe70d705727a53b86c94cd7d6d285a Mon Sep 17 00:00:00 2001 From: max Date: Sat, 8 Jul 2023 22:39:11 +0300 Subject: [PATCH 54/60] Removing IDE configuration files. 
--- .gitignore | 2 ++ .idea/.gitignore | 8 -------- .idea/inspectionProfiles/Project_Default.xml | 13 ------------- .idea/libraries/py4j0_10_9_7.xml | 9 --------- .idea/libraries/ziptestdata.xml | 9 --------- .idea/libraries/ziptestdata1.xml | 9 --------- .idea/libraries/ziptestdata2.xml | 9 --------- .idea/libraries/ziptestdata3.xml | 9 --------- .idea/meta-automl-research.iml | 9 --------- .idea/misc.xml | 6 ------ .idea/modules.xml | 8 -------- .idea/runConfigurations.xml | 10 ---------- .idea/vcs.xml | 6 ------ 13 files changed, 2 insertions(+), 105 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/libraries/py4j0_10_9_7.xml delete mode 100644 .idea/libraries/ziptestdata.xml delete mode 100644 .idea/libraries/ziptestdata1.xml delete mode 100644 .idea/libraries/ziptestdata2.xml delete mode 100644 .idea/libraries/ziptestdata3.xml delete mode 100644 .idea/meta-automl-research.iml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/runConfigurations.xml delete mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index 9e584fd4..44149102 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.idea + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b81..00000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 0616d54f..00000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/libraries/py4j0_10_9_7.xml 
b/.idea/libraries/py4j0_10_9_7.xml deleted file mode 100644 index f6a7627a..00000000 --- a/.idea/libraries/py4j0_10_9_7.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/ziptestdata.xml b/.idea/libraries/ziptestdata.xml deleted file mode 100644 index 7f8b1b21..00000000 --- a/.idea/libraries/ziptestdata.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/ziptestdata1.xml b/.idea/libraries/ziptestdata1.xml deleted file mode 100644 index 054994be..00000000 --- a/.idea/libraries/ziptestdata1.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/ziptestdata2.xml b/.idea/libraries/ziptestdata2.xml deleted file mode 100644 index 7b383000..00000000 --- a/.idea/libraries/ziptestdata2.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/ziptestdata3.xml b/.idea/libraries/ziptestdata3.xml deleted file mode 100644 index a0322347..00000000 --- a/.idea/libraries/ziptestdata3.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/meta-automl-research.iml b/.idea/meta-automl-research.iml deleted file mode 100644 index d6ebd480..00000000 --- a/.idea/meta-automl-research.iml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 33b8d9d1..00000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index c8283092..00000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/runConfigurations.xml b/.idea/runConfigurations.xml deleted file mode 100644 index 797acea5..00000000 --- a/.idea/runConfigurations.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - 
- - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1ddf..00000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 67812b757f2ddd4c1a7969f55a6b6abfeaca0bfd Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 16 Jul 2023 17:02:12 +0300 Subject: [PATCH 55/60] make absolute path to config.yaml --- experiments/fedot_warm_start/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 20b3eee4..8f3c6d85 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -38,7 +38,7 @@ from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor -CONFIG_PATH = 'config.yaml' +CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml') with open(CONFIG_PATH, 'r') as config_file: From 4a0b144dd447edeed0affc12466edd90a4ddb68e Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 16 Jul 2023 17:40:55 +0300 Subject: [PATCH 56/60] fix train test split --- experiments/fedot_warm_start/run.py | 2 +- .../data_preparation/datasets_train_test_split.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 8f3c6d85..53a130cd 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -99,7 +99,7 @@ def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDatase dataset_ids = pd.Series(dataset_ids) dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED) - df_split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED) + df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED) df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1] df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0] diff 
--git a/meta_automl/data_preparation/datasets_train_test_split.py b/meta_automl/data_preparation/datasets_train_test_split.py index 101b7ce8..b262a44c 100644 --- a/meta_automl/data_preparation/datasets_train_test_split.py +++ b/meta_automl/data_preparation/datasets_train_test_split.py @@ -2,12 +2,15 @@ import pandas as pd from sklearn.model_selection import train_test_split +from typing import List +from meta_automl.data_preparation.dataset import OpenMLDatasetIDType -def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: int = 42): + +def openml_datasets_train_test_split(dataset_ids: List[OpenMLDatasetIDType], test_size: float, seed=None): df_openml_datasets = openml.datasets.list_datasets(dataset_ids, output_format='dataframe') df_openml_datasets_split_features = df_openml_datasets[ - ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']] + ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']].copy(deep=False) for column in df_openml_datasets_split_features.columns[1:]: if column != 'NumberOfClasses': median = df_openml_datasets_split_features[column].median() @@ -31,7 +34,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: if not df_datasets_to_split.empty: df_train_datasets, df_test_datasets = train_test_split( df_datasets_to_split, - train_size=train_size, + test_size=test_size, shuffle=True, stratify=df_datasets_to_split['category'], random_state=seed @@ -40,7 +43,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: else: df_train_datasets, df_test_datasets = train_test_split( df_split_categories, - train_size=train_size, + test_size=test_size, shuffle=True, random_state=seed ) @@ -56,7 +59,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: def main(): dataset_ids = openml.study.get_suite(99).data - df_split_datasets = openml_datasets_train_test_split(dataset_ids) + df_split_datasets = 
openml_datasets_train_test_split(dataset_ids, test_size=0.3) df_split_datasets.to_csv('train_test_datasets_opencc18.csv') From 44857b0f99756b72d8a5d4f73ab7827d00c9c847 Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 16 Jul 2023 17:41:35 +0300 Subject: [PATCH 57/60] refactor for frequent results saving --- experiments/fedot_warm_start/run.py | 241 ++++++++++-------- .../dataset/custom_dataset.py | 1 - 2 files changed, 141 insertions(+), 101 deletions(-) diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py index 53a130cd..956ecf30 100644 --- a/experiments/fedot_warm_start/run.py +++ b/experiments/fedot_warm_start/run.py @@ -24,11 +24,12 @@ from fedot.core.validation.split import tabular_cv_generator from golem.core.log import Log from golem.core.optimisers.fitness import SingleObjFitness +from golem.core.optimisers.opt_history_objects.opt_history import OptHistory from sklearn.model_selection import StratifiedKFold from tqdm import tqdm -from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData +from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split from meta_automl.data_preparation.file_system import get_cache_dir @@ -65,7 +66,8 @@ COMMON_FEDOT_PARAMS['seed'] = SEED -def setup_logging(save_dir): +def setup_logging(save_dir: Path): + """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. """ log_file = save_dir.joinpath('log.txt') Log(log_file=log_file) logging.basicConfig( @@ -77,7 +79,12 @@ def setup_logging(save_dir): ) -def get_formatted_time() -> (datetime, str, str): +def get_current_formatted_date() -> (datetime, str, str): + """ Returns current date in the following formats: + + 1. datetime + 2. str: ISO + 3. str: ISO compatible with Windows file system path (with "." 
instead of ":") """ time_now = datetime.now() time_now_iso = time_now.isoformat(timespec="minutes") time_now_for_path = time_now_iso.replace(":", ".") @@ -156,7 +163,10 @@ def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array): return x, y -def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None): +def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None) \ + -> (Fedot, Dict[str, Any]): + """ Runs Fedot evaluation on the dataset, the evaluates the final pipeline on the dataset. + Returns Fedot instance & properties of the run along with the evaluated metrics. """ x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array')) time_start = timeit.default_timer() @@ -166,8 +176,9 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data) pipeline = fedot.current_pipeline - run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time, - automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics) + run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, + automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout, + history_obj=fedot.history, **metrics) return fedot, run_results @@ -186,7 +197,7 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor return run_results -def extract_best_models_from_history(dataset, history) -> List[Model]: +def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]: if history.individuals: best_individuals = sorted(chain(*history.individuals), key=lambda ind: ind.fitness, @@ -204,111 +215,141 @@ def extract_best_models_from_history(dataset, history) -> List[Model]: return best_models +def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path): + 
""" Save the hyperparameters of the experiment """ + params_file_path = save_dir.joinpath('parameters.json') + with open(params_file_path, 'w') as params_file: + json.dump(params_dict, params_file, indent=2) + + +def save_evaluation(evaluation_properties: Dict[str, Any], run_date: datetime, experiment_date: datetime, + save_dir: Path): + histories_dir = save_dir.joinpath('histories') + models_dir = save_dir.joinpath('models') + eval_results_path = save_dir.joinpath('evaluation_results.csv') + + histories_dir.mkdir(exist_ok=True) + models_dir.mkdir(exist_ok=True) + + try: + evaluation_properties['experiment_date'] = experiment_date + evaluation_properties['run_date'] = run_date + dataset_id = evaluation_properties['dataset_id'] + run_label = evaluation_properties['run_label'] + # define saving paths + model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') + history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_history.json') + # replace objects with export paths for csv + evaluation_properties['model_path'] = str(model_path) + evaluation_properties.pop('model_obj').save(model_path) + evaluation_properties['history_path'] = str(history_path) + history_obj = evaluation_properties.pop('history_obj') + if history_obj is not None: + history_obj.save(evaluation_properties['history_path']) + + df_evaluation_properties = pd.DataFrame([evaluation_properties]) + + if eval_results_path.exists(): + df_results = pd.read_csv(eval_results_path) + df_results = pd.concat([df_results, df_evaluation_properties]) + else: + df_results = df_evaluation_properties + df_results.to_csv(eval_results_path, index=False) + + except Exception: + logging.exception(f'Saving results "{evaluation_properties}"') + + def main(): - time_now, time_now_iso, time_now_for_path = get_formatted_time() - save_dir = get_save_dir(time_now_for_path) + experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date() + save_dir = get_save_dir(experiment_date_for_path) 
setup_logging(save_dir) - - baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() + progress_file_path = save_dir.joinpath('progress.txt') df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets() + dataset_ids = list(datasets_dict.keys()) dataset_ids_train = df_datasets_train.index.to_list() dataset_ids_test = df_datasets_test.index.to_list() - evaluation_results = [] + dataset_names_train = df_datasets_train['dataset_name'].to_list() + dataset_names_test = df_datasets_test['dataset_name'].to_list() + + datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items())) + + experiment_params_dict = dict( + experiment_start_date_iso=experiment_date_iso, + input_config=config, + dataset_ids=dataset_ids, + dataset_ids_train=dataset_ids_train, + dataset_names_train=dataset_names_train, + dataset_ids_test=dataset_ids_test, + dataset_names_test=dataset_names_test, + baseline_pipeline=BASELINE_MODEL, + ) + save_experiment_params(experiment_params_dict, save_dir) + best_models_per_dataset = {} - progress_file = open(save_dir.joinpath('progress.txt'), 'a') - for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): - try: - timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT - fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') - evaluation_results.append(run_results) - # TODO: - # x Turn the tuned pipeline into a model (evaluate its fitness on the data) - # x Evaluate historical pipelines on the data instead of using fitness - # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run - - # Filter out unique individuals with the best fitness - history = fedot.history - best_models = extract_best_models_from_history(dataset, history) - best_models_per_dataset[dataset_id] = best_models - except Exception: - logging.exception(f'Train dataset "{dataset_id}"') + with open(progress_file_path, 'a') as 
progress_file: + for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file): + try: + timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT + run_date = datetime.now() + fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT') + save_evaluation(run_results, run_date, experiment_date, save_dir) + # TODO: + # x Turn the tuned pipeline into a model (evaluate its fitness on the data) + # x Evaluate historical pipelines on the data instead of using fitness + # x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run + + # Filter out unique individuals with the best fitness + history = fedot.history + best_models = extract_best_models_from_history(dataset, history) + best_models_per_dataset[dataset_id] = best_models + except Exception: + logging.exception(f'Train dataset "{dataset_id}"') mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset) - datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items())) - for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): - try: - # Run meta AutoML - # 1 - time_start = timeit.default_timer() - meta_features = mf_extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True) - meta_features = meta_features.fillna(0) - meta_learning_time_sec = timeit.default_timer() - time_start - initial_assumptions = model_advisor.predict(meta_features)[0] - assumption_pipelines = [model.predictor for model in initial_assumptions] - # 2 - fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', - initial_assumption=assumption_pipelines) - fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec - evaluation_results.append(fedot_meta_results) - - # Fit & evaluate simple baseline - baseline_metrics = 
evaluate_pipeline(baseline_pipeline, fedot_meta.train_data) - baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline, - **baseline_metrics) - evaluation_results.append(baseline_res) - - # Fit & evaluate initial assumptions - for i, assumption in enumerate(initial_assumptions): - pipeline = assumption.predictor - assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data) - assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}', - pipeline=pipeline, **assumption_metrics) - evaluation_results.append(assumption_res) - except Exception: - logging.exception(f'Test dataset "{dataset_id}"') - progress_file.close() - - # Save the accumulated results - history_dir = save_dir.joinpath('histories') - history_dir.mkdir() - models_dir = save_dir.joinpath('models') - for res in evaluation_results: - try: - res['run_date'] = time_now - dataset_id = res['dataset_id'] - run_label = res['run_label'] - # define saving paths - model_path = models_dir.joinpath(f'{dataset_id}_{run_label}') - history_path = history_dir.joinpath(f'{dataset_id}_{run_label}_history.json') - # replace objects with export paths for csv - res['model_path'] = str(model_path) - res.pop('model_obj').save(res['model_path']) - res['history_path'] = str(history_path) - history_obj = res.pop('history_obj') - if history_obj is not None: - history_obj.save(res['history_path']) - except Exception: - logging.exception(f'Saving results "{res}"') - - pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv')) - - # save experiment hyperparameters - params = dict( - run_date=time_now_iso, - input_config=config, - dataset_ids=list(datasets_dict.keys()), - dataset_ids_train=dataset_ids_train, - dataset_names_train=df_datasets_train['dataset_name'].to_list(), - dataset_ids_test=dataset_ids_test, - dataset_names_test=df_datasets_test['dataset_name'].to_list(), - 
baseline_pipeline=baseline_pipeline.descriptive_id, - ) - with open(save_dir.joinpath('parameters.json'), 'w') as params_file: - json.dump(params, params_file, indent=2) + with open(progress_file_path, 'a') as progress_file: + for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file): + try: + # Run meta AutoML + # 1 + time_start = timeit.default_timer() + meta_features = mf_extractor.extract([dataset], + fill_input_nans=True, use_cached=False, update_cached=True) + meta_features = meta_features.fillna(0) + meta_learning_time_sec = timeit.default_timer() - time_start + initial_assumptions = model_advisor.predict(meta_features)[0] + assumption_pipelines = [model.predictor for model in initial_assumptions] + # 2 + run_date = datetime.now() + fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT', + initial_assumption=assumption_pipelines) + fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec + save_evaluation(fedot_meta_results, run_date, experiment_date, save_dir) + + # Fit & evaluate simple baseline + baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build() + run_date = datetime.now() + baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data) + baseline_res = get_result_data_row(dataset=dataset, run_label=f'simple baseline {BASELINE_MODEL}', + pipeline=baseline_pipeline, + **baseline_metrics) + save_evaluation(baseline_res, run_date, experiment_date, save_dir) + + # Fit & evaluate initial assumptions + for i, assumption in enumerate(initial_assumptions): + pipeline = assumption.predictor + run_date = datetime.now() + assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data) + assumption_res = get_result_data_row(dataset=dataset, + run_label=f'MetaFEDOT - initial assumption {i}', + pipeline=pipeline, **assumption_metrics) + save_evaluation(assumption_res, run_date, experiment_date, save_dir) + except 
Exception: + logging.exception(f'Test dataset "{dataset_id}"') if __name__ == "__main__": diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py index 505868f6..1001b5be 100644 --- a/meta_automl/data_preparation/dataset/custom_dataset.py +++ b/meta_automl/data_preparation/dataset/custom_dataset.py @@ -8,7 +8,6 @@ from meta_automl.data_preparation.dataset.dataset_base import DatasetData - class DataNotFoundError(FileNotFoundError): pass From 68a24433ca953ebadbea3a7e73f0f5d492eaa28d Mon Sep 17 00:00:00 2001 From: morrisnein Date: Sun, 16 Jul 2023 18:13:21 +0300 Subject: [PATCH 58/60] fix logging --- .../datasets_loaders/openml_datasets_loader.py | 6 ------ .../meta_features_extractors/pymfe_extractor.py | 5 ++--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py index f7fbfb80..89cd2445 100644 --- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py +++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py @@ -2,8 +2,6 @@ from typing import List, Union, Optional, Sequence -from golem.core.log import default_log - from meta_automl.data_preparation.dataset import OpenMLDataset, OpenMLDatasetIDType from meta_automl.data_preparation.datasets_loaders import DatasetsLoader @@ -37,7 +35,3 @@ def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str], self.dataset_ids.add(dataset.id_) return dataset - - @property - def _log(self): - return default_log(self) diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py index edfa6925..1542e823 100644 --- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py +++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py @@ 
-1,9 +1,9 @@ from __future__ import annotations +import logging from typing import List, Union, Dict, Any import pandas as pd -from golem.core.log import default_log from pymfe.mfe import MFE from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType @@ -18,7 +18,6 @@ def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: Dat self.extractor_params = extractor_params if extractor_params is not None else self.default_params self._datasets_loader = datasets_loader or OpenMLDatasetsLoader() self._extractor = MFE(**self.extractor_params) - self._logger = default_log(self) @property def datasets_loader(self) -> DatasetsLoader: @@ -35,7 +34,7 @@ def extract(self, datasets_or_ids: List[Union[DatasetBase, DatasetIDType]], if not isinstance(dataset, DatasetBase): dataset = self._datasets_loader.load_single(dataset) - self._logger.info(f'Extracting meta features of the dataset {dataset}...') + logging.critical(f'Extracting meta features of the dataset {dataset}...') if (use_cached and (mfs := self._get_meta_features_cache(dataset.id_, meta_feature_names))): meta_features[dataset.id_] = mfs From b4c714f3a245ed33537a63d804d3dad286c0e81c Mon Sep 17 00:00:00 2001 From: max Date: Wed, 19 Jul 2023 18:10:47 +0300 Subject: [PATCH 59/60] Adding an AutoML baseline class --- baselines/__init__.py | 0 baselines/automl_baseline.py | 11 +++++++++++ 2 files changed, 11 insertions(+) create mode 100644 baselines/__init__.py create mode 100644 baselines/automl_baseline.py diff --git a/baselines/__init__.py b/baselines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/baselines/automl_baseline.py b/baselines/automl_baseline.py new file mode 100644 index 00000000..36a82b28 --- /dev/null +++ b/baselines/automl_baseline.py @@ -0,0 +1,11 @@ +from abc import ABC + + +class AutoMLBaseline(ABC): + def run(self): + raise NotImplementedError + + @staticmethod + def save_on_disk(data): + raise NotImplementedError + From 
645a98f8f3806bdb1c89f538691cb6decaac2754 Mon Sep 17 00:00:00 2001 From: max Date: Wed, 19 Jul 2023 18:15:04 +0300 Subject: [PATCH 60/60] Reflecting API changes in an asklearn baseline --- baselines/auto-sklearn/__init__.py | 0 .../auto-sklearn/auto-sklearn_baseline.py | 166 ++++++++++++++++++ .../auto-sklearn/data/experimental_data.csv | 57 ++++++ 3 files changed, 223 insertions(+) create mode 100644 baselines/auto-sklearn/__init__.py create mode 100644 baselines/auto-sklearn/auto-sklearn_baseline.py create mode 100644 baselines/auto-sklearn/data/experimental_data.csv diff --git a/baselines/auto-sklearn/__init__.py b/baselines/auto-sklearn/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/baselines/auto-sklearn/auto-sklearn_baseline.py b/baselines/auto-sklearn/auto-sklearn_baseline.py new file mode 100644 index 00000000..e467e8f1 --- /dev/null +++ b/baselines/auto-sklearn/auto-sklearn_baseline.py @@ -0,0 +1,166 @@ +import csv +import time + +from typing import Any, Tuple, Dict + +import numpy as np +import logging + +import autosklearn.classification +import autosklearn.ensembles + +from sklearn import model_selection, metrics + +from baselines.automl_baseline import AutoMLBaseline +from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader +from meta_automl.data_preparation.models_loaders import KnowledgeBaseModelsLoader +from autosklearn.classification import AutoSklearnClassifier + + +class AutoSklearnBaseline(AutoMLBaseline): + def __init__(self, ensemble_type, time_limit): + self.estimator = AutoSklearnClassifier( + ensemble_class=ensemble_type, + time_left_for_this_task=time_limit, + ) + self.knowledge_base_loader = KnowledgeBaseModelsLoader() + + @staticmethod + def make_quality_metric_estimates(y, predictions, prediction_proba, is_multi_label): + """ Compute roc_auc, f1, accuracy, log_loss and precision scores. 
""" + results = { + 'roc_auc': -1 * float( + "{:.3f}".format( + metrics.roc_auc_score( + y, + prediction_proba if is_multi_label else predictions, + multi_class='ovr' + ) + ) + ), + 'f1': -1 * float( + "{:.3f}".format( + metrics.f1_score( + y, + predictions, + average='macro' if is_multi_label else 'binary' + ) + ) + ), + 'accuracy': -1 * float( + "{:.3f}".format( + metrics.accuracy_score( + y, + predictions + ) + ) + ), + 'logloss': float( + "{:.3f}".format( + metrics.log_loss( + y, + prediction_proba if is_multi_label else predictions + ) + ) + ), + 'precision': -1 * float( + "{:.3f}".format( + metrics.precision_score( + y, + predictions, + average='macro' if is_multi_label else 'binary', + labels=np.unique(predictions) + ) + ) + ) + } + return results + + def run(self): + """ Fit auto-sklearn meta-optimizer to knowledge base datasets and output a single best model. """ + dataset_ids_to_load = [ + dataset_id for dataset_id in self.knowledge_base_loader + .parse_datasets('test') + .loc[:, 'dataset_id'] + ] + # dataset_ids_to_load = [dataset_ids_to_load[dataset_ids_to_load.index(41166)]] + + loaded_datasets = OpenMLDatasetsLoader().load(dataset_ids_to_load) + + for iteration, dataset in enumerate(loaded_datasets): + logging.log(logging.INFO, f"Loaded dataset name: {dataset.name}") + dataset_data = dataset.get_data() + + X_train, X_test, y_train, y_test = model_selection.train_test_split( + dataset_data.x, + dataset_data.y, + test_size=0.2, + random_state=42, + stratify=dataset_data.y + ) + + fitting_start_time = time.time() + ensemble = self.estimator.fit(X_train, y_train) + fitting_time = time.time() - fitting_start_time + logging.log(logging.INFO, f"Fitting time is {fitting_time}sec") + + inference_start_time = time.time() + predicted_results = self.estimator.predict(X_test) + inference_time = time.time() - inference_start_time + logging.log(logging.INFO, f"Inference time is {inference_time}sec") + + predicted_probabilities = self.estimator.predict_proba(X_test) 
+ + best_single_model = list(ensemble.show_models().values())[0].get('sklearn_classifier') + + # autosklearn_ensemble = pipeline.show_models() + # formatted_ensemble = { + # model_id: { + # 'rank': autosklearn_ensemble[model_id].get('rank'), + # 'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"), + # 'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'), + # 'model': autosklearn_ensemble[model_id].get('sklearn_classifier') + # } for model_id in autosklearn_ensemble.keys() + # } + + general_run_info = { + 'dataset_id': dataset.id_, + 'dataset_name': dataset.name, + 'run_label': 'Auto-sklearn', + } + + is_multilabel_classification = True if len(set(predicted_results)) > 2 else False + quality_metric_estimates = AutoSklearnBaseline.make_quality_metric_estimates( + y_test, + predicted_results, + predicted_probabilities, + is_multilabel_classification + ) + + model_dependent_run_info = { + 'fit_time': float(f'{fitting_time:.1f}'), + 'inference_time': float(f'{inference_time:.1f}'), + 'model_str': repr(best_single_model) + } + + results = {**general_run_info, **quality_metric_estimates, **model_dependent_run_info} + + # for key in autosklearn_ensemble.keys(): + # ensemble_model = autosklearn_ensemble[key] + # formatted_ensemble = results['ensemble'] + # for model_id in formatted_ensemble.keys(): + # formatted_ensemble[model_id] = ensemble_model.get("rank", None) + + AutoSklearnBaseline.save_on_disk(results.values()) + + return results + + @staticmethod + def save_on_disk(data): + with open('data/experimental_data.csv', 'a', newline='') as file: + writer = csv.writer(file, delimiter=',') + writer.writerow(data) + + +if __name__ == '__main__': + AutoSklearnBaseline(autosklearn.ensembles.SingleBest, 600).run() diff --git a/baselines/auto-sklearn/data/experimental_data.csv b/baselines/auto-sklearn/data/experimental_data.csv new file mode 100644 index 00000000..7a3f3cfa --- /dev/null +++ b/baselines/auto-sklearn/data/experimental_data.csv 
@@ -0,0 +1,57 @@ +1461,bank-marketing,Auto-sklearn,-0.711,-0.535,-0.907,3.34,-0.648,598.0,0.1,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.7108930238344161e-10, + learning_rate=0.010827728124541558, loss='auto', + max_iter=512, max_leaf_nodes=25, + min_samples_leaf=4, n_iter_no_change=19, + random_state=1, + validation_fraction=0.1759114608225653, + warm_start=True)" +179,adult,Auto-sklearn,-0.774,-0.91,-0.859,5.077,-0.885,595.3,0.1,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.7108930238344161e-10, + learning_rate=0.010827728124541558, loss='auto', + max_iter=512, max_leaf_nodes=25, + min_samples_leaf=4, n_iter_no_change=19, + random_state=1, + validation_fraction=0.1759114608225653, + warm_start=True)" +1464,blood-transfusion-service-center,Auto-sklearn,-0.669,-0.5,-0.8,7.209,-0.625,597.6,0.0,"PassiveAggressiveClassifier(C=0.253246830865058, average=True, max_iter=16, + random_state=1, tol=0.01676578241454229, + warm_start=True)" +991,car,Auto-sklearn,-1.0,-1.0,-1.0,0.0,-1.0,596.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=1.9280388598217333e-10, + learning_rate=0.24233932723531437, loss='auto', + max_iter=128, max_leaf_nodes=35, + min_samples_leaf=17, n_iter_no_change=1, + random_state=1, validation_fraction=None, + warm_start=True)" +1489,phoneme,Auto-sklearn,-0.848,-0.797,-0.887,4.068,-0.845,600.4,0.1,"AdaBoostClassifier(algorithm='SAMME', + base_estimator=DecisionTreeClassifier(max_depth=10), + learning_rate=1.1377640450285444, n_estimators=352, + random_state=1)" +41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.975,-0.816,-0.865,0.271,-0.824,595.1,0.2,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=9.674948183980905e-09, + learning_rate=0.014247987845444413, loss='auto', + max_iter=512, max_leaf_nodes=55, + min_samples_leaf=164, n_iter_no_change=1, + random_state=1, + validation_fraction=0.11770489601182355, + warm_start=True)" 
+41166,volkert,Auto-sklearn,-0.874,-0.586,-0.644,1.829,-0.587,595.8,0.3,"LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr', + tol=0.018821286956948503)" +54,vehicle,Auto-sklearn,-0.964,-0.86,-0.859,0.408,-0.861,595.5,0.0,"MLPClassifier(activation='tanh', alpha=0.0002060405669905105, beta_1=0.999, + beta_2=0.9, hidden_layer_sizes=(87, 87, 87), + learning_rate_init=0.00040205833939989724, max_iter=256, + n_iter_no_change=32, random_state=1, validation_fraction=0.0, + verbose=0, warm_start=True)" +40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,296.1,1.2,"KNeighborsClassifier(n_neighbors=4, weights='distance')" +40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,595.5,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')" +42344,sf-police-incidents,Auto-sklearn,-0.574,-0.589,-0.574,15.367,-0.569,594.8,0.5,"HistGradientBoostingClassifier(early_stopping=True, + l2_regularization=3.609412172481434e-10, + learning_rate=0.05972079854295879, loss='auto', + max_iter=512, max_leaf_nodes=4, + min_samples_leaf=2, n_iter_no_change=14, + random_state=1, validation_fraction=None, + warm_start=True)" +1240,airlinescodrnaadult,Auto-sklearn,-0.62,-0.683,-0.631,13.306,-0.658,594.3,0.1,"SGDClassifier(alpha=1.6992296128865824e-07, average=True, eta0=0.01, loss='log', + max_iter=512, penalty='l1', random_state=1, + tol=1.535384699341134e-05, warm_start=True)" \ No newline at end of file