From 107714e03efa60f507eb3411c27a8dc7a4d69459 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 17:44:30 +0300
Subject: [PATCH 01/60] create requirements.txt

---
 requirements.txt | Bin 0 -> 280 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..17db3011d9c2832987c96b696a85b52c145131e1
GIT binary patch
literal 280
zcmY+9Ne;p=5CrRt#8W_V!gAs<hy)3NNg(FH=YgIv7?3qSxZKM;AA34ldJ-F*iaBdd
z#gqwWZbSy|&Vt@+7JeJ;MLPY7oQgKk6!pMXwADQp{Zh1XZ*z@U`X~46Y%Gb4?^f2R
q)&y6M>^N}LBxcI)lyFs&`Q(oOKY#lN-!S_mE@~HDXW1*?EwdK^Q70h)

literal 0
HcmV?d00001


From e67bde826ffd27b397dac94d3f1861859f831a26 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 17:53:07 +0300
Subject: [PATCH 02/60] move to FEDOT 0.7.0

---
 .../advise_models_from_similar_datasets.py      |   2 +-
 meta_automl/data_preparation/model.py           |   2 +-
 .../models_loaders/fedot_pipelines_loader.py    |   2 +-
 requirements.txt                                | Bin 280 -> 310 bytes
 4 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
index 85d62b48..f7d583c5 100644
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py
+++ b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -1,5 +1,5 @@
-from fedot.core.optimisers.fitness import SingleObjFitness
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
+from golem.core.optimisers.fitness import SingleObjFitness
 from sklearn.model_selection import train_test_split
 
 from meta_automl.data_preparation.dataset import DatasetCache
diff --git a/meta_automl/data_preparation/model.py b/meta_automl/data_preparation/model.py
index 44543dfe..f999368d 100644
--- a/meta_automl/data_preparation/model.py
+++ b/meta_automl/data_preparation/model.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Any
 
-from fedot.core.optimisers.fitness import Fitness
+from golem.core.optimisers.fitness import Fitness
 
 from meta_automl.data_preparation.dataset import DatasetCache
 
diff --git a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
index b5fe2abf..bb66c3aa 100644
--- a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
+++ b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
@@ -4,7 +4,6 @@
 
 import numpy as np
 from fedot.core.data.data import InputData
-from fedot.core.log import default_log
 from fedot.core.optimisers.objective import PipelineObjectiveEvaluate
 from fedot.core.optimisers.objective.metrics_objective import MetricsObjective
 from fedot.core.pipelines.pipeline import Pipeline
@@ -12,6 +11,7 @@
 from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from fedot.core.validation.split import tabular_cv_generator
+from golem.core.log import default_log
 from tqdm import tqdm
 
 from meta_automl.data_preparation.data_manager import PathType
diff --git a/requirements.txt b/requirements.txt
index 17db3011d9c2832987c96b696a85b52c145131e1..1f48eb66a4e24225c52ee8361e241051939a511e 100644
GIT binary patch
delta 38
pcmbQiw2f(l1fyIDLk2@CLpnn~Lk^J6Wv~T80|q??BOo?l005_?2KN8}

delta 7
OcmdnSG=ph_1S0?nP68YN


From 245865421caadbb29027df0dbebd90d11547e1be Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 18:07:22 +0300
Subject: [PATCH 03/60] create Dockerfile

---
 Dockerfile | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..e17e17cd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+# Download base image ubuntu 20.04
+FROM ubuntu:20.04
+
+# For apt to be noninteractive
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
+
+# Preseed tzdata, update package index, upgrade packages and install needed software
+RUN truncate -s0 /tmp/preseed.cfg; \
+    echo "tzdata tzdata/Areas select Europe" >> /tmp/preseed.cfg; \
+    echo "tzdata tzdata/Zones/Europe select Berlin" >> /tmp/preseed.cfg; \
+    debconf-set-selections /tmp/preseed.cfg && \
+    rm -f /etc/timezone /etc/localtime && \
+	apt-get update && \
+	apt-get install -y nano  && \
+	apt-get install -y mc && \
+    apt-get install -y python3.9 python3-pip && \
+	apt-get install -y git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set the workdir
+ENV WORKDIR /home/meta-automl-research
+WORKDIR $WORKDIR
+COPY . $WORKDIR
+
+RUN pip3 install pip && \
+    pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt
+
+ENV PYTHONPATH $WORKDIR

From e8fee3014ed570ec39d672d1d95c7953cf26c196 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 22:06:22 +0300
Subject: [PATCH 04/60] prepare experiment demo

---
 experiments/fedot_warm_start/run.py | 81 +++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 experiments/fedot_warm_start/run.py

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
new file mode 100644
index 00000000..f0342126
--- /dev/null
+++ b/experiments/fedot_warm_start/run.py
@@ -0,0 +1,81 @@
+import functools
+import timeit
+
+import openml
+import pandas as pd
+from fedot.api.main import Fedot
+from sklearn.model_selection import train_test_split
+
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+
+SEED = 42
+
+
+def prepare_data():
+    dataset_ids = pd.Series(openml.study.get_suite(99).data)
+    dataset_ids = dataset_ids.sample(n=15, random_state=SEED)
+    dataset_ids = list(dataset_ids)
+    return OpenMLDatasetsLoader().load(dataset_ids)
+
+
+def timeit_decorator(function):
+    @functools.wraps(function)
+    def wrapped(*args, **kwargs):
+        start_time = timeit.default_timer()
+        res = function(*args, **kwargs)
+        time = timeit.default_timer() - start_time
+        return res, time
+
+    return wrapped
+
+
+def main():
+    datasets_cache = prepare_data()
+    datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED)
+
+    # TODO:
+    #  - Extract meta-features for train datasets
+    #  - Fit 'DatasetsSimilarityAssessor'
+
+    results_pre = []
+    for cache in datasets_train:
+        data = cache.from_cache()
+        fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED)
+        _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y)
+        results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time})
+
+    # TODO:
+    #  - Prepare 'ModelAdvisor'
+
+    results = []
+    for cache in datasets_test:
+        data = cache.from_cache()
+        fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED)
+        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y)
+
+        time_start = timeit.default_timer()
+        # TODO:
+        #  - Extract meta-features for current test dataset
+        #  - Get suitable assumptions from 'ModelAdvisor'
+        initial_assumption = ...
+        fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption)
+        automl_time_meta = timeit.default_timer() - time_start
+
+        metrics_naive = fedot_naive.get_metrics()
+        metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
+        metrics_meta = fedot_meta.get_metrics()
+        metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()}
+
+        results.append({
+            'dataset': data.name,
+            'model_naive': fedot_naive,
+            'model_meta': fedot_meta,
+            'automl_time_naive': automl_time_naive,
+            'automl_time_meta': automl_time_meta,
+            **metrics_naive, **metrics_meta
+        })
+
+
+if __name__ == "__main__":
+    main()

From 5fb00f0f760947739c6d4a8bef84a877f05c2df6 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 3 Mar 2023 11:24:52 +0300
Subject: [PATCH 05/60] adapt to FEDOT 0.7.0 again

---
 .../advise_models_from_similar_datasets.py                | 8 ++++----
 .../model_advisors/diverse_fedot_pipeline_advisor.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
index f7d583c5..5d948e0b 100644
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py
+++ b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -21,13 +21,13 @@ def main():
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
     x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
     y_train = x_train.index
-    assessor = KNNSimilarityAssessor({'n_neighbors': 2}, n_best=2)
+    assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2)
     assessor.fit(x_train, y_train)
     # Define best models for datasets.
     best_pipelines = [
-        PipelineBuilder().add_node('scaling').add_node('rf').to_pipeline(),
-        PipelineBuilder().add_node('normalization').add_node('logit').to_pipeline(),
-        PipelineBuilder().add_node('rf').add_node('logit').to_pipeline()
+        PipelineBuilder().add_node('scaling').add_node('rf').build(),
+        PipelineBuilder().add_node('normalization').add_node('logit').build(),
+        PipelineBuilder().add_node('rf').add_node('logit').build()
     ]
     best_models = [[Model(pipeline, SingleObjFitness(1), DatasetCache(dataset_name))]
                    for dataset_name, pipeline in zip(y_train, best_pipelines)]
diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
index aa91e0db..15ef1f57 100644
--- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
@@ -1,7 +1,7 @@
 from typing import Callable, List, Iterable
 
-from fedot.core.dag.linked_graph import get_distance_between
 from fedot.core.pipelines.pipeline import Pipeline
+from golem.core.dag.linked_graph import get_distance_between
 
 from meta_automl.data_preparation.model import Model
 from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor

From 310a5788baad24a360f838c8d1bd9f743dd95952 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 3 Mar 2023 17:40:35 +0300
Subject: [PATCH 06/60] fix similarity assessors

---
 .../select_similar_datasets_by_knn.py         |  6 +-
 .../advise_models_from_similar_datasets.py    |  4 +-
 .../datasets_similarity_assessors/__init__.py |  2 +-
 .../model_based_similarity_assessors.py       | 51 ++++++++++++++++
 .../predict_proba_similarity_assessors.py     | 59 -------------------
 5 files changed, 57 insertions(+), 65 deletions(-)
 create mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
 delete mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py

diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
index dc1c190c..b6f2bb8c 100644
--- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
+++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
@@ -2,7 +2,7 @@
 
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 
 
 def main():
@@ -16,10 +16,10 @@ def main():
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
     x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
     y_train = x_train.index
-    assessor = KNNSimilarityAssessor({'n_neighbors': 1}, n_best=2)
+    assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3)
     assessor.fit(x_train, y_train)
     # Get models for the best fitting datasets from train.
-    return x_test.index, assessor.predict(x_test)
+    return x_test.index, assessor.predict(x_test, return_distance=True)
 
 
 if __name__ == '__main__':
diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
index 5d948e0b..d10dad85 100644
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py
+++ b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -6,7 +6,7 @@
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
 
@@ -21,7 +21,7 @@ def main():
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
     x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
     y_train = x_train.index
-    assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2)
+    assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2)
     assessor.fit(x_train, y_train)
     # Define best models for datasets.
     best_pipelines = [
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
index 621a68e0..0c33e2c4 100644
--- a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
+++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
@@ -1,2 +1,2 @@
 from .datasets_similarity_assessor import DatasetsSimilarityAssessor
-from .predict_proba_similarity_assessors import KNNSimilarityAssessor, PredictProbaSimilarityAssessor
+from .model_based_similarity_assessors import KNeighborsBasedSimilarityAssessor, ModelBasedSimilarityAssessor
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
new file mode 100644
index 00000000..09720a1e
--- /dev/null
+++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
@@ -0,0 +1,51 @@
+from abc import ABC
+from typing import Optional, Dict, Any, List, Iterable
+
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import NearestNeighbors
+
+from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \
+    DatasetsSimilarityAssessor
+
+
+class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor):
+    def __init__(self, model, n_best: int = 1):
+        self._inner_model = model
+        self.n_best = n_best
+        self._datasets: Optional[Iterable[str]] = None
+
+
+class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor):
+    def __init__(self, n_neighbors: int = 1, **model_params):
+        model = NearestNeighbors(n_neighbors=n_neighbors, **model_params)
+        super().__init__(model, n_neighbors)
+
+    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
+        meta_features = self.preprocess_meta_features(meta_features)
+        self._datasets = np.array(datasets)
+        self._inner_model.fit(meta_features)
+
+    @staticmethod
+    def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame:
+        return meta_features.dropna(axis=1, how='any')
+
+    def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]:
+        dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance)
+        if return_distance:
+            distances, dataset_indexes = dataset_indexes
+            dataset_names = np.take(self._datasets, dataset_indexes, axis=0)
+            return distances, dataset_names
+        else:
+            return np.take(self._datasets, dataset_indexes, axis=0)
+
+    @property
+    def datasets(self) -> Optional[Iterable[str]]:
+        return self._datasets
+
+    @property
+    def feature_names(self) -> List[str]:
+        return self._inner_model.feature_names_in_
+
+    def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame:
+        return meta_features[self.feature_names]
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py
deleted file mode 100644
index 8254c745..00000000
--- a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from typing import Optional, Dict, Any, List, Iterable
-
-import numpy as np
-import pandas as pd
-from sklearn.neighbors import KNeighborsClassifier
-
-from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \
-    DatasetsSimilarityAssessor
-
-
-class PredictProbaSimilarityAssessor(DatasetsSimilarityAssessor):
-    def __init__(self, model, n_best: int = 1):
-        self._inner_model = model
-        self.n_best = n_best
-
-    @property
-    def datasets(self) -> List[str]:
-        return self._inner_model.classes_
-
-    @property
-    def feature_names(self) -> List[str]:
-        return self._inner_model.feature_names_in_
-
-    @staticmethod
-    def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame:
-        return meta_features.dropna(axis=1, how='any')
-
-    def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame:
-        return meta_features[self.feature_names]
-
-    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
-        meta_features = self.preprocess_meta_features(meta_features)
-        self._inner_model.fit(meta_features, datasets)
-
-    def predict_proba(self, meta_features: pd.DataFrame) -> List[List[float]]:
-        return self._inner_model.predict_proba(meta_features)
-
-    def predict(self, meta_features: pd.DataFrame) -> List[List[str]]:
-        meta_features = self._preprocess_predict_features(meta_features)
-        predict_probs = self.predict_proba(meta_features)
-        final_prediction = []
-        for probabilities in predict_probs:
-            probabilities = list(probabilities)
-            predictions = []
-            for _ in range(self.n_best):
-                predicted_class_idx = np.argmax(probabilities)
-                predicted_class = self.datasets[predicted_class_idx]
-                predictions.append(predicted_class)
-                probabilities.pop(predicted_class_idx)
-            final_prediction.append(predictions)
-
-        return final_prediction
-
-
-class KNNSimilarityAssessor(PredictProbaSimilarityAssessor):
-    def __init__(self, model_params: Optional[Dict[str, Any]] = None, n_best: int = 1):
-        model_params = model_params or dict()
-        model = KNeighborsClassifier(**model_params)
-        super().__init__(model, n_best)

From ae9c9098959fb4bb00f5f9b958309ca2ba118712 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 4 Mar 2023 13:32:47 +0300
Subject: [PATCH 07/60] allow PymfeExtractor to fill input NaNs with the median

---
 .../meta_features_extractors/pymfe_extractor.py   | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 2848998e..3a379f6f 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -25,7 +25,7 @@ def datasets_loader(self) -> DatasetsLoader:
             raise ValueError("Datasets loader not provided!")
         return self._datasets_loader
 
-    def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame:
+    def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame:
         meta_features = {}
         meta_feature_names = self._extractor.extract_metafeature_names()
         load_dataset = self.datasets_loader.cache_to_memory
@@ -37,10 +37,21 @@ def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame:
             else:
                 loaded_dataset = load_dataset(dataset)
                 cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val]
-                mfe = self._extractor.fit(loaded_dataset.x, loaded_dataset.y, cat_cols=cat_cols)
+                x = loaded_dataset.x
+                y = loaded_dataset.y
+                if fill_nans:
+                    x = self.fill_nans(x)
+                mfe = self._extractor.fit(x, y, cat_cols=cat_cols)
                 feature_names, dataset_features = mfe.extract(out_type=tuple)
                 mfs = dict(zip(feature_names, dataset_features))
                 self._update_meta_features_cache(dataset.name, mfs)
                 meta_features[dataset.name] = mfs
         meta_features = pd.DataFrame.from_dict(meta_features, orient='index')
         return meta_features
+
+    @staticmethod
+    def fill_nans(x):
+        if not isinstance(x, pd.DataFrame):
+            x = pd.DataFrame(x)
+        x = x.fillna(x.median())
+        return x.to_numpy()

From 60dc77ad0b50c33bd5537647db7ce991df0c7917 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 24 Mar 2023 14:10:48 +0300
Subject: [PATCH 08/60] use FEDOT version with fixed initial assumptions

---
 requirements.txt | Bin 310 -> 460 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1f48eb66a4e24225c52ee8361e241051939a511e..4785a7e31c157dd88d70c0da7317bb52f92cfde0 100644
GIT binary patch
delta 185
zcmXAjOA5k35JaEm6fRu36q25h#19BA)U6vY5Mw@qA%f%DGr92;o=Ght%~VfUJ(_+E
z-@)VisX61B(nvj5ZY+sZf4||*k&uFpR&Fl3)pY6oh#gxt>^ZO&SCEllob()(dO|9V
q)vNyR<j=Jw6+6S3Y*QRdGerwkuD}Et6H1*?vl3NmO^1Ju#)BUY{U7!K

delta 34
mcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MVuOwAWf=joy$G8C


From cf250661790a219b467895e3e6c034b943732135 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:47:06 +0300
Subject: [PATCH 09/60] optional cache usage for MFE extractor

---
 .../meta_features_extractors/pymfe_extractor.py      | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 3a379f6f..36cb9d45 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -25,26 +25,30 @@ def datasets_loader(self) -> DatasetsLoader:
             raise ValueError("Datasets loader not provided!")
         return self._datasets_loader
 
-    def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame:
+    def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False,
+                use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame:
         meta_features = {}
         meta_feature_names = self._extractor.extract_metafeature_names()
         load_dataset = self.datasets_loader.cache_to_memory
         for dataset in datasets:
             if isinstance(dataset, str):
                 dataset = DatasetCache(dataset)
-            if mfs := self._get_meta_features_cache(dataset.name, meta_feature_names):
+
+            if (use_cached and
+                    (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))):
                 meta_features[dataset.name] = mfs
             else:
                 loaded_dataset = load_dataset(dataset)
                 cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val]
                 x = loaded_dataset.x
                 y = loaded_dataset.y
-                if fill_nans:
+                if fill_input_nans:
                     x = self.fill_nans(x)
                 mfe = self._extractor.fit(x, y, cat_cols=cat_cols)
                 feature_names, dataset_features = mfe.extract(out_type=tuple)
                 mfs = dict(zip(feature_names, dataset_features))
-                self._update_meta_features_cache(dataset.name, mfs)
+                if update_cached:
+                    self._update_meta_features_cache(dataset.name, mfs)
                 meta_features[dataset.name] = mfs
         meta_features = pd.DataFrame.from_dict(meta_features, orient='index')
         return meta_features

From a5a0c8abf96729de915da5a69d1cfc89aba25cca Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:47:54 +0300
Subject: [PATCH 10/60] allow advising only the n best models

---
 .../model_advisors/diverse_fedot_pipeline_advisor.py      | 8 +++++++-
 .../meta_algorithm/model_advisors/model_advisor.py        | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
index 15ef1f57..6f7e4a66 100644
--- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
@@ -1,4 +1,4 @@
-from typing import Callable, List, Iterable
+from typing import Callable, List, Iterable, Optional
 
 from fedot.core.pipelines.pipeline import Pipeline
 from golem.core.dag.linked_graph import get_distance_between
@@ -11,10 +11,12 @@
 class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor):
     def __init__(self,
                  fitted_similarity_assessor: DatasetsSimilarityAssessor,
+                 n_best_to_advise: Optional[int] = None,
                  minimal_distance: int = 1,
                  distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between):
         super().__init__(fitted_similarity_assessor)
         self.minimal_distance = minimal_distance
+        self.n_best_to_advise = n_best_to_advise
         self.distance_func = distance_func
 
     def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]:
@@ -24,4 +26,8 @@ def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]:
         for model in dataset_advice[1:]:
             if self.distance_func(first_model.predictor, model.predictor) > self.minimal_distance:
                 diverse_dataset_advice.append(model)
+
+        if self.n_best_to_advise is not None:
+            diverse_dataset_advice = list(sorted(diverse_dataset_advice, key=lambda m: m.fitness, reverse=True))
+            diverse_dataset_advice = diverse_dataset_advice[:self.n_best_to_advise]
         return diverse_dataset_advice
diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
index b585bf27..a9ca0d97 100644
--- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Dict, Iterable
+from typing import List, Dict, Iterable, Optional
 
 import pandas as pd
 

From 3bfaf5010f968f23dc98b3121948598a66626de2 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:58:49 +0300
Subject: [PATCH 11/60] finalize experiment

---
 experiments/fedot_warm_start/run.py | 145 +++++++++++++++++++++++-----
 1 file changed, 119 insertions(+), 26 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index f0342126..be5f45f7 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,22 +1,58 @@
 import functools
 import timeit
+from datetime import datetime
+from itertools import chain
+from pathlib import Path
+from typing import Dict
 
+import numpy as np
 import openml
 import pandas as pd
 from fedot.api.main import Fedot
+from fedot.core.pipelines.adapters import PipelineAdapter
 from sklearn.model_selection import train_test_split
+from tqdm import tqdm
 
+from meta_automl.data_preparation.dataset import DatasetCache, Dataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+from meta_automl.data_preparation.model import Model
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
+# Meta-alg hyperparameters
 SEED = 42
-
-
-def prepare_data():
-    dataset_ids = pd.Series(openml.study.get_suite(99).data)
-    dataset_ids = dataset_ids.sample(n=15, random_state=SEED)
+# Datasets sampling
+N_DATASETS = None
+TEST_SIZE = 0.33
+# Evaluation timeouts
+TRAIN_TIMEOUT = 15
+TEST_TIMEOUT = 10
+# Models & datasets
+N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
+N_CLOSEST_DATASETS_TO_PROPOSE = 5
+MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1
+N_BEST_MODELS_TO_ADVISE = 5
+
+
+COMMON_FEDOT_PARAMS = dict(
+    problem='classification',
+    with_tuning=False,
+    logging_level=50,
+    n_jobs=-1,
+    seed=SEED,
+)
+
+
+def prepare_data() -> Dict[str, DatasetCache]:
+    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
+
+    dataset_ids = openml.study.get_suite(99).data
+    if N_DATASETS is not None:
+        dataset_ids = pd.Series(dataset_ids)
+        dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
     dataset_ids = list(dataset_ids)
-    return OpenMLDatasetsLoader().load(dataset_ids)
+    return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
 
 
 def timeit_decorator(function):
@@ -30,37 +66,80 @@ def wrapped(*args, **kwargs):
     return wrapped
 
 
+def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
+    x = data.x
+    y = data.y
+    if len(y.shape) == 1:
+        y = y.reshape(-1, 1)
+    return x, y
+
+
 def main():
     datasets_cache = prepare_data()
-    datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED)
+    datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()),
+                                                     test_size=TEST_SIZE, random_state=SEED)
 
-    # TODO:
-    #  - Extract meta-features for train datasets
-    #  - Fit 'DatasetsSimilarityAssessor'
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features_train = extractor.extract(datasets_train, fill_input_nans=True)
+    meta_features_train = meta_features_train.fillna(0)
+    data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
+        n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE))
+    data_similarity_assessor.fit(meta_features_train, datasets_train)
 
     results_pre = []
-    for cache in datasets_train:
+    best_models_per_dataset = {}
+    for name in tqdm(datasets_train, 'Train datasets'):
+        cache = datasets_cache[name]
         data = cache.from_cache()
-        fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED)
-        _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y)
-        results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time})
 
-    # TODO:
-    #  - Prepare 'ModelAdvisor'
+        fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS)
+        x, y = transform_data_for_fedot(data)
+        _, automl_time = timeit_decorator(fedot.fit)(x, y)
+        results_pre.append({'dataset': name,
+                            'model': fedot.current_pipeline.descriptive_id,
+                            'automl_time': automl_time})
+        # TODO:
+        #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
+        #   x Evaluate historical pipelines on the data instead of using fitness
+
+        # Filter out unique individuals with the best fitness
+        best_individuals = sorted(chain(*fedot.history.individuals),
+                                  key=lambda ind: ind.fitness,
+                                  reverse=True)
+        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
+        # best_models = list(fedot.best_models) or []
+        best_models = []
+        for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
+            pipeline = PipelineAdapter().restore(individual.graph)
+            model = Model(pipeline, individual.fitness, cache)
+            best_models.append(model)
+        best_models_per_dataset[name] = best_models
+
+    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
+                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
+    model_advisor.fit(best_models_per_dataset)
 
     results = []
-    for cache in datasets_test:
+    for name in tqdm(datasets_test, 'Test datasets'):
+        cache = datasets_cache[name]
         data = cache.from_cache()
-        fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED)
-        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y)
+        x, y = transform_data_for_fedot(data)
+
+        fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS)
+        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y)
+        fedot_naive.test_data = fedot_naive.train_data
+        fedot_naive.prediction = fedot_naive.train_data
 
         time_start = timeit.default_timer()
-        # TODO:
-        #  - Extract meta-features for current test dataset
-        #  - Get suitable assumptions from 'ModelAdvisor'
-        initial_assumption = ...
-        fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption)
+        meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
+        meta_features = meta_features.fillna(0)
+        initial_assumptions = model_advisor.predict(meta_features)[0]
+        initial_assumptions = [model.predictor for model in initial_assumptions]
+        fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS)
+        fedot_meta.fit(x, y)
         automl_time_meta = timeit.default_timer() - time_start
+        fedot_meta.test_data = fedot_meta.train_data
+        fedot_meta.prediction = fedot_meta.train_data
 
         metrics_naive = fedot_naive.get_metrics()
         metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
@@ -69,13 +148,27 @@ def main():
 
         results.append({
             'dataset': data.name,
-            'model_naive': fedot_naive,
-            'model_meta': fedot_meta,
+            'model_naive': fedot_naive.current_pipeline.descriptive_id,
+            'model_meta': fedot_meta.current_pipeline.descriptive_id,
+            'history_naive': fedot_naive.history,
+            'history_meta': fedot_meta.history,
             'automl_time_naive': automl_time_naive,
             'automl_time_meta': automl_time_meta,
             **metrics_naive, **metrics_meta
         })
 
+    time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".")
+    save_dir = Path(f'run_{time_now}')
+    save_dir.mkdir()
+    history_dir = save_dir.joinpath('histories')
+    history_dir.mkdir()
+    for res in results:
+        dataset = res['dataset']
+        res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json'))
+        res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json'))
+    pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv'))
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv'))
+
 
 if __name__ == "__main__":
     main()

From 75ea275a8f446cefdd5ec654398d31c21a30ad54 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 7 Apr 2023 21:08:49 +0300
Subject: [PATCH 12/60] finalize experiment [2]

---
 experiments/fedot_warm_start/run.py | 224 +++++++++++++++++++---------
 requirements.txt                    | Bin 460 -> 460 bytes
 2 files changed, 155 insertions(+), 69 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index be5f45f7..6e043d55 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,16 +1,20 @@
 import functools
+import json
 import timeit
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import openml
 import pandas as pd
 from fedot.api.main import Fedot
+from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate
 from fedot.core.pipelines.adapters import PipelineAdapter
-from sklearn.model_selection import train_test_split
+from fedot.core.pipelines.pipeline_builder import PipelineBuilder
+from fedot.core.validation.split import tabular_cv_generator
+from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
 from meta_automl.data_preparation.dataset import DatasetCache, Dataset
@@ -24,27 +28,28 @@
 SEED = 42
 # Datasets sampling
 N_DATASETS = None
-TEST_SIZE = 0.33
+TEST_SIZE = 0.2
 # Evaluation timeouts
-TRAIN_TIMEOUT = 15
-TEST_TIMEOUT = 10
+TRAIN_TIMEOUT = 5
+TEST_TIMEOUT = 5
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
 MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1
 N_BEST_MODELS_TO_ADVISE = 5
-
+# Meta-features
+MF_EXTRACTOR_PARAMS = {'groups': 'general'}
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',
-    with_tuning=False,
     logging_level=50,
     n_jobs=-1,
     seed=SEED,
+    show_progress=False,
 )
 
 
-def prepare_data() -> Dict[str, DatasetCache]:
+def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
 
     dataset_ids = openml.study.get_suite(99).data
@@ -52,18 +57,7 @@ def prepare_data() -> Dict[str, DatasetCache]:
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
     dataset_ids = list(dataset_ids)
-    return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
-
-
-def timeit_decorator(function):
-    @functools.wraps(function)
-    def wrapped(*args, **kwargs):
-        start_time = timeit.default_timer()
-        res = function(*args, **kwargs)
-        time = timeit.default_timer() - start_time
-        return res, time
-
-    return wrapped
+    return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
 
 
 def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
@@ -74,40 +68,99 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
     return x, y
 
 
-def main():
-    datasets_cache = prepare_data()
-    datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()),
-                                                     test_size=TEST_SIZE, random_state=SEED)
+def get_pipeline_metrics(pipeline,
+                         input_data,
+                         metrics_obj) -> dict:
+    """Gets quality metrics for the fitted pipeline.
+    The function is based on `Fedot.get_metrics()`
+
+    Returns:
+        the values of quality metrics
+    """
+    metrics = metrics_obj.metric_functions
+    metric_names = metrics_obj.get_metric_names(metrics)
+
+    data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold)
+
+    objective = MetricsObjective(metrics)
+    obj_eval = PipelineObjectiveEvaluate(objective=objective,
+                                         data_producer=data_producer,
+                                         eval_n_jobs=-1)
 
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    metrics = obj_eval.evaluate(pipeline).values
+    metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)}
+
+    return metrics
+
+
+def prepare_extractor_and_assessor(datasets_train: List[str]):
+    extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS)
     meta_features_train = extractor.extract(datasets_train, fill_input_nans=True)
     meta_features_train = meta_features_train.fillna(0)
     data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
         n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE))
     data_similarity_assessor.fit(meta_features_train, datasets_train)
+    return data_similarity_assessor, extractor
+
+
+def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None):
+    x, y = transform_data_for_fedot(data)
+
+    time_start = timeit.default_timer()
+    fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS)
+    fedot.fit(x, y)
+    automl_time = timeit.default_timer() - time_start
+
+    metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics)
+    pipeline = fedot.current_pipeline
+    run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
+                                      automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
+    return fedot, run_results
 
-    results_pre = []
+
+def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., automl_timeout_min=0.,
+                        **metrics):
+    run_results = dict(dataset_id=dataset.id,
+                       dataset_name=dataset.name,
+                       run_label=run_label,
+                       model_obj=pipeline,
+                       model_str=pipeline.descriptive_id,
+                       history_obj=history_obj,
+                       automl_time_sec=automl_time_sec,
+                       automl_timeout_min=automl_timeout_min,
+                       **metrics)
+    return run_results
+
+
+def main():
+    baseline_pipeline = PipelineBuilder().add_node('rf').build()
+
+    dataset_ids, datasets_cache = prepare_data()
+
+    datasets_train, datasets_test = \
+        train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
+
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
+
+    results = []
     best_models_per_dataset = {}
     for name in tqdm(datasets_train, 'Train datasets'):
         cache = datasets_cache[name]
         data = cache.from_cache()
 
-        fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS)
-        x, y = transform_data_for_fedot(data)
-        _, automl_time = timeit_decorator(fedot.fit)(x, y)
-        results_pre.append({'dataset': name,
-                            'model': fedot.current_pipeline.descriptive_id,
-                            'automl_time': automl_time})
+        fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+        results.append(run_results)
         # TODO:
         #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
         #   x Evaluate historical pipelines on the data instead of using fitness
+        #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
 
         # Filter out unique individuals with the best fitness
+        history = fedot.history
         best_individuals = sorted(chain(*fedot.history.individuals),
                                   key=lambda ind: ind.fitness,
                                   reverse=True)
         best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        # best_models = list(fedot.best_models) or []
         best_models = []
         for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
             pipeline = PipelineAdapter().restore(individual.graph)
@@ -119,55 +172,88 @@ def main():
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    results = []
     for name in tqdm(datasets_test, 'Test datasets'):
         cache = datasets_cache[name]
         data = cache.from_cache()
-        x, y = transform_data_for_fedot(data)
 
-        fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS)
-        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y)
-        fedot_naive.test_data = fedot_naive.train_data
-        fedot_naive.prediction = fedot_naive.train_data
+        # Run pure AutoML
+        fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
+        results.append(fedot_naive_results)
 
+        # Run meta AutoML
+        # 1
         time_start = timeit.default_timer()
         meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
         meta_features = meta_features.fillna(0)
+        meta_learning_time = timeit.default_timer() - time_start
         initial_assumptions = model_advisor.predict(meta_features)[0]
-        initial_assumptions = [model.predictor for model in initial_assumptions]
-        fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS)
-        fedot_meta.fit(x, y)
-        automl_time_meta = timeit.default_timer() - time_start
-        fedot_meta.test_data = fedot_meta.train_data
-        fedot_meta.prediction = fedot_meta.train_data
-
-        metrics_naive = fedot_naive.get_metrics()
-        metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
-        metrics_meta = fedot_meta.get_metrics()
-        metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()}
-
-        results.append({
-            'dataset': data.name,
-            'model_naive': fedot_naive.current_pipeline.descriptive_id,
-            'model_meta': fedot_meta.current_pipeline.descriptive_id,
-            'history_naive': fedot_naive.history,
-            'history_meta': fedot_meta.history,
-            'automl_time_naive': automl_time_naive,
-            'automl_time_meta': automl_time_meta,
-            **metrics_naive, **metrics_meta
-        })
-
-    time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".")
-    save_dir = Path(f'run_{time_now}')
+        assumption_pipelines = [model.predictor for model in initial_assumptions]
+        # 2
+        fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+                                                   initial_assumption=assumption_pipelines)
+        fedot_meta_results['meta_learning_time'] = meta_learning_time
+        results.append(fedot_meta_results)
+
+        # Fit & evaluate simple baseline
+        baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
+        baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
+                                           **baseline_metrics)
+        results.append(baseline_res)
+
+        # Fit & evaluate initial assumptions
+        for i, assumption in enumerate(initial_assumptions):
+            pipeline = assumption.predictor
+            assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics)
+            assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
+                                                 pipeline=assumption.predictor, **assumption_metrics)
+            results.append(assumption_res)
+
+    # Save the accumulated results
+    time_now = datetime.now().isoformat(timespec="minutes")
+    time_now_for_path = time_now.replace(":", ".")
+    save_dir = Path(f'run_{time_now_for_path}')
     save_dir.mkdir()
     history_dir = save_dir.joinpath('histories')
     history_dir.mkdir()
+    models_dir = save_dir.joinpath('models')
     for res in results:
-        dataset = res['dataset']
-        res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json'))
-        res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json'))
-    pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv'))
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv'))
+        res['run_date'] = time_now
+        dataset_name = res['dataset_name']
+        run_label = res['run_label']
+        # define saving paths
+        model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
+        history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
+        # replace objects with export paths for csv
+        res['model_path'] = str(model_path)
+        res.pop('model_obj').save(res['model_path'])
+        res['history_path'] = str(history_path)
+        history_obj = res.pop('history_obj')
+        if history_obj is not None:
+            history_obj.save(res['history_path'])
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv'))
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
+
+    # save experiment hyperparameters
+    params = {
+        'run_date': time_now,
+        'seed': SEED,
+        'n_datasets': N_DATASETS or len(dataset_ids),
+        'test_size': TEST_SIZE,
+        'dataset_ids': dataset_ids,
+        'dataset_names': list(datasets_cache.keys()),
+        'dataset_names_train': datasets_train,
+        'dataset_names_test': datasets_test,
+        'train_timeout': TRAIN_TIMEOUT,
+        'test_timeout': TEST_TIMEOUT,
+        'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,
+        'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE,
+        'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS,
+        'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE,
+        'common_fedot_params': COMMON_FEDOT_PARAMS,
+        'baseline_pipeline': baseline_pipeline.descriptive_id,
+    }
+    with open(save_dir.joinpath('parameters.json'), 'w') as params_file:
+        json.dump(params, params_file, indent=2)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index 4785a7e31c157dd88d70c0da7317bb52f92cfde0..106d90f8a77d6ca43c6b65218930c14f0f7924ec 100644
GIT binary patch
delta 90
zcmX@Ze1>^~kG}<jF+(B{TQZmeX+t2H!eGW=#9+dZ#*hr8jTy|rGDbjA3!uCS5SlX>
Z0C5VCXAabt3dCkWHI_h`q>Zx;83C?M4l4iv

delta 90
zcmX@Ze1>^~kAD(FGJ_d|34;Mc3J@DIqycFY22&tu1f<izY(t>91w$fGG?~F1ti}K+
YYXnqp2IN@+VIol60I0@b<7`7l0Lz9BW&i*H


From 8f29cf72de174fd810d35ced81df8ec1cf76ba70 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 8 Apr 2023 13:14:10 +0300
Subject: [PATCH 13/60] wrap & log exceptions; log progress to file

---
 experiments/fedot_warm_start/run.py | 193 ++++++++++++++++------------
 1 file changed, 111 insertions(+), 82 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 6e043d55..3e5a2a28 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,5 +1,6 @@
 import functools
 import json
+import logging
 import timeit
 from datetime import datetime
 from itertools import chain
@@ -14,6 +15,7 @@
 from fedot.core.pipelines.adapters import PipelineAdapter
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
 from fedot.core.validation.split import tabular_cv_generator
+from golem.core.log import Log
 from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
@@ -27,11 +29,11 @@
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
-N_DATASETS = None
-TEST_SIZE = 0.2
+N_DATASETS = 3
+TEST_SIZE = 0.33
 # Evaluation timeouts
-TRAIN_TIMEOUT = 5
-TEST_TIMEOUT = 5
+TRAIN_TIMEOUT = 0.5
+TEST_TIMEOUT = 0.5
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
@@ -42,12 +44,25 @@
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',
-    logging_level=50,
     n_jobs=-1,
     seed=SEED,
     show_progress=False,
 )
 
+# Setup logging
+time_now = datetime.now().isoformat(timespec="minutes")
+time_now_for_path = time_now.replace(":", ".")
+save_dir = Path(f'run_{time_now_for_path}')
+save_dir.mkdir()
+log_file = save_dir.joinpath('log.txt')
+Log(log_file=log_file)
+logging.basicConfig(filename=log_file,
+                    filemode='a',
+                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                    datefmt='%H:%M:%S',
+                    force=True,
+                    )
+
 
 def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
@@ -132,6 +147,19 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut
     return run_results
 
 
+def extract_best_history_models(dataset_cache, history):
+    best_individuals = sorted(chain(*history.individuals),
+                              key=lambda ind: ind.fitness,
+                              reverse=True)
+    best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
+    best_models = []
+    for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
+        pipeline = PipelineAdapter().restore(individual.graph)
+        model = Model(pipeline, individual.fitness, dataset_cache)
+        best_models.append(model)
+    return best_models
+
+
 def main():
     baseline_pipeline = PipelineBuilder().add_node('rf').build()
 
@@ -144,93 +172,91 @@ def main():
 
     results = []
     best_models_per_dataset = {}
-    for name in tqdm(datasets_train, 'Train datasets'):
-        cache = datasets_cache[name]
-        data = cache.from_cache()
-
-        fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
-        results.append(run_results)
-        # TODO:
-        #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
-        #   x Evaluate historical pipelines on the data instead of using fitness
-        #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
-
-        # Filter out unique individuals with the best fitness
-        history = fedot.history
-        best_individuals = sorted(chain(*fedot.history.individuals),
-                                  key=lambda ind: ind.fitness,
-                                  reverse=True)
-        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        best_models = []
-        for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
-            pipeline = PipelineAdapter().restore(individual.graph)
-            model = Model(pipeline, individual.fitness, cache)
-            best_models.append(model)
-        best_models_per_dataset[name] = best_models
+    progress_file = open(save_dir.joinpath('progress.txt'), 'a')
+    for name in tqdm(datasets_train, 'Train datasets', file=progress_file):
+        try:
+            cache = datasets_cache[name]
+            data = cache.from_cache()
+
+            fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+            results.append(run_results)
+            # TODO:
+            #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
+            #   x Evaluate historical pipelines on the data instead of using fitness
+            #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
+
+            # Filter out unique individuals with the best fitness
+            history = fedot.history
+            best_models = extract_best_history_models(cache, history)
+            best_models_per_dataset[name] = best_models
+        except Exception:
+            logging.exception(f'Train dataset "{name}"')
 
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'Test datasets'):
-        cache = datasets_cache[name]
-        data = cache.from_cache()
-
-        # Run pure AutoML
-        fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
-        results.append(fedot_naive_results)
-
-        # Run meta AutoML
-        # 1
-        time_start = timeit.default_timer()
-        meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
-        meta_features = meta_features.fillna(0)
-        meta_learning_time = timeit.default_timer() - time_start
-        initial_assumptions = model_advisor.predict(meta_features)[0]
-        assumption_pipelines = [model.predictor for model in initial_assumptions]
-        # 2
-        fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
-                                                   initial_assumption=assumption_pipelines)
-        fedot_meta_results['meta_learning_time'] = meta_learning_time
-        results.append(fedot_meta_results)
-
-        # Fit & evaluate simple baseline
-        baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
-        baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
-                                           **baseline_metrics)
-        results.append(baseline_res)
-
-        # Fit & evaluate initial assumptions
-        for i, assumption in enumerate(initial_assumptions):
-            pipeline = assumption.predictor
-            assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics)
-            assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
-                                                 pipeline=assumption.predictor, **assumption_metrics)
-            results.append(assumption_res)
+    for name in tqdm(datasets_test, 'Test datasets', file=progress_file):
+        try:
+            cache = datasets_cache[name]
+            data = cache.from_cache()
+
+            # Run pure AutoML
+            fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
+            results.append(fedot_naive_results)
+
+            # Run meta AutoML
+            # 1
+            time_start = timeit.default_timer()
+            meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
+            meta_features = meta_features.fillna(0)
+            meta_learning_time = timeit.default_timer() - time_start
+            initial_assumptions = model_advisor.predict(meta_features)[0]
+            assumption_pipelines = [model.predictor for model in initial_assumptions]
+            # 2
+            fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+                                                       initial_assumption=assumption_pipelines)
+            fedot_meta_results['meta_learning_time'] = meta_learning_time
+            results.append(fedot_meta_results)
+
+            # Fit & evaluate simple baseline
+            baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
+            baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
+                                               **baseline_metrics)
+            results.append(baseline_res)
+
+            # Fit & evaluate initial assumptions
+            for i, assumption in enumerate(initial_assumptions):
+                pipeline = assumption.predictor
+                assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, fedot_meta.metrics)
+                assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
+                                                     pipeline=pipeline, **assumption_metrics)
+                results.append(assumption_res)
+        except Exception:
+            logging.exception(f'Test dataset "{name}"')
 
     # Save the accumulated results
-    time_now = datetime.now().isoformat(timespec="minutes")
-    time_now_for_path = time_now.replace(":", ".")
-    save_dir = Path(f'run_{time_now_for_path}')
-    save_dir.mkdir()
     history_dir = save_dir.joinpath('histories')
     history_dir.mkdir()
     models_dir = save_dir.joinpath('models')
     for res in results:
-        res['run_date'] = time_now
-        dataset_name = res['dataset_name']
-        run_label = res['run_label']
-        # define saving paths
-        model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
-        history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
-        # replace objects with export paths for csv
-        res['model_path'] = str(model_path)
-        res.pop('model_obj').save(res['model_path'])
-        res['history_path'] = str(history_path)
-        history_obj = res.pop('history_obj')
-        if history_obj is not None:
-            history_obj.save(res['history_path'])
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv'))
+        try:
+            res['run_date'] = time_now
+            dataset_name = res['dataset_name']
+            run_label = res['run_label']
+            # define saving paths
+            model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
+            history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
+            # replace objects with export paths for csv
+            res['model_path'] = str(model_path)
+            res.pop('model_obj').save(res['model_path'])
+            res['history_path'] = str(history_path)
+            history_obj = res.pop('history_obj')
+            if history_obj is not None:
+                history_obj.save(res['history_path'])
+        except Exception:
+            logging.exception(f'Saving results "{res}"')
+
     pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
 
     # save experiment hyperparameters
@@ -257,4 +283,7 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except Exception:
+        logging.exception(f'Main level cached the error')

From 168a4dd68be5a47879bdeb942334e5c2490cc8bd Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 8 Apr 2023 13:14:20 +0300
Subject: [PATCH 14/60] update requirements.txt

---
 requirements.txt | Bin 460 -> 460 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 106d90f8a77d6ca43c6b65218930c14f0f7924ec..ad0a22332f176f2c866188116575624428ac1536 100644
GIT binary patch
delta 43
rcmX@Ze1>@g6C<Y)gC2tc2v6o_v}81z?8>MKB=Z?fSxv#BD;bpm#pwvS

delta 43
vcmX@Ze1>@g6C<Y~gC2t^5F1bCX0&89ob1Y|2_*9wO<7HVqK1=KGAaWA#)=5O


From 1ae8511ac28ecbe91dd4c54827e1e56dde11d915 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 8 Apr 2023 13:16:28 +0300
Subject: [PATCH 15/60] update timeouts

---
 experiments/fedot_warm_start/run.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 3e5a2a28..3113ad97 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -29,11 +29,11 @@
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
-N_DATASETS = 3
-TEST_SIZE = 0.33
+N_DATASETS = None
+TEST_SIZE = 0.2
 # Evaluation timeouts
-TRAIN_TIMEOUT = 0.5
-TEST_TIMEOUT = 0.5
+TRAIN_TIMEOUT = 15
+TEST_TIMEOUT = 15
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5

From cc71c4745077aadeec07af66b9b6feeb5a151d19 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:02:21 +0300
Subject: [PATCH 16/60] remove GOLEM from requirements.txt to inherit version
 required by FEDOT

---
 requirements.txt | Bin 460 -> 430 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ad0a22332f176f2c866188116575624428ac1536..eca13d853ca1f8e55c583bd3790a78a679ffee4d 100644
GIT binary patch
delta 7
OcmX@ZypDOpIz|8ti~`^Q

delta 38
pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006<B2fF|O


From 1e7be91b90792e6858c5cd49b2c952b86e7216be Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:04:02 +0300
Subject: [PATCH 17/60] clean openml cache

---
 .../datasets_loaders/openml_datasets_loader.py      | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
index f22bb0c6..01584c23 100644
--- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import shutil
+from pathlib import Path
 from typing import List, Union
 
 import openml
@@ -10,6 +12,12 @@
 OpenMLDatasetID = Union[str, int]
 
 
+def _clear_openml_cache():
+    cache_dir = openml.config.get_cache_directory()
+    cache_dir = Path(cache_dir)
+    shutil.rmtree(cache_dir)
+
+
 class OpenMLDatasetsLoader(DatasetsLoader):
 
     def __init__(self):
@@ -27,7 +35,10 @@ def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]:
         return datasets
 
     def load_single(self, source: OpenMLDatasetID):
-        return self.get_openml_dataset(source)
+        try:
+            return self.get_openml_dataset(source)
+        finally:
+            _clear_openml_cache()
 
     def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache:
         openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False)

From a10174ce62b5d2fe4e3ab85ee5e38546c6da4047 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:04:35 +0300
Subject: [PATCH 18/60] update Dockerfile

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index e17e17cd..7958082a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ WORKDIR $WORKDIR
 COPY . $WORKDIR
 
 RUN pip3 install pip && \
+    pip install wheel && \
     pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt
 
 ENV PYTHONPATH $WORKDIR

From a309eef64e86a559880aad5451c33b50143eab46 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 15:42:57 +0300
Subject: [PATCH 19/60] make experiment safer

---
 experiments/fedot_warm_start/run.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 3113ad97..66c80192 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -19,6 +19,7 @@
 from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
+from meta_automl.data_preparation.data_manager import DataManager
 from meta_automl.data_preparation.dataset import DatasetCache, Dataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
@@ -52,7 +53,8 @@
 # Setup logging
 time_now = datetime.now().isoformat(timespec="minutes")
 time_now_for_path = time_now.replace(":", ".")
-save_dir = Path(f'run_{time_now_for_path}')
+save_dir = DataManager.get_data_dir()\
+    .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start')
 save_dir.mkdir()
 log_file = save_dir.joinpath('log.txt')
 Log(log_file=log_file)
@@ -168,17 +170,16 @@ def main():
     datasets_train, datasets_test = \
         train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
-
     results = []
     best_models_per_dataset = {}
     progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for name in tqdm(datasets_train, 'Train datasets', file=progress_file):
+    for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
         try:
             cache = datasets_cache[name]
             data = cache.from_cache()
 
-            fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+            timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
+            fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
             results.append(run_results)
             # TODO:
             #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
@@ -192,19 +193,16 @@ def main():
         except Exception:
             logging.exception(f'Train dataset "{name}"')
 
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'Test datasets', file=progress_file):
+    for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file):
         try:
             cache = datasets_cache[name]
             data = cache.from_cache()
 
-            # Run pure AutoML
-            fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
-            results.append(fedot_naive_results)
-
             # Run meta AutoML
             # 1
             time_start = timeit.default_timer()
@@ -234,6 +232,7 @@ def main():
                 results.append(assumption_res)
         except Exception:
             logging.exception(f'Test dataset "{name}"')
+    progress_file.close()
 
     # Save the accumulated results
     history_dir = save_dir.joinpath('histories')

From 066cd3e6d110a5918010795b310f07599d985d7f Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 16:38:47 +0300
Subject: [PATCH 20/60] add .dockerignore

---
 .dockerignore | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..2bfa6863
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,13 @@
+# Config & info files
+.pep8speaks.yml
+Dockerfile
+LICENSE
+README.md
+
+# Unnecessary files
+examples
+notebooks
+test
+
+# User data
+data

From 69b4915f8620c1b6753fb1a058d6839f1fe374ab Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 17:39:14 +0300
Subject: [PATCH 21/60] fix save path

---
 experiments/fedot_warm_start/run.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 66c80192..9bf33ccb 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -4,7 +4,6 @@
 import timeit
 from datetime import datetime
 from itertools import chain
-from pathlib import Path
 from typing import Dict, List, Tuple
 
 import numpy as np
@@ -53,9 +52,9 @@
 # Setup logging
 time_now = datetime.now().isoformat(timespec="minutes")
 time_now_for_path = time_now.replace(":", ".")
-save_dir = DataManager.get_data_dir()\
-    .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start')
-save_dir.mkdir()
+save_dir = DataManager.get_data_dir().\
+    joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
+save_dir.mkdir(parents=True)
 log_file = save_dir.joinpath('log.txt')
 Log(log_file=log_file)
 logging.basicConfig(filename=log_file,

From b490f05b643cc856d9616f2f02c7700813d71cb1 Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Tue, 2 May 2023 03:43:03 +0300
Subject: [PATCH 22/60] Make code more reusable and improve its quality

---
 experiments/fedot_warm_start/__init__.py |   0
 experiments/fedot_warm_start/run.py      | 105 ++++++++++++++---------
 2 files changed, 66 insertions(+), 39 deletions(-)
 create mode 100644 experiments/fedot_warm_start/__init__.py

diff --git a/experiments/fedot_warm_start/__init__.py b/experiments/fedot_warm_start/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 3113ad97..4a9d8afd 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -26,6 +26,7 @@
 from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
+
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
@@ -49,30 +50,41 @@
     show_progress=False,
 )
 
-# Setup logging
-time_now = datetime.now().isoformat(timespec="minutes")
-time_now_for_path = time_now.replace(":", ".")
-save_dir = Path(f'run_{time_now_for_path}')
-save_dir.mkdir()
-log_file = save_dir.joinpath('log.txt')
-Log(log_file=log_file)
-logging.basicConfig(filename=log_file,
-                    filemode='a',
-                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
-                    datefmt='%H:%M:%S',
-                    force=True,
-                    )
-
-
-def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
-    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
+SAVE_DIR = None
+TIME_NOW = None
+TIME_NOW_FOR_PATH = None
+
+DEBUG = False
+
+
+def setup_logging():
+    global TIME_NOW
+    TIME_NOW = time_now = datetime.now().isoformat(timespec="minutes")
+    global TIME_NOW_FOR_PATH
+    TIME_NOW_FOR_PATH = time_now_for_path = time_now.replace(":", ".")
+    global SAVE_DIR
+    SAVE_DIR = save_dir = Path(f'run_{time_now_for_path}')
+    save_dir.mkdir()
+    log_file = save_dir.joinpath('log.txt')
+    Log(log_file=log_file)
+    logging.basicConfig(filename=log_file,
+                        filemode='a',
+                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                        datefmt='%H:%M:%S',
+                        force=True,
+                        )
+
 
+def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
+    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
     dataset_ids = openml.study.get_suite(99).data
     if N_DATASETS is not None:
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
-    dataset_ids = list(dataset_ids)
-    return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
+        dataset_ids = list(dataset_ids)
+
+    datasets = {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
+    return dataset_ids, datasets
 
 
 def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
@@ -92,6 +104,7 @@ def get_pipeline_metrics(pipeline,
     Returns:
         the values of quality metrics
     """
+    # print(str(metrics_obj))
     metrics = metrics_obj.metric_functions
     metric_names = metrics_obj.get_metric_names(metrics)
 
@@ -160,22 +173,34 @@ def extract_best_history_models(dataset_cache, history):
     return best_models
 
 
+def prepare_data() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]:
+    dataset_ids, datasets = fetch_openml_data()
+
+    train_data_names, test_data_names = train_test_split(
+        list(datasets.keys()),
+        test_size=TEST_SIZE,
+        random_state=SEED
+    )
+    return (dataset_ids, datasets), (train_data_names, test_data_names)
+
+
 def main():
     baseline_pipeline = PipelineBuilder().add_node('rf').build()
 
-    dataset_ids, datasets_cache = prepare_data()
+    ds_with_ids, dataset_names = prepare_data()
+
+    train_ds_names, test_ds_names = dataset_names
 
-    datasets_train, datasets_test = \
-        train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
+    ds_ids, datasets = ds_with_ids
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names)
 
     results = []
     best_models_per_dataset = {}
-    progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for name in tqdm(datasets_train, 'Train datasets', file=progress_file):
+    progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a')
+    for name in tqdm(train_ds_names, 'Train datasets', file=progress_file):
         try:
-            cache = datasets_cache[name]
+            cache = datasets[name]
             data = cache.from_cache()
 
             fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
@@ -196,9 +221,9 @@ def main():
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'Test datasets', file=progress_file):
+    for name in tqdm(test_ds_names, 'Test datasets', file=progress_file):
         try:
-            cache = datasets_cache[name]
+            cache = datasets[name]
             data = cache.from_cache()
 
             # Run pure AutoML
@@ -236,12 +261,12 @@ def main():
             logging.exception(f'Test dataset "{name}"')
 
     # Save the accumulated results
-    history_dir = save_dir.joinpath('histories')
+    history_dir = SAVE_DIR.joinpath('histories')
     history_dir.mkdir()
-    models_dir = save_dir.joinpath('models')
+    models_dir = SAVE_DIR.joinpath('models')
     for res in results:
         try:
-            res['run_date'] = time_now
+            res['run_date'] = TIME_NOW
             dataset_name = res['dataset_name']
             run_label = res['run_label']
             # define saving paths
@@ -257,18 +282,18 @@ def main():
         except Exception:
             logging.exception(f'Saving results "{res}"')
 
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
+    pd.DataFrame(results).to_csv(SAVE_DIR.joinpath(f'results_{TIME_NOW_FOR_PATH}.csv'))
 
     # save experiment hyperparameters
     params = {
-        'run_date': time_now,
+        'run_date': TIME_NOW,
         'seed': SEED,
-        'n_datasets': N_DATASETS or len(dataset_ids),
+        'n_datasets': N_DATASETS or len(ds_ids),
         'test_size': TEST_SIZE,
-        'dataset_ids': dataset_ids,
-        'dataset_names': list(datasets_cache.keys()),
-        'dataset_names_train': datasets_train,
-        'dataset_names_test': datasets_test,
+        'dataset_ids': ds_ids,
+        'dataset_names': list(dataset_names.keys()),
+        'dataset_names_train': train_ds_names,
+        'dataset_names_test': test_ds_names,
         'train_timeout': TRAIN_TIMEOUT,
         'test_timeout': TEST_TIMEOUT,
         'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,
@@ -278,12 +303,14 @@ def main():
         'common_fedot_params': COMMON_FEDOT_PARAMS,
         'baseline_pipeline': baseline_pipeline.descriptive_id,
     }
-    with open(save_dir.joinpath('parameters.json'), 'w') as params_file:
+    with open(SAVE_DIR.joinpath('parameters.json'), 'w') as params_file:
         json.dump(params, params_file, indent=2)
 
 
 if __name__ == "__main__":
     try:
+        if DEBUG:
+            setup_logging()
         main()
     except Exception:
         logging.exception(f'Main level cached the error')

From e7e4bf8b733fe8e562c6cc6923beffb4c7b34acb Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Tue, 2 May 2023 03:45:12 +0300
Subject: [PATCH 23/60] Adding auto-sklearn run script with an example

---
 experiments/auto-sklearn_run/__init__.py     |  0
 experiments/auto-sklearn_run/openml_suite.py | 91 ++++++++++++++++++++
 experiments/auto-sklearn_run/results.json    | 45 ++++++++++
 3 files changed, 136 insertions(+)
 create mode 100644 experiments/auto-sklearn_run/__init__.py
 create mode 100644 experiments/auto-sklearn_run/openml_suite.py
 create mode 100644 experiments/auto-sklearn_run/results.json

diff --git a/experiments/auto-sklearn_run/__init__.py b/experiments/auto-sklearn_run/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/experiments/auto-sklearn_run/openml_suite.py b/experiments/auto-sklearn_run/openml_suite.py
new file mode 100644
index 00000000..588d3b93
--- /dev/null
+++ b/experiments/auto-sklearn_run/openml_suite.py
@@ -0,0 +1,91 @@
+import pickle
+import re
+
+import numpy as np
+import json
+
+import autosklearn.classification
+from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing
+from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice
+from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
+from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice
+
+from experiments.fedot_warm_start.run import prepare_data
+from sklearn import model_selection, metrics
+from sklearn.base import ClassifierMixin
+
+
+class AutoSklearnEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, ClassifierChoice):
+            return repr(o.choice.estimator)
+        # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)):
+        #     return None
+        if isinstance(o, ClassifierMixin):
+            return re.sub(r'\s{2,}', ' ', repr(o))
+        elif isinstance(o, Balancing):
+            return repr(o)
+        elif isinstance(o, np.integer):
+            return int(o)
+        elif isinstance(o, np.floating):
+            return float(o)
+
+
+class AutoSklearnValidator:
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def main():
+        ds_with_ids, ds_names = prepare_data()
+        train_ds_names, test_ds_names = ds_names
+
+        ds_ids, datasets = ds_with_ids
+
+        for ds_name in train_ds_names:
+        # if train_ds_names[0] is not None:
+            print("Sanity check")
+            dataset = datasets[ds_name].from_cache()
+
+            # cannot wait longer because of the slow data fetching, issue#9
+            estimator = autosklearn.classification.AutoSklearnClassifier(
+                time_left_for_this_task=60
+            )
+
+            X_train, X_test, y_train, y_test = model_selection.train_test_split(
+                dataset.x,
+                dataset.y,
+                test_size=0.2,
+                random_state=42
+            )
+
+            pipeline = estimator.fit(X_train, y_train)
+
+            predictions = estimator.predict(X_test)
+
+            quality_estimation = metrics.roc_auc_score(y_test, predictions)
+
+            results = {
+                'ensemble': pipeline.show_models(),
+                'score': quality_estimation
+            }
+
+            # pickle.dump(pipeline.show_models(), open("results.pickle", "wb"))
+
+            # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier")))
+
+            with open("results.json", "w") as file:
+                json.dump(
+                    results,
+                    file,
+                    cls=AutoSklearnEncoder,
+                    indent=2
+                )
+
+if __name__ == '__main__':
+    AutoSklearnValidator.main()
+
+
+
+
diff --git a/experiments/auto-sklearn_run/results.json b/experiments/auto-sklearn_run/results.json
new file mode 100644
index 00000000..b4ce4cbf
--- /dev/null
+++ b/experiments/auto-sklearn_run/results.json
@@ -0,0 +1,45 @@
+{
+  "ensemble": {
+    "2": {
+      "model_id": 2,
+      "rank": 1,
+      "cost": 0.02008032128514059,
+      "ensemble_weight": 0.1,
+      "balancing": "Balancing(random_state=1)",
+      "sklearn_classifier": "RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
+    },
+    "6": {
+      "model_id": 6,
+      "rank": 2,
+      "cost": 0.04216867469879515,
+      "ensemble_weight": 0.02,
+      "balancing": "Balancing(random_state=1)",
+      "sklearn_classifier": "RandomForestClassifier(bootstrap=False, max_features=4, min_samples_leaf=4, min_samples_split=20, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
+    },
+    "7": {
+      "model_id": 7,
+      "rank": 3,
+      "cost": 0.025100401606425682,
+      "ensemble_weight": 0.08,
+      "balancing": "Balancing(random_state=1)",
+      "sklearn_classifier": "HistGradientBoostingClassifier(early_stopping=True, l2_regularization=5.759216242427118e-07, learning_rate=0.14515873247977112, loss='auto', max_iter=64, max_leaf_nodes=11, min_samples_leaf=1, n_iter_no_change=18, random_state=1, validation_fraction=0.06967552984405034, warm_start=True)"
+    },
+    "8": {
+      "model_id": 8,
+      "rank": 4,
+      "cost": 0.02208835341365467,
+      "ensemble_weight": 0.54,
+      "balancing": "Balancing(random_state=1, strategy='weighting')",
+      "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=2), learning_rate=1.7653851967971248, n_estimators=290, random_state=1)"
+    },
+    "11": {
+      "model_id": 11,
+      "rank": 5,
+      "cost": 0.017068273092369468,
+      "ensemble_weight": 0.26,
+      "balancing": "Balancing(random_state=1)",
+      "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=5), learning_rate=0.9772078202526538, n_estimators=418, random_state=1)"
+    }
+  },
+  "score": 0.9182632313000073
+}
\ No newline at end of file

From 7f74e70fb0b5f3409b7af98defb2f6b6ace11e9e Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 17:53:07 +0300
Subject: [PATCH 24/60] move to FEDOT 0.7.0

---
 requirements.txt | Bin 430 -> 460 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index eca13d853ca1f8e55c583bd3790a78a679ffee4d..ad0a22332f176f2c866188116575624428ac1536 100644
GIT binary patch
delta 38
pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006<B2fF|O

delta 7
OcmX@ZypDOpIz|8ti~`^Q


From 94e0afac04b46f9b7d350202ec356fc6b36f0c4a Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 18:07:22 +0300
Subject: [PATCH 25/60] create Dockerfile

---
 Dockerfile | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..e17e17cd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+# Download base image ubuntu 20.04
+FROM ubuntu:20.04
+
+# For apt to be noninteractive
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
+
+# Preseed tzdata, update package index, upgrade packages and install needed software
+RUN truncate -s0 /tmp/preseed.cfg; \
+    echo "tzdata tzdata/Areas select Europe" >> /tmp/preseed.cfg; \
+    echo "tzdata tzdata/Zones/Europe select Berlin" >> /tmp/preseed.cfg; \
+    debconf-set-selections /tmp/preseed.cfg && \
+    rm -f /etc/timezone /etc/localtime && \
+	apt-get update && \
+	apt-get install -y nano  && \
+	apt-get install -y mc && \
+    apt-get install -y python3.9 python3-pip && \
+	apt-get install -y git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set the workdir
+ENV WORKDIR /home/meta-automl-research
+WORKDIR $WORKDIR
+COPY . $WORKDIR
+
+RUN pip3 install pip && \
+    pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt
+
+ENV PYTHONPATH $WORKDIR

From d24247fd2512bfc580e7664c5a88bea926d6799c Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 26 Feb 2023 22:06:22 +0300
Subject: [PATCH 26/60] prepare experiment demo

---
 experiments/fedot_warm_start/run.py | 81 +++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 experiments/fedot_warm_start/run.py

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
new file mode 100644
index 00000000..f0342126
--- /dev/null
+++ b/experiments/fedot_warm_start/run.py
@@ -0,0 +1,81 @@
+import functools
+import timeit
+
+import openml
+import pandas as pd
+from fedot.api.main import Fedot
+from sklearn.model_selection import train_test_split
+
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+
+SEED = 42
+
+
+def prepare_data():
+    dataset_ids = pd.Series(openml.study.get_suite(99).data)
+    dataset_ids = dataset_ids.sample(n=15, random_state=SEED)
+    dataset_ids = list(dataset_ids)
+    return OpenMLDatasetsLoader().load(dataset_ids)
+
+
+def timeit_decorator(function):
+    @functools.wraps(function)
+    def wrapped(*args, **kwargs):
+        start_time = timeit.default_timer()
+        res = function(*args, **kwargs)
+        time = timeit.default_timer() - start_time
+        return res, time
+
+    return wrapped
+
+
+def main():
+    datasets_cache = prepare_data()
+    datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED)
+
+    # TODO:
+    #  - Extract meta-features for train datasets
+    #  - Fit 'DatasetsSimilarityAssessor'
+
+    results_pre = []
+    for cache in datasets_train:
+        data = cache.from_cache()
+        fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED)
+        _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y)
+        results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time})
+
+    # TODO:
+    #  - Prepare 'ModelAdvisor'
+
+    results = []
+    for cache in datasets_test:
+        data = cache.from_cache()
+        fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED)
+        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y)
+
+        time_start = timeit.default_timer()
+        # TODO:
+        #  - Extract meta-features for current test dataset
+        #  - Get suitable assumptions from 'ModelAdvisor'
+        initial_assumption = ...
+        fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption)
+        automl_time_meta = timeit.default_timer() - time_start
+
+        metrics_naive = fedot_naive.get_metrics()
+        metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
+        metrics_meta = fedot_meta.get_metrics()
+        metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()}
+
+        results.append({
+            'dataset': data.name,
+            'model_naive': fedot_naive,
+            'model_meta': fedot_meta,
+            'automl_time_naive': automl_time_naive,
+            'automl_time_meta': automl_time_meta,
+            **metrics_naive, **metrics_meta
+        })
+
+
+if __name__ == "__main__":
+    main()

From c4b3f9173ac8eca24e61e6d299467286de170854 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 3 Mar 2023 17:40:35 +0300
Subject: [PATCH 27/60] fix similarity assessors

---
 .../select_similar_datasets_by_knn.py         |  6 +-
 .../advise_models_from_similar_datasets.py    |  4 +-
 .../datasets_similarity_assessors/__init__.py |  2 +-
 .../model_based_similarity_assessors.py       | 51 ++++++++++++++++
 .../predict_proba_similarity_assessors.py     | 59 -------------------
 5 files changed, 57 insertions(+), 65 deletions(-)
 create mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
 delete mode 100644 meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py

diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
index dc1c190c..b6f2bb8c 100644
--- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
+++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
@@ -2,7 +2,7 @@
 
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 
 
 def main():
@@ -16,10 +16,10 @@ def main():
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
     x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
     y_train = x_train.index
-    assessor = KNNSimilarityAssessor({'n_neighbors': 1}, n_best=2)
+    assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3)
     assessor.fit(x_train, y_train)
     # Get models for the best fitting datasets from train.
-    return x_test.index, assessor.predict(x_test)
+    return x_test.index, assessor.predict(x_test, return_distance=True)
 
 
 if __name__ == '__main__':
diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
index 993ac04a..37c3b2db 100644
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py
+++ b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -6,7 +6,7 @@
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNNSimilarityAssessor
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
 
@@ -21,7 +21,7 @@ def main():
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
     x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
     y_train = x_train.index
-    assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2)
+    assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2)
     assessor.fit(x_train, y_train)
     # Define best models for datasets.
     best_pipelines = [
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
index 621a68e0..0c33e2c4 100644
--- a/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
+++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/__init__.py
@@ -1,2 +1,2 @@
 from .datasets_similarity_assessor import DatasetsSimilarityAssessor
-from .predict_proba_similarity_assessors import KNNSimilarityAssessor, PredictProbaSimilarityAssessor
+from .model_based_similarity_assessors import KNeighborsBasedSimilarityAssessor, ModelBasedSimilarityAssessor
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
new file mode 100644
index 00000000..09720a1e
--- /dev/null
+++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
@@ -0,0 +1,51 @@
+from abc import ABC
+from typing import Optional, Dict, Any, List, Iterable
+
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import NearestNeighbors
+
+from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \
+    DatasetsSimilarityAssessor
+
+
+class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor):
+    def __init__(self, model, n_best: int = 1):
+        self._inner_model = model
+        self.n_best = n_best
+        self._datasets: Optional[Iterable[str]] = None
+
+
+class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor):
+    def __init__(self, n_neighbors: int = 1, **model_params):
+        model = NearestNeighbors(n_neighbors=n_neighbors, **model_params)
+        super().__init__(model, n_neighbors)
+
+    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
+        meta_features = self.preprocess_meta_features(meta_features)
+        self._datasets = np.array(datasets)
+        self._inner_model.fit(meta_features)
+
+    @staticmethod
+    def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame:
+        return meta_features.dropna(axis=1, how='any')
+
+    def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]:
+        dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance)
+        if return_distance:
+            distances, dataset_indexes = dataset_indexes
+            dataset_names = np.take(self._datasets, dataset_indexes, axis=0)
+            return distances, dataset_names
+        else:
+            return np.take(self._datasets, dataset_indexes, axis=0)
+
+    @property
+    def datasets(self) -> Optional[Iterable[str]]:
+        return self._datasets
+
+    @property
+    def feature_names(self) -> List[str]:
+        return self._inner_model.feature_names_in_
+
+    def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame:
+        return meta_features[self.feature_names]
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py
deleted file mode 100644
index 8254c745..00000000
--- a/meta_automl/meta_algorithm/datasets_similarity_assessors/predict_proba_similarity_assessors.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from typing import Optional, Dict, Any, List, Iterable
-
-import numpy as np
-import pandas as pd
-from sklearn.neighbors import KNeighborsClassifier
-
-from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \
-    DatasetsSimilarityAssessor
-
-
-class PredictProbaSimilarityAssessor(DatasetsSimilarityAssessor):
-    def __init__(self, model, n_best: int = 1):
-        self._inner_model = model
-        self.n_best = n_best
-
-    @property
-    def datasets(self) -> List[str]:
-        return self._inner_model.classes_
-
-    @property
-    def feature_names(self) -> List[str]:
-        return self._inner_model.feature_names_in_
-
-    @staticmethod
-    def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame:
-        return meta_features.dropna(axis=1, how='any')
-
-    def _preprocess_predict_features(self, meta_features: pd.DataFrame) -> pd.DataFrame:
-        return meta_features[self.feature_names]
-
-    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
-        meta_features = self.preprocess_meta_features(meta_features)
-        self._inner_model.fit(meta_features, datasets)
-
-    def predict_proba(self, meta_features: pd.DataFrame) -> List[List[float]]:
-        return self._inner_model.predict_proba(meta_features)
-
-    def predict(self, meta_features: pd.DataFrame) -> List[List[str]]:
-        meta_features = self._preprocess_predict_features(meta_features)
-        predict_probs = self.predict_proba(meta_features)
-        final_prediction = []
-        for probabilities in predict_probs:
-            probabilities = list(probabilities)
-            predictions = []
-            for _ in range(self.n_best):
-                predicted_class_idx = np.argmax(probabilities)
-                predicted_class = self.datasets[predicted_class_idx]
-                predictions.append(predicted_class)
-                probabilities.pop(predicted_class_idx)
-            final_prediction.append(predictions)
-
-        return final_prediction
-
-
-class KNNSimilarityAssessor(PredictProbaSimilarityAssessor):
-    def __init__(self, model_params: Optional[Dict[str, Any]] = None, n_best: int = 1):
-        model_params = model_params or dict()
-        model = KNeighborsClassifier(**model_params)
-        super().__init__(model, n_best)

From e0661f35579cbce48ef3f3f20b4c025de3c48ded Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 4 Mar 2023 13:32:47 +0300
Subject: [PATCH 28/60] allow PymfeExtractor fill values with median

---
 .../meta_features_extractors/pymfe_extractor.py   | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 2848998e..3a379f6f 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -25,7 +25,7 @@ def datasets_loader(self) -> DatasetsLoader:
             raise ValueError("Datasets loader not provided!")
         return self._datasets_loader
 
-    def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame:
+    def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame:
         meta_features = {}
         meta_feature_names = self._extractor.extract_metafeature_names()
         load_dataset = self.datasets_loader.cache_to_memory
@@ -37,10 +37,21 @@ def extract(self, datasets: List[Union[DatasetCache, str]]) -> pd.DataFrame:
             else:
                 loaded_dataset = load_dataset(dataset)
                 cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val]
-                mfe = self._extractor.fit(loaded_dataset.x, loaded_dataset.y, cat_cols=cat_cols)
+                x = loaded_dataset.x
+                y = loaded_dataset.y
+                if fill_nans:
+                    x = self.fill_nans(x)
+                mfe = self._extractor.fit(x, y, cat_cols=cat_cols)
                 feature_names, dataset_features = mfe.extract(out_type=tuple)
                 mfs = dict(zip(feature_names, dataset_features))
                 self._update_meta_features_cache(dataset.name, mfs)
                 meta_features[dataset.name] = mfs
         meta_features = pd.DataFrame.from_dict(meta_features, orient='index')
         return meta_features
+
+    @staticmethod
+    def fill_nans(x):
+        if not isinstance(x, pd.DataFrame):
+            x = pd.DataFrame(x)
+        x = x.fillna(x.median())
+        return x.to_numpy()

From 4f10b0386be01ec79d84033399852b09b88ab334 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 24 Mar 2023 14:10:48 +0300
Subject: [PATCH 29/60] use FEDOT version with fixed initial assumptions

---
 requirements.txt | Bin 460 -> 310 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ad0a22332f176f2c866188116575624428ac1536..4b8e1290af910b183fc689ad8e6fdcccc56d827e 100644
GIT binary patch
delta 34
mcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MVuOwAWf=joy$G8C

delta 185
zcmXAh%?`m(5QRVb6c$#N66N;lUm~%<*2W8{Rx8o8Nw2kMvhft22}flzGdbrwGtc46
zyMNv#7aUV6O-D;dim&?n6*n?woM@=!9+j@8uD$QGW6Op;2iC*{;sFUu?S*Deh{1B!
og%OE*V&ul+7CV>q>s&I@VWAEcN(3_|(xqyp2Zb^X?lBq<zU-DCy8r+H


From a78be30421c7bd1eb25011ec12ac153206872f50 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:47:06 +0300
Subject: [PATCH 30/60] optional cache usage for MFE extractor

---
 .../meta_features_extractors/pymfe_extractor.py      | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 3a379f6f..36cb9d45 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -25,26 +25,30 @@ def datasets_loader(self) -> DatasetsLoader:
             raise ValueError("Datasets loader not provided!")
         return self._datasets_loader
 
-    def extract(self, datasets: List[Union[DatasetCache, str]], fill_nans: bool = False) -> pd.DataFrame:
+    def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False,
+                use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame:
         meta_features = {}
         meta_feature_names = self._extractor.extract_metafeature_names()
         load_dataset = self.datasets_loader.cache_to_memory
         for dataset in datasets:
             if isinstance(dataset, str):
                 dataset = DatasetCache(dataset)
-            if mfs := self._get_meta_features_cache(dataset.name, meta_feature_names):
+
+            if (use_cached and
+                    (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))):
                 meta_features[dataset.name] = mfs
             else:
                 loaded_dataset = load_dataset(dataset)
                 cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val]
                 x = loaded_dataset.x
                 y = loaded_dataset.y
-                if fill_nans:
+                if fill_input_nans:
                     x = self.fill_nans(x)
                 mfe = self._extractor.fit(x, y, cat_cols=cat_cols)
                 feature_names, dataset_features = mfe.extract(out_type=tuple)
                 mfs = dict(zip(feature_names, dataset_features))
-                self._update_meta_features_cache(dataset.name, mfs)
+                if update_cached:
+                    self._update_meta_features_cache(dataset.name, mfs)
                 meta_features[dataset.name] = mfs
         meta_features = pd.DataFrame.from_dict(meta_features, orient='index')
         return meta_features

From 9bf6d9782a3b65af9d7754b37c328bcbea0d48ea Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:47:54 +0300
Subject: [PATCH 31/60] allow to advise only the n best models

---
 .../model_advisors/diverse_fedot_pipeline_advisor.py      | 8 +++++++-
 .../meta_algorithm/model_advisors/model_advisor.py        | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
index 15ef1f57..6f7e4a66 100644
--- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
@@ -1,4 +1,4 @@
-from typing import Callable, List, Iterable
+from typing import Callable, List, Iterable, Optional
 
 from fedot.core.pipelines.pipeline import Pipeline
 from golem.core.dag.linked_graph import get_distance_between
@@ -11,10 +11,12 @@
 class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor):
     def __init__(self,
                  fitted_similarity_assessor: DatasetsSimilarityAssessor,
+                 n_best_to_advise: Optional[int] = None,
                  minimal_distance: int = 1,
                  distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between):
         super().__init__(fitted_similarity_assessor)
         self.minimal_distance = minimal_distance
+        self.n_best_to_advise = n_best_to_advise
         self.distance_func = distance_func
 
     def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]:
@@ -24,4 +26,8 @@ def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]:
         for model in dataset_advice[1:]:
             if self.distance_func(first_model.predictor, model.predictor) > self.minimal_distance:
                 diverse_dataset_advice.append(model)
+
+        if self.n_best_to_advise is not None:
+            diverse_dataset_advice = list(sorted(diverse_dataset_advice, key=lambda m: m.fitness, reverse=True))
+            diverse_dataset_advice = diverse_dataset_advice[:self.n_best_to_advise]
         return diverse_dataset_advice
diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
index b585bf27..a9ca0d97 100644
--- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Dict, Iterable
+from typing import List, Dict, Iterable, Optional
 
 import pandas as pd
 

From fdee481fbd3ad0d42e19ff4f3fe08e330295c372 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 30 Mar 2023 13:58:49 +0300
Subject: [PATCH 32/60] finalize experiment

---
 experiments/fedot_warm_start/run.py | 145 +++++++++++++++++++++++-----
 1 file changed, 119 insertions(+), 26 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index f0342126..be5f45f7 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,22 +1,58 @@
 import functools
 import timeit
+from datetime import datetime
+from itertools import chain
+from pathlib import Path
+from typing import Dict
 
+import numpy as np
 import openml
 import pandas as pd
 from fedot.api.main import Fedot
+from fedot.core.pipelines.adapters import PipelineAdapter
 from sklearn.model_selection import train_test_split
+from tqdm import tqdm
 
+from meta_automl.data_preparation.dataset import DatasetCache, Dataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+from meta_automl.data_preparation.model import Model
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
+# Meta-alg hyperparameters
 SEED = 42
-
-
-def prepare_data():
-    dataset_ids = pd.Series(openml.study.get_suite(99).data)
-    dataset_ids = dataset_ids.sample(n=15, random_state=SEED)
+# Datasets sampling
+N_DATASETS = None
+TEST_SIZE = 0.33
+# Evaluation timeouts
+TRAIN_TIMEOUT = 15
+TEST_TIMEOUT = 10
+# Models & datasets
+N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
+N_CLOSEST_DATASETS_TO_PROPOSE = 5
+MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1
+N_BEST_MODELS_TO_ADVISE = 5
+
+
+COMMON_FEDOT_PARAMS = dict(
+    problem='classification',
+    with_tuning=False,
+    logging_level=50,
+    n_jobs=-1,
+    seed=SEED,
+)
+
+
+def prepare_data() -> Dict[str, DatasetCache]:
+    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
+
+    dataset_ids = openml.study.get_suite(99).data
+    if N_DATASETS is not None:
+        dataset_ids = pd.Series(dataset_ids)
+        dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
     dataset_ids = list(dataset_ids)
-    return OpenMLDatasetsLoader().load(dataset_ids)
+    return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
 
 
 def timeit_decorator(function):
@@ -30,37 +66,80 @@ def wrapped(*args, **kwargs):
     return wrapped
 
 
+def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
+    x = data.x
+    y = data.y
+    if len(y.shape) == 1:
+        y = y.reshape(-1, 1)
+    return x, y
+
+
 def main():
     datasets_cache = prepare_data()
-    datasets_train, datasets_test = train_test_split(datasets_cache, test_size=0.33, random_state=SEED)
+    datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()),
+                                                     test_size=TEST_SIZE, random_state=SEED)
 
-    # TODO:
-    #  - Extract meta-features for train datasets
-    #  - Fit 'DatasetsSimilarityAssessor'
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features_train = extractor.extract(datasets_train, fill_input_nans=True)
+    meta_features_train = meta_features_train.fillna(0)
+    data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
+        n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE))
+    data_similarity_assessor.fit(meta_features_train, datasets_train)
 
     results_pre = []
-    for cache in datasets_train:
+    best_models_per_dataset = {}
+    for name in tqdm(datasets_train, 'Train datasets'):
+        cache = datasets_cache[name]
         data = cache.from_cache()
-        fedot = Fedot('classification', timeout=15, n_jobs=-1, seed=SEED)
-        _, automl_time = timeit_decorator(fedot.fit)(data.x, data.y)
-        results_pre.append({'dataset': data.name, 'model': fedot, 'automl_time': automl_time})
 
-    # TODO:
-    #  - Prepare 'ModelAdvisor'
+        fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS)
+        x, y = transform_data_for_fedot(data)
+        _, automl_time = timeit_decorator(fedot.fit)(x, y)
+        results_pre.append({'dataset': name,
+                            'model': fedot.current_pipeline.descriptive_id,
+                            'automl_time': automl_time})
+        # TODO:
+        #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
+        #   x Evaluate historical pipelines on the data instead of using fitness
+
+        # Filter out unique individuals with the best fitness
+        best_individuals = sorted(chain(*fedot.history.individuals),
+                                  key=lambda ind: ind.fitness,
+                                  reverse=True)
+        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
+        # best_models = list(fedot.best_models) or []
+        best_models = []
+        for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
+            pipeline = PipelineAdapter().restore(individual.graph)
+            model = Model(pipeline, individual.fitness, cache)
+            best_models.append(model)
+        best_models_per_dataset[name] = best_models
+
+    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
+                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
+    model_advisor.fit(best_models_per_dataset)
 
     results = []
-    for cache in datasets_test:
+    for name in tqdm(datasets_test, 'Test datasets'):
+        cache = datasets_cache[name]
         data = cache.from_cache()
-        fedot_naive = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED)
-        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(data.x, data.y)
+        x, y = transform_data_for_fedot(data)
+
+        fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS)
+        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y)
+        fedot_naive.test_data = fedot_naive.train_data
+        fedot_naive.prediction = fedot_naive.train_data
 
         time_start = timeit.default_timer()
-        # TODO:
-        #  - Extract meta-features for current test dataset
-        #  - Get suitable assumptions from 'ModelAdvisor'
-        initial_assumption = ...
-        fedot_meta = Fedot('classification', timeout=5, n_jobs=-1, seed=SEED, initial_assumption=initial_assumption)
+        meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
+        meta_features = meta_features.fillna(0)
+        initial_assumptions = model_advisor.predict(meta_features)[0]
+        initial_assumptions = [model.predictor for model in initial_assumptions]
+        fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS)
+        fedot_meta.fit(x, y)
         automl_time_meta = timeit.default_timer() - time_start
+        fedot_meta.test_data = fedot_meta.train_data
+        fedot_meta.prediction = fedot_meta.train_data
 
         metrics_naive = fedot_naive.get_metrics()
         metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
@@ -69,13 +148,27 @@ def main():
 
         results.append({
             'dataset': data.name,
-            'model_naive': fedot_naive,
-            'model_meta': fedot_meta,
+            'model_naive': fedot_naive.current_pipeline.descriptive_id,
+            'model_meta': fedot_meta.current_pipeline.descriptive_id,
+            'history_naive': fedot_naive.history,
+            'history_meta': fedot_meta.history,
             'automl_time_naive': automl_time_naive,
             'automl_time_meta': automl_time_meta,
             **metrics_naive, **metrics_meta
         })
 
+    time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".")
+    save_dir = Path(f'run_{time_now}')
+    save_dir.mkdir()
+    history_dir = save_dir.joinpath('histories')
+    history_dir.mkdir()
+    for res in results:
+        dataset = res['dataset']
+        res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json'))
+        res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json'))
+    pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv'))
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv'))
+
 
 if __name__ == "__main__":
     main()

From 169ab3ef409aa8580776625ae955788de73a4fc2 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 7 Apr 2023 21:08:49 +0300
Subject: [PATCH 33/60] finalize experiment [2]

---
 experiments/fedot_warm_start/run.py | 224 +++++++++++++++++++---------
 requirements.txt                    | Bin 310 -> 460 bytes
 2 files changed, 155 insertions(+), 69 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index be5f45f7..6e043d55 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,16 +1,20 @@
 import functools
+import json
 import timeit
 from datetime import datetime
 from itertools import chain
 from pathlib import Path
-from typing import Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import openml
 import pandas as pd
 from fedot.api.main import Fedot
+from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate
 from fedot.core.pipelines.adapters import PipelineAdapter
-from sklearn.model_selection import train_test_split
+from fedot.core.pipelines.pipeline_builder import PipelineBuilder
+from fedot.core.validation.split import tabular_cv_generator
+from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
 from meta_automl.data_preparation.dataset import DatasetCache, Dataset
@@ -24,27 +28,28 @@
 SEED = 42
 # Datasets sampling
 N_DATASETS = None
-TEST_SIZE = 0.33
+TEST_SIZE = 0.2
 # Evaluation timeouts
-TRAIN_TIMEOUT = 15
-TEST_TIMEOUT = 10
+TRAIN_TIMEOUT = 5
+TEST_TIMEOUT = 5
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
 MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1
 N_BEST_MODELS_TO_ADVISE = 5
-
+# Meta-features
+MF_EXTRACTOR_PARAMS = {'groups': 'general'}
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',
-    with_tuning=False,
     logging_level=50,
     n_jobs=-1,
     seed=SEED,
+    show_progress=False,
 )
 
 
-def prepare_data() -> Dict[str, DatasetCache]:
+def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
 
     dataset_ids = openml.study.get_suite(99).data
@@ -52,18 +57,7 @@ def prepare_data() -> Dict[str, DatasetCache]:
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
     dataset_ids = list(dataset_ids)
-    return {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
-
-
-def timeit_decorator(function):
-    @functools.wraps(function)
-    def wrapped(*args, **kwargs):
-        start_time = timeit.default_timer()
-        res = function(*args, **kwargs)
-        time = timeit.default_timer() - start_time
-        return res, time
-
-    return wrapped
+    return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
 
 
 def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
@@ -74,40 +68,99 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
     return x, y
 
 
-def main():
-    datasets_cache = prepare_data()
-    datasets_train, datasets_test = train_test_split(list(datasets_cache.keys()),
-                                                     test_size=TEST_SIZE, random_state=SEED)
+def get_pipeline_metrics(pipeline,
+                         input_data,
+                         metrics_obj) -> dict:
+    """Gets quality metrics for the fitted pipeline.
+    The function is based on `Fedot.get_metrics()`
+
+    Returns:
+        the values of quality metrics
+    """
+    metrics = metrics_obj.metric_functions
+    metric_names = metrics_obj.get_metric_names(metrics)
+
+    data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold)
+
+    objective = MetricsObjective(metrics)
+    obj_eval = PipelineObjectiveEvaluate(objective=objective,
+                                         data_producer=data_producer,
+                                         eval_n_jobs=-1)
 
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    metrics = obj_eval.evaluate(pipeline).values
+    metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)}
+
+    return metrics
+
+
+def prepare_extractor_and_assessor(datasets_train: List[str]):
+    extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS)
     meta_features_train = extractor.extract(datasets_train, fill_input_nans=True)
     meta_features_train = meta_features_train.fillna(0)
     data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
         n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE))
     data_similarity_assessor.fit(meta_features_train, datasets_train)
+    return data_similarity_assessor, extractor
+
+
+def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None):
+    x, y = transform_data_for_fedot(data)
+
+    time_start = timeit.default_timer()
+    fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS)
+    fedot.fit(x, y)
+    automl_time = timeit.default_timer() - time_start
+
+    metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics)
+    pipeline = fedot.current_pipeline
+    run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
+                                      automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
+    return fedot, run_results
 
-    results_pre = []
+
+def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., automl_timeout_min=0.,
+                        **metrics):
+    run_results = dict(dataset_id=dataset.id,
+                       dataset_name=dataset.name,
+                       run_label=run_label,
+                       model_obj=pipeline,
+                       model_str=pipeline.descriptive_id,
+                       history_obj=history_obj,
+                       automl_time_sec=automl_time_sec,
+                       automl_timeout_min=automl_timeout_min,
+                       **metrics)
+    return run_results
+
+
+def main():
+    baseline_pipeline = PipelineBuilder().add_node('rf').build()
+
+    dataset_ids, datasets_cache = prepare_data()
+
+    datasets_train, datasets_test = \
+        train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
+
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
+
+    results = []
     best_models_per_dataset = {}
     for name in tqdm(datasets_train, 'Train datasets'):
         cache = datasets_cache[name]
         data = cache.from_cache()
 
-        fedot = Fedot(timeout=TRAIN_TIMEOUT, **COMMON_FEDOT_PARAMS)
-        x, y = transform_data_for_fedot(data)
-        _, automl_time = timeit_decorator(fedot.fit)(x, y)
-        results_pre.append({'dataset': name,
-                            'model': fedot.current_pipeline.descriptive_id,
-                            'automl_time': automl_time})
+        fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+        results.append(run_results)
         # TODO:
         #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
         #   x Evaluate historical pipelines on the data instead of using fitness
+        #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
 
         # Filter out unique individuals with the best fitness
+        history = fedot.history
         best_individuals = sorted(chain(*fedot.history.individuals),
                                   key=lambda ind: ind.fitness,
                                   reverse=True)
         best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        # best_models = list(fedot.best_models) or []
         best_models = []
         for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
             pipeline = PipelineAdapter().restore(individual.graph)
@@ -119,55 +172,88 @@ def main():
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    results = []
     for name in tqdm(datasets_test, 'Test datasets'):
         cache = datasets_cache[name]
         data = cache.from_cache()
-        x, y = transform_data_for_fedot(data)
 
-        fedot_naive = Fedot(timeout=TEST_TIMEOUT, **COMMON_FEDOT_PARAMS)
-        _, automl_time_naive = timeit_decorator(fedot_naive.fit)(x, y)
-        fedot_naive.test_data = fedot_naive.train_data
-        fedot_naive.prediction = fedot_naive.train_data
+        # Run pure AutoML
+        fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
+        results.append(fedot_naive_results)
 
+        # Run meta AutoML
+        # 1
         time_start = timeit.default_timer()
         meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
         meta_features = meta_features.fillna(0)
+        meta_learning_time = timeit.default_timer() - time_start
         initial_assumptions = model_advisor.predict(meta_features)[0]
-        initial_assumptions = [model.predictor for model in initial_assumptions]
-        fedot_meta = Fedot(timeout=TEST_TIMEOUT, initial_assumption=initial_assumptions, **COMMON_FEDOT_PARAMS)
-        fedot_meta.fit(x, y)
-        automl_time_meta = timeit.default_timer() - time_start
-        fedot_meta.test_data = fedot_meta.train_data
-        fedot_meta.prediction = fedot_meta.train_data
-
-        metrics_naive = fedot_naive.get_metrics()
-        metrics_naive = {f'{key}_naive': val for key, val in metrics_naive.items()}
-        metrics_meta = fedot_meta.get_metrics()
-        metrics_meta = {f'{key}_meta': val for key, val in metrics_meta.items()}
-
-        results.append({
-            'dataset': data.name,
-            'model_naive': fedot_naive.current_pipeline.descriptive_id,
-            'model_meta': fedot_meta.current_pipeline.descriptive_id,
-            'history_naive': fedot_naive.history,
-            'history_meta': fedot_meta.history,
-            'automl_time_naive': automl_time_naive,
-            'automl_time_meta': automl_time_meta,
-            **metrics_naive, **metrics_meta
-        })
-
-    time_now = datetime.now().isoformat(timespec="minutes").replace(":", ".")
-    save_dir = Path(f'run_{time_now}')
+        assumption_pipelines = [model.predictor for model in initial_assumptions]
+        # 2
+        fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+                                                   initial_assumption=assumption_pipelines)
+        fedot_meta_results['meta_learning_time'] = meta_learning_time
+        results.append(fedot_meta_results)
+
+        # Fit & evaluate simple baseline
+        baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
+        baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
+                                           **baseline_metrics)
+        results.append(baseline_res)
+
+        # Fit & evaluate initial assumptions
+        for i, assumption in enumerate(initial_assumptions):
+            pipeline = assumption.predictor
+            assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics)
+            assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
+                                                 pipeline=assumption.predictor, **assumption_metrics)
+            results.append(assumption_res)
+
+    # Save the accumulated results
+    time_now = datetime.now().isoformat(timespec="minutes")
+    time_now_for_path = time_now.replace(":", ".")
+    save_dir = Path(f'run_{time_now_for_path}')
     save_dir.mkdir()
     history_dir = save_dir.joinpath('histories')
     history_dir.mkdir()
+    models_dir = save_dir.joinpath('models')
     for res in results:
-        dataset = res['dataset']
-        res.pop('history_naive').save(history_dir.joinpath(f'{dataset}_history_naive.json'))
-        res.pop('history_meta').save(history_dir.joinpath(f'{dataset}_history_meta.json'))
-    pd.DataFrame(results_pre).to_csv(save_dir.joinpath(f'results_pre_{time_now}.csv'))
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now}.csv'))
+        res['run_date'] = time_now
+        dataset_name = res['dataset_name']
+        run_label = res['run_label']
+        # define saving paths
+        model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
+        history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
+        # replace objects with export paths for csv
+        res['model_path'] = str(model_path)
+        res.pop('model_obj').save(res['model_path'])
+        res['history_path'] = str(history_path)
+        history_obj = res.pop('history_obj')
+        if history_obj is not None:
+            history_obj.save(res['history_path'])
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv'))
+    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
+
+    # save experiment hyperparameters
+    params = {
+        'run_date': time_now,
+        'seed': SEED,
+        'n_datasets': N_DATASETS or len(dataset_ids),
+        'test_size': TEST_SIZE,
+        'dataset_ids': dataset_ids,
+        'dataset_names': list(datasets_cache.keys()),
+        'dataset_names_train': datasets_train,
+        'dataset_names_test': datasets_test,
+        'train_timeout': TRAIN_TIMEOUT,
+        'test_timeout': TEST_TIMEOUT,
+        'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,
+        'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE,
+        'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS,
+        'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE,
+        'common_fedot_params': COMMON_FEDOT_PARAMS,
+        'baseline_pipeline': baseline_pipeline.descriptive_id,
+    }
+    with open(save_dir.joinpath('parameters.json'), 'w') as params_file:
+        json.dump(params, params_file, indent=2)
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
index 4b8e1290af910b183fc689ad8e6fdcccc56d827e..ad0a22332f176f2c866188116575624428ac1536 100644
GIT binary patch
delta 185
zcmXAh%?`m(5QRVb6c$#N66N;lUm~%<*2W8{Rx8o8Nw2kMvhft22}flzGdbrwGtc46
zyMNv#7aUV6O-D;dim&?n6*n?woM@=!9+j@8uD$QGW6Op;2iC*{;sFUu?S*Deh{1B!
og%OE*V&ul+7CV>q>s&I@VWAEcN(3_|(xqyp2Zb^X?lBq<zU-DCy8r+H

delta 34
mcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MVuOwAWf=joy$G8C


From 1270d80765fc71d4fde0b0e3566d0ee1e2f11e60 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 8 Apr 2023 13:14:10 +0300
Subject: [PATCH 34/60] wrap & log exceptions; log progress to file

---
 experiments/fedot_warm_start/run.py | 193 ++++++++++++++++------------
 1 file changed, 111 insertions(+), 82 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 6e043d55..3e5a2a28 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,5 +1,6 @@
 import functools
 import json
+import logging
 import timeit
 from datetime import datetime
 from itertools import chain
@@ -14,6 +15,7 @@
 from fedot.core.pipelines.adapters import PipelineAdapter
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
 from fedot.core.validation.split import tabular_cv_generator
+from golem.core.log import Log
 from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
@@ -27,11 +29,11 @@
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
-N_DATASETS = None
-TEST_SIZE = 0.2
+N_DATASETS = 3
+TEST_SIZE = 0.33
 # Evaluation timeouts
-TRAIN_TIMEOUT = 5
-TEST_TIMEOUT = 5
+TRAIN_TIMEOUT = 0.5
+TEST_TIMEOUT = 0.5
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
@@ -42,12 +44,25 @@
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',
-    logging_level=50,
     n_jobs=-1,
     seed=SEED,
     show_progress=False,
 )
 
+# Setup logging
+time_now = datetime.now().isoformat(timespec="minutes")
+time_now_for_path = time_now.replace(":", ".")
+save_dir = Path(f'run_{time_now_for_path}')
+save_dir.mkdir()
+log_file = save_dir.joinpath('log.txt')
+Log(log_file=log_file)
+logging.basicConfig(filename=log_file,
+                    filemode='a',
+                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                    datefmt='%H:%M:%S',
+                    force=True,
+                    )
+
 
 def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
@@ -132,6 +147,19 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut
     return run_results
 
 
+def extract_best_history_models(dataset_cache, history):
+    best_individuals = sorted(chain(*history.individuals),
+                              key=lambda ind: ind.fitness,
+                              reverse=True)
+    best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
+    best_models = []
+    for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
+        pipeline = PipelineAdapter().restore(individual.graph)
+        model = Model(pipeline, individual.fitness, dataset_cache)
+        best_models.append(model)
+    return best_models
+
+
 def main():
     baseline_pipeline = PipelineBuilder().add_node('rf').build()
 
@@ -144,93 +172,91 @@ def main():
 
     results = []
     best_models_per_dataset = {}
-    for name in tqdm(datasets_train, 'Train datasets'):
-        cache = datasets_cache[name]
-        data = cache.from_cache()
-
-        fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
-        results.append(run_results)
-        # TODO:
-        #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
-        #   x Evaluate historical pipelines on the data instead of using fitness
-        #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
-
-        # Filter out unique individuals with the best fitness
-        history = fedot.history
-        best_individuals = sorted(chain(*fedot.history.individuals),
-                                  key=lambda ind: ind.fitness,
-                                  reverse=True)
-        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        best_models = []
-        for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
-            pipeline = PipelineAdapter().restore(individual.graph)
-            model = Model(pipeline, individual.fitness, cache)
-            best_models.append(model)
-        best_models_per_dataset[name] = best_models
+    progress_file = open(save_dir.joinpath('progress.txt'), 'a')
+    for name in tqdm(datasets_train, 'Train datasets', file=progress_file):
+        try:
+            cache = datasets_cache[name]
+            data = cache.from_cache()
+
+            fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+            results.append(run_results)
+            # TODO:
+            #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
+            #   x Evaluate historical pipelines on the data instead of using fitness
+            #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
+
+            # Filter out unique individuals with the best fitness
+            history = fedot.history
+            best_models = extract_best_history_models(cache, history)
+            best_models_per_dataset[name] = best_models
+        except Exception:
+            logging.exception(f'Train dataset "{name}"')
 
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'Test datasets'):
-        cache = datasets_cache[name]
-        data = cache.from_cache()
-
-        # Run pure AutoML
-        fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
-        results.append(fedot_naive_results)
-
-        # Run meta AutoML
-        # 1
-        time_start = timeit.default_timer()
-        meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
-        meta_features = meta_features.fillna(0)
-        meta_learning_time = timeit.default_timer() - time_start
-        initial_assumptions = model_advisor.predict(meta_features)[0]
-        assumption_pipelines = [model.predictor for model in initial_assumptions]
-        # 2
-        fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
-                                                   initial_assumption=assumption_pipelines)
-        fedot_meta_results['meta_learning_time'] = meta_learning_time
-        results.append(fedot_meta_results)
-
-        # Fit & evaluate simple baseline
-        baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
-        baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
-                                           **baseline_metrics)
-        results.append(baseline_res)
-
-        # Fit & evaluate initial assumptions
-        for i, assumption in enumerate(initial_assumptions):
-            pipeline = assumption.predictor
-            assumption_metrics = get_pipeline_metrics(assumption.predictor, fedot_meta.train_data, fedot_meta.metrics)
-            assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
-                                                 pipeline=assumption.predictor, **assumption_metrics)
-            results.append(assumption_res)
+    for name in tqdm(datasets_test, 'Test datasets', file=progress_file):
+        try:
+            cache = datasets_cache[name]
+            data = cache.from_cache()
+
+            # Run pure AutoML
+            fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
+            results.append(fedot_naive_results)
+
+            # Run meta AutoML
+            # 1
+            time_start = timeit.default_timer()
+            meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
+            meta_features = meta_features.fillna(0)
+            meta_learning_time = timeit.default_timer() - time_start
+            initial_assumptions = model_advisor.predict(meta_features)[0]
+            assumption_pipelines = [model.predictor for model in initial_assumptions]
+            # 2
+            fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+                                                       initial_assumption=assumption_pipelines)
+            fedot_meta_results['meta_learning_time'] = meta_learning_time
+            results.append(fedot_meta_results)
+
+            # Fit & evaluate simple baseline
+            baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
+            baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
+                                               **baseline_metrics)
+            results.append(baseline_res)
+
+            # Fit & evaluate initial assumptions
+            for i, assumption in enumerate(initial_assumptions):
+                pipeline = assumption.predictor
+                assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, fedot_meta.metrics)
+                assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
+                                                     pipeline=pipeline, **assumption_metrics)
+                results.append(assumption_res)
+        except Exception:
+            logging.exception(f'Test dataset "{name}"')
 
     # Save the accumulated results
-    time_now = datetime.now().isoformat(timespec="minutes")
-    time_now_for_path = time_now.replace(":", ".")
-    save_dir = Path(f'run_{time_now_for_path}')
-    save_dir.mkdir()
     history_dir = save_dir.joinpath('histories')
     history_dir.mkdir()
     models_dir = save_dir.joinpath('models')
     for res in results:
-        res['run_date'] = time_now
-        dataset_name = res['dataset_name']
-        run_label = res['run_label']
-        # define saving paths
-        model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
-        history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
-        # replace objects with export paths for csv
-        res['model_path'] = str(model_path)
-        res.pop('model_obj').save(res['model_path'])
-        res['history_path'] = str(history_path)
-        history_obj = res.pop('history_obj')
-        if history_obj is not None:
-            history_obj.save(res['history_path'])
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_pre_{time_now_for_path}.csv'))
+        try:
+            res['run_date'] = time_now
+            dataset_name = res['dataset_name']
+            run_label = res['run_label']
+            # define saving paths
+            model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
+            history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
+            # replace objects with export paths for csv
+            res['model_path'] = str(model_path)
+            res.pop('model_obj').save(res['model_path'])
+            res['history_path'] = str(history_path)
+            history_obj = res.pop('history_obj')
+            if history_obj is not None:
+                history_obj.save(res['history_path'])
+        except Exception:
+            logging.exception(f'Saving results "{res}"')
+
     pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
 
     # save experiment hyperparameters
@@ -257,4 +283,7 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except Exception:
+        logging.exception(f'Main level cached the error')

From a796ea73caccb9eead7cfe659f387df3db41d449 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sat, 8 Apr 2023 13:16:28 +0300
Subject: [PATCH 35/60] update timeouts

---
 experiments/fedot_warm_start/run.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 3e5a2a28..3113ad97 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -29,11 +29,11 @@
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
-N_DATASETS = 3
-TEST_SIZE = 0.33
+N_DATASETS = None
+TEST_SIZE = 0.2
 # Evaluation timeouts
-TRAIN_TIMEOUT = 0.5
-TEST_TIMEOUT = 0.5
+TRAIN_TIMEOUT = 15
+TEST_TIMEOUT = 15
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5

From 86652043368290d95039b3ae99ccdff1744cbe18 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:02:21 +0300
Subject: [PATCH 36/60] remove GOLEM from requirements.txt to inherit version
 required by FEDOT

---
 requirements.txt | Bin 460 -> 430 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ad0a22332f176f2c866188116575624428ac1536..eca13d853ca1f8e55c583bd3790a78a679ffee4d 100644
GIT binary patch
delta 7
OcmX@ZypDOpIz|8ti~`^Q

delta 38
pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006<B2fF|O


From 0f5ac5320960a0f135fae16efc7e6e51689251d5 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:04:02 +0300
Subject: [PATCH 37/60] clean openml cache

---
 .../datasets_loaders/openml_datasets_loader.py      | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
index f23510d7..7959ca61 100644
--- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import shutil
+from pathlib import Path
 from typing import List, Union
 
 import openml
@@ -10,6 +12,12 @@
 OpenMLDatasetID = Union[str, int]
 
 
+def _clear_openml_cache():
+    cache_dir = openml.config.get_cache_directory()
+    cache_dir = Path(cache_dir)
+    shutil.rmtree(cache_dir)
+
+
 class OpenMLDatasetsLoader(DatasetsLoader):
 
     def __init__(self):
@@ -27,7 +35,10 @@ def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]:
         return datasets
 
     def load_single(self, source: OpenMLDatasetID):
-        return self.get_openml_dataset(source)
+        try:
+            return self.get_openml_dataset(source)
+        finally:
+            _clear_openml_cache()
 
     def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache:
         openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=False)

From 6eddbb19dea0f3b8d16aad3a6b6af451277c9f4e Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Tue, 18 Apr 2023 18:04:35 +0300
Subject: [PATCH 38/60] update Dockerfile

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index e17e17cd..7958082a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ WORKDIR $WORKDIR
 COPY . $WORKDIR
 
 RUN pip3 install pip && \
+    pip install wheel && \
     pip install --trusted-host pypi.python.org -r ${WORKDIR}/requirements.txt
 
 ENV PYTHONPATH $WORKDIR

From d8bd536935ad02c98d21a39b543c0027ad60be24 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 15:42:57 +0300
Subject: [PATCH 39/60] make experiment safer

---
 experiments/fedot_warm_start/run.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 3113ad97..66c80192 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -19,6 +19,7 @@
 from sklearn.model_selection import train_test_split, StratifiedKFold
 from tqdm import tqdm
 
+from meta_automl.data_preparation.data_manager import DataManager
 from meta_automl.data_preparation.dataset import DatasetCache, Dataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
@@ -52,7 +53,8 @@
 # Setup logging
 time_now = datetime.now().isoformat(timespec="minutes")
 time_now_for_path = time_now.replace(":", ".")
-save_dir = Path(f'run_{time_now_for_path}')
+save_dir = DataManager.get_data_dir()\
+    .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start')
 save_dir.mkdir()
 log_file = save_dir.joinpath('log.txt')
 Log(log_file=log_file)
@@ -168,17 +170,16 @@ def main():
     datasets_train, datasets_test = \
         train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
-
     results = []
     best_models_per_dataset = {}
     progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for name in tqdm(datasets_train, 'Train datasets', file=progress_file):
+    for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
         try:
             cache = datasets_cache[name]
             data = cache.from_cache()
 
-            fedot, run_results = fit_fedot(data=data, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
+            timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
+            fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
             results.append(run_results)
             # TODO:
             #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
@@ -192,19 +193,16 @@ def main():
         except Exception:
             logging.exception(f'Train dataset "{name}"')
 
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'Test datasets', file=progress_file):
+    for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file):
         try:
             cache = datasets_cache[name]
             data = cache.from_cache()
 
-            # Run pure AutoML
-            fedot_naive, fedot_naive_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='FEDOT')
-            results.append(fedot_naive_results)
-
             # Run meta AutoML
             # 1
             time_start = timeit.default_timer()
@@ -234,6 +232,7 @@ def main():
                 results.append(assumption_res)
         except Exception:
             logging.exception(f'Test dataset "{name}"')
+    progress_file.close()
 
     # Save the accumulated results
     history_dir = save_dir.joinpath('histories')

From 36c1d0155440db71c2dcd6bd74d96aad1c87ff7e Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 16:38:47 +0300
Subject: [PATCH 40/60] add .dockerignore

---
 .dockerignore | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..2bfa6863
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,13 @@
+# Config & info files
+.pep8speaks.yml
+Dockerfile
+LICENSE
+README.md
+
+# Unnecessary files
+examples
+notebooks
+test
+
+# User data
+data

From 29b8cb9d3c50aea8e911d072f4071301a7c8d201 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 20 Apr 2023 17:39:14 +0300
Subject: [PATCH 41/60] fix save path

---
 experiments/fedot_warm_start/run.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 66c80192..9bf33ccb 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -4,7 +4,6 @@
 import timeit
 from datetime import datetime
 from itertools import chain
-from pathlib import Path
 from typing import Dict, List, Tuple
 
 import numpy as np
@@ -53,9 +52,9 @@
 # Setup logging
 time_now = datetime.now().isoformat(timespec="minutes")
 time_now_for_path = time_now.replace(":", ".")
-save_dir = DataManager.get_data_dir()\
-    .joinpath(f'run_{time_now_for_path}').joinpath('experiments').joinpath('fedot_warm_start')
-save_dir.mkdir()
+save_dir = DataManager.get_data_dir().\
+    joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
+save_dir.mkdir(parents=True)
 log_file = save_dir.joinpath('log.txt')
 Log(log_file=log_file)
 logging.basicConfig(filename=log_file,

From fbe04eac5c9fcfa330039340acb581fb5d767a0f Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Tue, 16 May 2023 01:52:03 +0300
Subject: [PATCH 42/60] Resolving conflict

---
 experiments/auto-sklearn_run/results.json | 45 -----------------------
 experiments/fedot_warm_start/run.py       | 41 +++------------------
 2 files changed, 6 insertions(+), 80 deletions(-)
 delete mode 100644 experiments/auto-sklearn_run/results.json

diff --git a/experiments/auto-sklearn_run/results.json b/experiments/auto-sklearn_run/results.json
deleted file mode 100644
index b4ce4cbf..00000000
--- a/experiments/auto-sklearn_run/results.json
+++ /dev/null
@@ -1,45 +0,0 @@
-{
-  "ensemble": {
-    "2": {
-      "model_id": 2,
-      "rank": 1,
-      "cost": 0.02008032128514059,
-      "ensemble_weight": 0.1,
-      "balancing": "Balancing(random_state=1)",
-      "sklearn_classifier": "RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
-    },
-    "6": {
-      "model_id": 6,
-      "rank": 2,
-      "cost": 0.04216867469879515,
-      "ensemble_weight": 0.02,
-      "balancing": "Balancing(random_state=1)",
-      "sklearn_classifier": "RandomForestClassifier(bootstrap=False, max_features=4, min_samples_leaf=4, min_samples_split=20, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
-    },
-    "7": {
-      "model_id": 7,
-      "rank": 3,
-      "cost": 0.025100401606425682,
-      "ensemble_weight": 0.08,
-      "balancing": "Balancing(random_state=1)",
-      "sklearn_classifier": "HistGradientBoostingClassifier(early_stopping=True, l2_regularization=5.759216242427118e-07, learning_rate=0.14515873247977112, loss='auto', max_iter=64, max_leaf_nodes=11, min_samples_leaf=1, n_iter_no_change=18, random_state=1, validation_fraction=0.06967552984405034, warm_start=True)"
-    },
-    "8": {
-      "model_id": 8,
-      "rank": 4,
-      "cost": 0.02208835341365467,
-      "ensemble_weight": 0.54,
-      "balancing": "Balancing(random_state=1, strategy='weighting')",
-      "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=2), learning_rate=1.7653851967971248, n_estimators=290, random_state=1)"
-    },
-    "11": {
-      "model_id": 11,
-      "rank": 5,
-      "cost": 0.017068273092369468,
-      "ensemble_weight": 0.26,
-      "balancing": "Balancing(random_state=1)",
-      "sklearn_classifier": "AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=5), learning_rate=0.9772078202526538, n_estimators=418, random_state=1)"
-    }
-  },
-  "score": 0.9182632313000073
-}
\ No newline at end of file
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 26741afd..1df6a0b6 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -4,6 +4,8 @@
 import timeit
 from datetime import datetime
 from itertools import chain
+from pathlib import Path
+
 from typing import Dict, List, Tuple
 
 import numpy as np
@@ -50,7 +52,7 @@
     show_progress=False,
 )
 
-<<<<<<< HEAD
+
 SAVE_DIR = None
 TIME_NOW = None
 TIME_NOW_FOR_PATH = None
@@ -75,26 +77,6 @@ def setup_logging():
                         force=True,
                         )
 
-=======
-# Setup logging
-time_now = datetime.now().isoformat(timespec="minutes")
-time_now_for_path = time_now.replace(":", ".")
-save_dir = DataManager.get_data_dir().\
-    joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
-save_dir.mkdir(parents=True)
-log_file = save_dir.joinpath('log.txt')
-Log(log_file=log_file)
-logging.basicConfig(filename=log_file,
-                    filemode='a',
-                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
-                    datefmt='%H:%M:%S',
-                    force=True,
-                    )
-
-
-def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
-    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
->>>>>>> origin/docker_and_experiments
 
 def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
@@ -214,24 +196,17 @@ def main():
 
     ds_ids, datasets = ds_with_ids
 
-<<<<<<< HEAD
     data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names)
 
     results = []
     best_models_per_dataset = {}
     progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a')
-    for name in tqdm(train_ds_names, 'Train datasets', file=progress_file):
-=======
-    results = []
-    best_models_per_dataset = {}
-    progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
->>>>>>> origin/docker_and_experiments
+    for name in tqdm(train_ds_names, 'FEDOT, all datasets', file=progress_file):
         try:
             cache = datasets[name]
             data = cache.from_cache()
 
-            timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
+            timeout = TRAIN_TIMEOUT if name in train_ds_names else TEST_TIMEOUT
             fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
             results.append(run_results)
             # TODO:
@@ -251,11 +226,7 @@ def main():
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-<<<<<<< HEAD
-    for name in tqdm(test_ds_names, 'Test datasets', file=progress_file):
-=======
-    for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file):
->>>>>>> origin/docker_and_experiments
+    for name in tqdm(test_ds_names, 'MetaFEDOT, Test datasets', file=progress_file):
         try:
             cache = datasets[name]
             data = cache.from_cache()

From ac060eeb0185f3713400fd7f39e9c75a7cb38c95 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Wed, 17 May 2023 10:41:07 +0300
Subject: [PATCH 43/60] add logging in PymfeExtractor

---
 .../meta_features_extractors/pymfe_extractor.py                | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 36cb9d45..8dbc728f 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -3,6 +3,7 @@
 from typing import List, Union, Dict, Any
 
 import pandas as pd
+from golem.core.log import default_log
 from pymfe.mfe import MFE
 
 from meta_automl.data_preparation.dataset import DatasetCache
@@ -18,6 +19,7 @@ def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: Dat
         self.extractor_params = extractor_params if extractor_params is not None else self.DEFAULT_PARAMS
         self._datasets_loader = datasets_loader or OpenMLDatasetsLoader()
         self._extractor = MFE(**self.extractor_params)
+        self._logger = default_log(self)
 
     @property
     def datasets_loader(self) -> DatasetsLoader:
@@ -34,6 +36,7 @@ def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: boo
             if isinstance(dataset, str):
                 dataset = DatasetCache(dataset)
 
+            self._logger.info(f'Extracting meta features of the dataset {dataset.name}...')
             if (use_cached and
                     (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))):
                 meta_features[dataset.name] = mfs

From 7c42e79032924c9d4cb17b8dafb3c30057caf948 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Wed, 31 May 2023 17:43:09 +0300
Subject: [PATCH 44/60] add intelligent datasets train/test split

---
 experiments/fedot_warm_start/run.py           | 95 +++++++++++--------
 .../datasets_train_test_split.py              | 64 +++++++++++++
 2 files changed, 117 insertions(+), 42 deletions(-)
 create mode 100644 meta_automl/data_preparation/datasets_train_test_split.py

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 9bf33ccb..26382ebb 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -4,23 +4,28 @@
 import timeit
 from datetime import datetime
 from itertools import chain
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Sequence
 
 import numpy as np
 import openml
 import pandas as pd
+
 from fedot.api.main import Fedot
+from fedot.core.data.data import InputData
 from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate
 from fedot.core.pipelines.adapters import PipelineAdapter
+from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
+from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository
 from fedot.core.validation.split import tabular_cv_generator
 from golem.core.log import Log
-from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
 
 from meta_automl.data_preparation.data_manager import DataManager
 from meta_automl.data_preparation.dataset import DatasetCache, Dataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
 from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
@@ -29,11 +34,11 @@
 # Meta-alg hyperparameters
 SEED = 42
 # Datasets sampling
-N_DATASETS = None
-TEST_SIZE = 0.2
+N_DATASETS = 3
+TEST_SIZE = 0.33
 # Evaluation timeouts
-TRAIN_TIMEOUT = 15
-TEST_TIMEOUT = 15
+TRAIN_TIMEOUT = 1
+TEST_TIMEOUT = 1
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
@@ -41,6 +46,9 @@
 N_BEST_MODELS_TO_ADVISE = 5
 # Meta-features
 MF_EXTRACTOR_PARAMS = {'groups': 'general'}
+COLLECT_METRICS = ['f1', 'roc_auc', 'accuracy', 'neg_log_loss', 'precision']
+COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS))
+COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss'
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',
@@ -50,19 +58,21 @@
 )
 
 # Setup logging
-time_now = datetime.now().isoformat(timespec="minutes")
-time_now_for_path = time_now.replace(":", ".")
-save_dir = DataManager.get_data_dir().\
+time_now = datetime.now()
+time_now_iso = time_now.isoformat(timespec="minutes")
+time_now_for_path = time_now_iso.replace(":", ".")
+save_dir = DataManager.get_data_dir(). \
     joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
 save_dir.mkdir(parents=True)
 log_file = save_dir.joinpath('log.txt')
 Log(log_file=log_file)
-logging.basicConfig(filename=log_file,
-                    filemode='a',
-                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
-                    datefmt='%H:%M:%S',
-                    force=True,
-                    )
+logging.basicConfig(
+    filename=log_file,
+    filemode='a',
+    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+    datefmt='%H:%M:%S',
+    force=True,
+)
 
 
 def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
@@ -84,18 +94,16 @@ def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
     return x, y
 
 
-def get_pipeline_metrics(pipeline,
-                         input_data,
-                         metrics_obj) -> dict:
+def get_pipeline_metrics(pipeline: Pipeline,
+                         input_data: InputData,
+                         metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM,
+                         metric_names: Sequence[str] = COLLECT_METRICS) -> dict:
     """Gets quality metrics for the fitted pipeline.
     The function is based on `Fedot.get_metrics()`
 
     Returns:
         the values of quality metrics
     """
-    metrics = metrics_obj.metric_functions
-    metric_names = metrics_obj.get_metric_names(metrics)
-
     data_producer = functools.partial(tabular_cv_generator, input_data, 10, StratifiedKFold)
 
     objective = MetricsObjective(metrics)
@@ -103,10 +111,10 @@ def get_pipeline_metrics(pipeline,
                                          data_producer=data_producer,
                                          eval_n_jobs=-1)
 
-    metrics = obj_eval.evaluate(pipeline).values
-    metrics = {metric_name: round(metric, 3) for (metric_name, metric) in zip(metric_names, metrics)}
+    metric_values = obj_eval.evaluate(pipeline).values
+    metric_values = {metric_name: round(value, 3) for (metric_name, value) in zip(metric_names, metric_values)}
 
-    return metrics
+    return metric_values
 
 
 def prepare_extractor_and_assessor(datasets_train: List[str]):
@@ -127,7 +135,7 @@ def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=
     fedot.fit(x, y)
     automl_time = timeit.default_timer() - time_start
 
-    metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data, fedot.metrics)
+    metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data)
     pipeline = fedot.current_pipeline
     run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
                                       automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
@@ -144,6 +152,7 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut
                        history_obj=history_obj,
                        automl_time_sec=automl_time_sec,
                        automl_timeout_min=automl_timeout_min,
+                       task_type='classification',
                        **metrics)
     return run_results
 
@@ -156,7 +165,7 @@ def extract_best_history_models(dataset_cache, history):
     best_models = []
     for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
         pipeline = PipelineAdapter().restore(individual.graph)
-        model = Model(pipeline, individual.fitness, dataset_cache)
+        model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset_cache)
         best_models.append(model)
     return best_models
 
@@ -166,10 +175,11 @@ def main():
 
     dataset_ids, datasets_cache = prepare_data()
 
-    datasets_train, datasets_test = \
-        train_test_split(list(datasets_cache.keys()), test_size=TEST_SIZE, random_state=SEED)
+    split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
+    datasets_train = split_datasets[split_datasets['is_train'] == 1]['dataset_name'].to_list()
+    datasets_test = split_datasets[~split_datasets['is_train'] == 0]['dataset_name'].to_list()
 
-    results = []
+    evaluation_results = []
     best_models_per_dataset = {}
     progress_file = open(save_dir.joinpath('progress.txt'), 'a')
     for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
@@ -179,7 +189,7 @@ def main():
 
             timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
             fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
-            results.append(run_results)
+            evaluation_results.append(run_results)
             # TODO:
             #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
             #   x Evaluate historical pipelines on the data instead of using fitness
@@ -207,28 +217,28 @@ def main():
             time_start = timeit.default_timer()
             meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
             meta_features = meta_features.fillna(0)
-            meta_learning_time = timeit.default_timer() - time_start
+            meta_learning_time_sec = timeit.default_timer() - time_start
             initial_assumptions = model_advisor.predict(meta_features)[0]
             assumption_pipelines = [model.predictor for model in initial_assumptions]
             # 2
             fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
                                                        initial_assumption=assumption_pipelines)
-            fedot_meta_results['meta_learning_time'] = meta_learning_time
-            results.append(fedot_meta_results)
+            fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec
+            evaluation_results.append(fedot_meta_results)
 
             # Fit & evaluate simple baseline
-            baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data, fedot_meta.metrics)
+            baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data)
             baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
                                                **baseline_metrics)
-            results.append(baseline_res)
+            evaluation_results.append(baseline_res)
 
             # Fit & evaluate initial assumptions
             for i, assumption in enumerate(initial_assumptions):
                 pipeline = assumption.predictor
-                assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data, fedot_meta.metrics)
+                assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data)
                 assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
                                                      pipeline=pipeline, **assumption_metrics)
-                results.append(assumption_res)
+                evaluation_results.append(assumption_res)
         except Exception:
             logging.exception(f'Test dataset "{name}"')
     progress_file.close()
@@ -237,7 +247,7 @@ def main():
     history_dir = save_dir.joinpath('histories')
     history_dir.mkdir()
     models_dir = save_dir.joinpath('models')
-    for res in results:
+    for res in evaluation_results:
         try:
             res['run_date'] = time_now
             dataset_name = res['dataset_name']
@@ -255,11 +265,11 @@ def main():
         except Exception:
             logging.exception(f'Saving results "{res}"')
 
-    pd.DataFrame(results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
+    pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
 
     # save experiment hyperparameters
     params = {
-        'run_date': time_now,
+        'run_date': time_now_iso,
         'seed': SEED,
         'n_datasets': N_DATASETS or len(dataset_ids),
         'test_size': TEST_SIZE,
@@ -283,5 +293,6 @@ def main():
 if __name__ == "__main__":
     try:
         main()
-    except Exception:
-        logging.exception(f'Main level cached the error')
+    except Exception as e:
+        logging.exception('Main level caught an error.')
+        raise
diff --git a/meta_automl/data_preparation/datasets_train_test_split.py b/meta_automl/data_preparation/datasets_train_test_split.py
new file mode 100644
index 00000000..101b7ce8
--- /dev/null
+++ b/meta_automl/data_preparation/datasets_train_test_split.py
@@ -0,0 +1,64 @@
+import openml
+import pandas as pd
+
+from sklearn.model_selection import train_test_split
+
+
+def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: int = 42):
+    df_openml_datasets = openml.datasets.list_datasets(dataset_ids, output_format='dataframe')
+    df_openml_datasets_split_features = df_openml_datasets[
+        ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']]
+    for column in df_openml_datasets_split_features.columns[1:]:
+        if column != 'NumberOfClasses':
+            median = df_openml_datasets_split_features[column].median()
+            df_openml_datasets_split_features[column] = \
+                (df_openml_datasets_split_features[column] > median).map({False: 'small', True: 'big'})
+        else:
+            median = df_openml_datasets_split_features[column][df_openml_datasets_split_features[column] != 2].median()
+            df_openml_datasets_split_features[column] = df_openml_datasets_split_features[column].apply(
+                lambda n: 'binary' if n == 2 else {False: 'small', True: 'big'}[n > median])
+    df_split_categories = df_openml_datasets_split_features.copy()
+    df_split_categories['category'] = df_openml_datasets_split_features.apply(lambda row: '_'.join(
+        row[1:]), axis=1)
+    df_split_categories.drop(columns=['NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses'], inplace=True)
+    # Group single-value categories into a separate category
+    cat_counts = df_split_categories['category'].value_counts()
+    single_value_categories = cat_counts[cat_counts == 1].index
+    idx = df_split_categories[df_split_categories['category'].isin(single_value_categories)].index
+    df_split_categories.loc[idx, 'category'] = 'single_value'
+    df_datasets_to_split = df_split_categories[df_split_categories['category'] != 'single_value']
+    df_test_only_datasets = df_split_categories[df_split_categories['category'] == 'single_value']
+    if not df_datasets_to_split.empty:
+        df_train_datasets, df_test_datasets = train_test_split(
+            df_datasets_to_split,
+            train_size=train_size,
+            shuffle=True,
+            stratify=df_datasets_to_split['category'],
+            random_state=seed
+        )
+        df_test_datasets = pd.concat([df_test_datasets, df_test_only_datasets])
+    else:
+        df_train_datasets, df_test_datasets = train_test_split(
+            df_split_categories,
+            train_size=train_size,
+            shuffle=True,
+            random_state=seed
+        )
+    df_train_datasets['is_train'] = 1
+    df_test_datasets['is_train'] = 0
+    df_split_datasets = pd.concat([df_train_datasets, df_test_datasets]).join(
+        df_openml_datasets_split_features.drop(columns='name'))
+    df_split_datasets = df_split_datasets.rename(columns={'name': 'dataset_name'})
+    df_split_datasets.index.rename('dataset_id', inplace=True)
+
+    return df_split_datasets
+
+
+def main():
+    dataset_ids = openml.study.get_suite(99).data
+    df_split_datasets = openml_datasets_train_test_split(dataset_ids)
+    df_split_datasets.to_csv('train_test_datasets_opencc18.csv')
+
+
+if __name__ == '__main__':
+    main()

From cb11a3ccbbb61074f8bc046c7a3d50ab62106792 Mon Sep 17 00:00:00 2001
From: Peter Shevcnenko <57573631+MorrisNein@users.noreply.github.com>
Date: Fri, 30 Jun 2023 18:35:35 +0300
Subject: [PATCH 45/60] Refactor data storage (#15)

* refactor dataset classes, use openml cache

* fix example select_similar_datasets_by_knn.py

* create DatasetIDType

* create PredictorType

* remove DataManager, refactor cache

* update tests & test data

* allow explicit OpenMLDataset creation from name/search

* adapt examples to the last changes
---
 .gitignore                                    |   2 +-
 .../0_loading_data/load_list_of_datasests.py  |   5 +-
 .../extract_with_load_on_demand.py            |   5 +-
 .../load_and_extract_features_sequentially.py |   4 +-
 .../select_similar_datasets_by_knn.py         |   5 +-
 .../advise_models_from_similar_datasets.py    |  11 +-
 examples/knowledge_base_loading.py            |   6 +-
 experiments/fedot_warm_start/run.py           |  96 +--
 meta_automl/data_preparation/data_manager.py  |  59 --
 meta_automl/data_preparation/dataset.py       |  64 --
 .../data_preparation/dataset/__init__.py      |   3 +
 .../dataset/custom_dataset.py                 |  30 +
 .../data_preparation/dataset/dataset_base.py  |  40 +
 .../dataset/openml_dataset.py                 |  39 +
 .../datasets_loaders/__init__.py              |   2 +-
 .../datasets_loaders/datasets_loader.py       |  16 +-
 .../openml_datasets_loader.py                 |  66 +-
 .../data_preparation/file_system/__init__.py  |   5 +
 .../data_preparation/file_system/cache.py     |  95 +++
 .../file_system/cache_properties.py           |  21 +
 .../file_system/file_system.py                |  27 +
 .../meta_features_extractor.py                |  22 +-
 .../pymfe_extractor.py                        |  31 +-
 meta_automl/data_preparation/model.py         |   9 +-
 .../models_loaders/fedot_pipelines_loader.py  |  48 +-
 .../knowledge_base_models_loader.py           |  21 +-
 .../model_based_similarity_assessors.py       |   9 +-
 .../model_advisors/model_advisor.py           |   7 +-
 requirements.txt                              | Bin 430 -> 460 bytes
 test/conftest.py                              |  40 +
 test/constants.py                             |   7 +-
 test/data/datasets/australian.pkl             | Bin 41870 -> 0 bytes
 test/data/datasets/monks-problems-1.pkl       | Bin 16009 -> 0 bytes
 .../pymfe/334.pkl}                            | Bin
 .../pymfe/40981.pkl}                          | Bin
 .../org/openml/www/datasets/333/dataset.arff  | 651 ++++++++++++++++
 .../www/datasets/333/dataset_333.pkl.py3      | Bin 0 -> 5724 bytes
 .../openml/www/datasets/333/dataset_333.pq    | Bin 0 -> 6016 bytes
 .../openml/www/datasets/333/description.xml   |  33 +
 .../org/openml/www/datasets/333/features.xml  |  84 +++
 .../openml/www/datasets/333/features.xml.pkl  | Bin 0 -> 509 bytes
 .../openml/www/datasets/40981/dataset.arff    | 707 ++++++++++++++++++
 .../www/datasets/40981/dataset_40981.pkl.py3  | Bin 0 -> 17678 bytes
 .../www/datasets/40981/dataset_40981.pq       | Bin 0 -> 20170 bytes
 .../openml/www/datasets/40981/description.xml |  49 ++
 .../openml/www/datasets/40981/features.xml    | 175 +++++
 .../www/datasets/40981/features.xml.pkl       | Bin 0 -> 899 bytes
 test/data_manager.py                          |   9 -
 test/general_checks.py                        |  25 -
 test/unit/datasets/__init__.py                |   0
 test/unit/datasets/conftest.py                |  18 +
 test/unit/datasets/general_checks.py          |  24 +
 test/unit/datasets/test_custom_dataset.py     |  48 ++
 test/unit/datasets/test_datasets_loaders.py   |  24 +
 test/unit/datasets/test_file_dataset.py       |  48 ++
 test/unit/datasets/test_openml_dataset.py     |  27 +
 test/unit/test_dataset.py                     |  40 -
 test/unit/test_datasets_loaders.py            |  50 --
 test/unit/test_file_system.py                 |   7 +
 test/unit/test_meta_features_extractors.py    |  47 +-
 60 files changed, 2399 insertions(+), 462 deletions(-)
 delete mode 100644 meta_automl/data_preparation/data_manager.py
 delete mode 100644 meta_automl/data_preparation/dataset.py
 create mode 100644 meta_automl/data_preparation/dataset/__init__.py
 create mode 100644 meta_automl/data_preparation/dataset/custom_dataset.py
 create mode 100644 meta_automl/data_preparation/dataset/dataset_base.py
 create mode 100644 meta_automl/data_preparation/dataset/openml_dataset.py
 create mode 100644 meta_automl/data_preparation/file_system/__init__.py
 create mode 100644 meta_automl/data_preparation/file_system/cache.py
 create mode 100644 meta_automl/data_preparation/file_system/cache_properties.py
 create mode 100644 meta_automl/data_preparation/file_system/file_system.py
 create mode 100644 test/conftest.py
 delete mode 100644 test/data/datasets/australian.pkl
 delete mode 100644 test/data/datasets/monks-problems-1.pkl
 rename test/data/{pymfe/monks-problems-2.pkl => metafeatures/pymfe/334.pkl} (100%)
 rename test/data/{pymfe/australian.pkl => metafeatures/pymfe/40981.pkl} (100%)
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset.arff
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/description.xml
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/description.xml
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml
 create mode 100644 test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl
 delete mode 100644 test/data_manager.py
 delete mode 100644 test/general_checks.py
 create mode 100644 test/unit/datasets/__init__.py
 create mode 100644 test/unit/datasets/conftest.py
 create mode 100644 test/unit/datasets/general_checks.py
 create mode 100644 test/unit/datasets/test_custom_dataset.py
 create mode 100644 test/unit/datasets/test_datasets_loaders.py
 create mode 100644 test/unit/datasets/test_file_dataset.py
 create mode 100644 test/unit/datasets/test_openml_dataset.py
 delete mode 100644 test/unit/test_dataset.py
 delete mode 100644 test/unit/test_datasets_loaders.py
 create mode 100644 test/unit/test_file_system.py

diff --git a/.gitignore b/.gitignore
index 9e584fd4..a5f9134a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,4 @@ dmypy.json
 .pyre/
 
 # User data
-data/
+/data
diff --git a/examples/0_loading_data/load_list_of_datasests.py b/examples/0_loading_data/load_list_of_datasests.py
index c2ee1cbb..741438e1 100644
--- a/examples/0_loading_data/load_list_of_datasests.py
+++ b/examples/0_loading_data/load_list_of_datasests.py
@@ -6,9 +6,8 @@ def get_datasets():
         'nomao', 'sylvine', 'kc1', 'jungle_chess_2pcs_raw_endgame_complete', 'credit-g', 'delta_ailerons', 'pol'
     ]
     datasets_loader = OpenMLDatasetsLoader()
-    datasets = datasets_loader.load(dataset_names)
-    print(f'Datasets "{", ".join(dataset_names)}" are available at the paths:')
-    print('\n'.join(str(d) for d in datasets))
+    datasets = datasets_loader.load(dataset_names, allow_names=True)
+    print(f'Datasets "{", ".join(dataset_names)}" are downloaded.')
     return datasets
 
 
diff --git a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py
index 9519e6ca..ad2110a2 100644
--- a/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py
+++ b/examples/2_extracting_datasets_meta_features/extract_with_load_on_demand.py
@@ -1,3 +1,5 @@
+import openml
+
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 
@@ -6,8 +8,9 @@ def main():
     dataset_names = [
         'nomao', 'sylvine'
     ]
+    dataset_ids = [openml.datasets.get_dataset(name, download_data=False, download_qualities=False).dataset_id for name in dataset_names]
     extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    meta_features = extractor.extract(dataset_ids)
     return meta_features
 
 
diff --git a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py
index f1d21cf4..cda8b804 100644
--- a/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py
+++ b/examples/2_extracting_datasets_meta_features/load_and_extract_features_sequentially.py
@@ -9,8 +9,8 @@ def main():
     loader = OpenMLDatasetsLoader()
     extractor = PymfeExtractor(extractor_params={'groups': 'general'})
 
-    cached_datasets = loader.load(dataset_names)
-    meta_features = extractor.extract(cached_datasets)
+    datasets = loader.load(dataset_names, allow_names=True)
+    meta_features = extractor.extract(datasets)
     return meta_features
 
 
diff --git a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
index b6f2bb8c..5f13201e 100644
--- a/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
+++ b/examples/3_selecting_similar_datasets/select_similar_datasets_by_knn.py
@@ -8,9 +8,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py b/examples/4_advising_models/advise_models_from_similar_datasets.py
index 37c3b2db..e1dc16aa 100644
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py
+++ b/examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -2,7 +2,7 @@
 from golem.core.optimisers.fitness import SingleObjFitness
 from sklearn.model_selection import train_test_split
 
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.dataset import OpenMLDataset
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
@@ -13,9 +13,10 @@
 def main():
     # Define datasets.
     dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
+    datasets = OpenMLDatasetsLoader().load(dataset_names, allow_names=True)
     # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
+    extractor = PymfeExtractor(extractor_params={'groups': 'general'})
+    meta_features = extractor.extract(datasets)
     # Preprocess meta-features, as KNN does not support NaNs.
     meta_features = meta_features.dropna(axis=1, how='any')
     # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
@@ -29,8 +30,8 @@ def main():
         PipelineBuilder().add_node('normalization').add_node('logit').build(),
         PipelineBuilder().add_node('rf').add_node('logit').build()
     ]
-    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))]
-                   for dataset_name, pipeline in zip(y_train, best_pipelines)]
+    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
+                   for dataset_id, pipeline in zip(y_train, best_pipelines)]
 
     dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
     advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
diff --git a/examples/knowledge_base_loading.py b/examples/knowledge_base_loading.py
index 699a547f..310b4bdf 100644
--- a/examples/knowledge_base_loading.py
+++ b/examples/knowledge_base_loading.py
@@ -16,12 +16,12 @@
     # ===== Another way to get train models, but also group them by datasets:
     models_for_train = {}
 
-    for dataset_name in train_datasets['dataset_name']:
+    for dataset_id in train_datasets['dataset_id']:
         dataset_models = models_loader.load(
-            dataset_names=[dataset_name],   # load models just for this exact dataset.
+            dataset_ids=[dataset_id],   # load models just for this exact dataset.
             fitness_metric='logloss',       # must correspond to a metric name in a knowledge base.
         )
-        models_for_train[dataset_name] = dataset_models
+        models_for_train[dataset_id] = dataset_models
 
         # If you need to load data to the local storage
         # dataset = OpenMLDatasetsLoader().load_single(dataset_name)
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 26382ebb..c0461f30 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -22,8 +22,8 @@
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
 
-from meta_automl.data_preparation.data_manager import DataManager
-from meta_automl.data_preparation.dataset import DatasetCache, Dataset
+from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData
+from meta_automl.data_preparation.file_system import get_data_dir
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
@@ -37,8 +37,8 @@
 N_DATASETS = 3
 TEST_SIZE = 0.33
 # Evaluation timeouts
-TRAIN_TIMEOUT = 1
-TEST_TIMEOUT = 1
+TRAIN_TIMEOUT = 0.01
+TEST_TIMEOUT = 0.01
 # Models & datasets
 N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
 N_CLOSEST_DATASETS_TO_PROPOSE = 5
@@ -61,7 +61,7 @@
 time_now = datetime.now()
 time_now_iso = time_now.isoformat(timespec="minutes")
 time_now_for_path = time_now_iso.replace(":", ".")
-save_dir = DataManager.get_data_dir(). \
+save_dir = get_data_dir(). \
     joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
 save_dir.mkdir(parents=True)
 log_file = save_dir.joinpath('log.txt')
@@ -75,18 +75,23 @@
 )
 
 
-def prepare_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
+def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
 
     dataset_ids = openml.study.get_suite(99).data
     if N_DATASETS is not None:
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
-    dataset_ids = list(dataset_ids)
-    return dataset_ids, {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
 
+    df_split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
+    df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1]
+    df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0]
+
+    datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)}
+    return df_datasets_train, df_datasets_test, datasets
 
-def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
+
+def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
     x = data.x
     y = data.y
     if len(y.shape) == 1:
@@ -127,8 +132,8 @@ def prepare_extractor_and_assessor(datasets_train: List[str]):
     return data_similarity_assessor, extractor
 
 
-def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=None):
-    x, y = transform_data_for_fedot(data)
+def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None):
+    x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array'))
 
     time_start = timeit.default_timer()
     fedot = Fedot(timeout=timeout, initial_assumption=initial_assumption, **COMMON_FEDOT_PARAMS)
@@ -137,14 +142,14 @@ def fit_fedot(data: Dataset, timeout: float, run_label: str, initial_assumption=
 
     metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data)
     pipeline = fedot.current_pipeline
-    run_results = get_result_data_row(dataset=data, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
+    run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
                                       automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
     return fedot, run_results
 
 
-def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0., automl_timeout_min=0.,
-                        **metrics):
-    run_results = dict(dataset_id=dataset.id,
+def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0.,
+                        automl_timeout_min=0., **metrics):
+    run_results = dict(dataset_id=dataset.id_,
                        dataset_name=dataset.name,
                        run_label=run_label,
                        model_obj=pipeline,
@@ -157,7 +162,7 @@ def get_result_data_row(dataset, run_label: str, pipeline, history_obj=None, aut
     return run_results
 
 
-def extract_best_history_models(dataset_cache, history):
+def extract_best_history_models(dataset, history):
     best_individuals = sorted(chain(*history.individuals),
                               key=lambda ind: ind.fitness,
                               reverse=True)
@@ -165,7 +170,7 @@ def extract_best_history_models(dataset_cache, history):
     best_models = []
     for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
         pipeline = PipelineAdapter().restore(individual.graph)
-        model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset_cache)
+        model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset)
         best_models.append(model)
     return best_models
 
@@ -173,22 +178,19 @@ def extract_best_history_models(dataset_cache, history):
 def main():
     baseline_pipeline = PipelineBuilder().add_node('rf').build()
 
-    dataset_ids, datasets_cache = prepare_data()
+    df_datasets_train, df_datasets_test, datasets = prepare_data()
 
-    split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
-    datasets_train = split_datasets[split_datasets['is_train'] == 1]['dataset_name'].to_list()
-    datasets_test = split_datasets[~split_datasets['is_train'] == 0]['dataset_name'].to_list()
+    dataset_ids_train = df_datasets_train.index.to_list()
+    dataset_ids_test = df_datasets_test.index.to_list()
 
     evaluation_results = []
     best_models_per_dataset = {}
     progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for name in tqdm(datasets_cache.keys(), 'FEDOT, all datasets', file=progress_file):
+    for dataset_id in tqdm(datasets.keys(), 'FEDOT, all datasets', file=progress_file):
         try:
-            cache = datasets_cache[name]
-            data = cache.from_cache()
-
-            timeout = TRAIN_TIMEOUT if name in datasets_train else TEST_TIMEOUT
-            fedot, run_results = fit_fedot(data=data, timeout=timeout, run_label='FEDOT')
+            dataset = datasets[dataset_id]
+            timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT
+            fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
             evaluation_results.append(run_results)
             # TODO:
             #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
@@ -197,38 +199,37 @@ def main():
 
             # Filter out unique individuals with the best fitness
             history = fedot.history
-            best_models = extract_best_history_models(cache, history)
-            best_models_per_dataset[name] = best_models
+            best_models = extract_best_history_models(dataset, history)
+            best_models_per_dataset[dataset_id] = best_models
         except Exception:
-            logging.exception(f'Train dataset "{name}"')
+            logging.exception(f'Train dataset "{dataset_id}"')
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(dataset_ids_train)
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)
 
-    for name in tqdm(datasets_test, 'MetaFEDOT, Test datasets', file=progress_file):
+    for dataset_id in tqdm(dataset_ids_test, 'MetaFEDOT, Test datasets', file=progress_file):
         try:
-            cache = datasets_cache[name]
-            data = cache.from_cache()
+            dataset = datasets[dataset_id]
 
             # Run meta AutoML
             # 1
             time_start = timeit.default_timer()
-            meta_features = extractor.extract([cache], fill_input_nans=True, use_cached=False, update_cached=True)
+            meta_features = extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True)
             meta_features = meta_features.fillna(0)
             meta_learning_time_sec = timeit.default_timer() - time_start
             initial_assumptions = model_advisor.predict(meta_features)[0]
             assumption_pipelines = [model.predictor for model in initial_assumptions]
             # 2
-            fedot_meta, fedot_meta_results = fit_fedot(data=data, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+            fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
                                                        initial_assumption=assumption_pipelines)
             fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec
             evaluation_results.append(fedot_meta_results)
 
             # Fit & evaluate simple baseline
             baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data)
-            baseline_res = get_result_data_row(dataset=data, run_label='simple baseline', pipeline=baseline_pipeline,
+            baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline,
                                                **baseline_metrics)
             evaluation_results.append(baseline_res)
 
@@ -236,11 +237,11 @@ def main():
             for i, assumption in enumerate(initial_assumptions):
                 pipeline = assumption.predictor
                 assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data)
-                assumption_res = get_result_data_row(dataset=data, run_label=f'MetaFEDOT - initial assumption {i}',
+                assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}',
                                                      pipeline=pipeline, **assumption_metrics)
                 evaluation_results.append(assumption_res)
         except Exception:
-            logging.exception(f'Test dataset "{name}"')
+            logging.exception(f'Test dataset "{dataset_id}"')
     progress_file.close()
 
     # Save the accumulated results
@@ -250,11 +251,11 @@ def main():
     for res in evaluation_results:
         try:
             res['run_date'] = time_now
-            dataset_name = res['dataset_name']
+            dataset_id = res['dataset_id']
             run_label = res['run_label']
             # define saving paths
-            model_path = models_dir.joinpath(f'{dataset_name}_{run_label}')
-            history_path = history_dir.joinpath(f'{dataset_name}_{run_label}_history.json')
+            model_path = models_dir.joinpath(f'{dataset_id}_{run_label}')
+            history_path = history_dir.joinpath(f'{dataset_id}_{run_label}_history.json')
             # replace objects with export paths for csv
             res['model_path'] = str(model_path)
             res.pop('model_obj').save(res['model_path'])
@@ -271,12 +272,13 @@ def main():
     params = {
         'run_date': time_now_iso,
         'seed': SEED,
-        'n_datasets': N_DATASETS or len(dataset_ids),
+        'n_datasets': N_DATASETS or len(datasets),
         'test_size': TEST_SIZE,
-        'dataset_ids': dataset_ids,
-        'dataset_names': list(datasets_cache.keys()),
-        'dataset_names_train': datasets_train,
-        'dataset_names_test': datasets_test,
+        'dataset_ids': list(datasets.keys()),
+        'dataset_ids_train': dataset_ids_train,
+        'dataset_ids_test': dataset_ids_test,
+        'dataset_names_train': df_datasets_train['dataset_name'].to_list(),
+        'dataset_names_test': df_datasets_test['dataset_name'].to_list(),
         'train_timeout': TRAIN_TIMEOUT,
         'test_timeout': TEST_TIMEOUT,
         'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,
diff --git a/meta_automl/data_preparation/data_manager.py b/meta_automl/data_preparation/data_manager.py
deleted file mode 100644
index 0a743e28..00000000
--- a/meta_automl/data_preparation/data_manager.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import annotations
-
-import pickle
-from os import PathLike
-from pathlib import Path
-from typing import Dict, Any, Union
-
-PathType = Union[PathLike, str]
-DEFAULT_CACHE_EXTENSION = '.pkl'
-
-
-class DataManager:
-
-    @classmethod
-    def get_dataset_cache_path(cls, dataset_name: str) -> Path:
-        return cls.get_datasets_dir().joinpath(dataset_name).with_suffix(DEFAULT_CACHE_EXTENSION)
-
-    @classmethod
-    def get_datasets_dir(cls) -> Path:
-        datasets_dir = cls.get_data_dir().joinpath('datasets')
-        return cls.ensure_dir_exists(datasets_dir)
-
-    @classmethod
-    def get_data_dir(cls) -> Path:
-        data_dir = cls.get_project_root().joinpath('data')
-        return cls.ensure_dir_exists(data_dir)
-
-    @classmethod
-    def ensure_dir_exists(cls, dir_: Path) -> Path:
-        if not dir_.exists():
-            dir_.mkdir()
-        return dir_
-
-    @classmethod
-    def get_project_root(cls) -> Path:
-        """Returns project root folder."""
-        return Path(__file__).parents[2]
-
-    @classmethod
-    def get_meta_features_cache_path(cls, dataset_name: str, source_name: str):
-        meta_features_dir = cls.ensure_dir_exists(cls.get_data_dir().joinpath(source_name))
-        return meta_features_dir.joinpath(dataset_name).with_suffix('.pkl')
-
-    @classmethod
-    def get_meta_features_dict(cls, dataset_name: str, source_name: str) -> Dict[str, Any]:
-        meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name)
-        if not meta_features_file.exists():
-            return {}
-        with open(meta_features_file, 'rb') as f:
-            meta_features = pickle.load(f)
-        return meta_features
-
-    @classmethod
-    def update_meta_features_dict(cls, dataset_name: str, source_name: str, meta_features: Dict[str, Any]):
-        meta_features_file = cls.get_meta_features_cache_path(dataset_name, source_name)
-        meta_features_old = cls.get_meta_features_dict(dataset_name, source_name)
-        with open(meta_features_file, 'wb') as f:
-            meta_features_old.update(meta_features)
-            pickle.dump(meta_features, f)
diff --git a/meta_automl/data_preparation/dataset.py b/meta_automl/data_preparation/dataset.py
deleted file mode 100644
index 23dda83c..00000000
--- a/meta_automl/data_preparation/dataset.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from __future__ import annotations
-
-import pickle
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Union, Optional, List
-
-import numpy as np
-import pandas as pd
-import scipy as sp
-
-from meta_automl.data_preparation.data_manager import DataManager
-
-
-class NoCacheError(FileNotFoundError):
-    pass
-
-
-@dataclass
-class DatasetCache:
-    name: str
-    _cache_path: Optional[Path] = None
-    _id: Optional[int] = None
-
-    @property
-    def id(self):
-        return self._id or self.name
-
-    @property
-    def cache_path(self):
-        return self._cache_path or DataManager.get_dataset_cache_path(self.name)
-
-    @cache_path.setter
-    def cache_path(self, val):
-        self._cache_path = val
-
-    def from_cache(self) -> Dataset:
-        if not self.cache_path.exists():
-            raise NoCacheError(f'Dataset {self.name} not found!')
-        with open(self.cache_path, 'rb') as f:
-            dataset = pickle.load(f)
-        dataset.cache_path = self.cache_path
-        return dataset
-
-
-@dataclass
-class Dataset:
-    name: str
-    x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix]
-    y: Optional[Union[np.ndarray, pd.DataFrame]] = None
-    categorical_indicator: Optional[List[bool]] = None
-    attribute_names: Optional[List[str]] = None
-    cache_path: Optional[Path] = None
-    _id: Optional[int] = None
-
-    def dump_to_cache(self, cache_path: Optional[Path] = None) -> DatasetCache:
-        cache_path = cache_path or self.cache_path
-        with open(cache_path, 'wb') as f:
-            pickle.dump(self, f)
-        return DatasetCache(self.name, cache_path, self.id)
-
-    @property
-    def id(self):
-        return self._id or self.name
diff --git a/meta_automl/data_preparation/dataset/__init__.py b/meta_automl/data_preparation/dataset/__init__.py
new file mode 100644
index 00000000..62c0a37d
--- /dev/null
+++ b/meta_automl/data_preparation/dataset/__init__.py
@@ -0,0 +1,3 @@
+from .dataset_base import DatasetBase, DatasetData, DatasetIDType
+from .custom_dataset import DataNotFoundError, CustomDataset
+from .openml_dataset import OpenMLDataset, OpenMLDatasetIDType
diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py
new file mode 100644
index 00000000..505868f6
--- /dev/null
+++ b/meta_automl/data_preparation/dataset/custom_dataset.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import pickle
+from pathlib import Path
+from typing import Optional
+
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.dataset.dataset_base import DatasetData
+
+
+
+class DataNotFoundError(FileNotFoundError):
+    pass
+
+
+class CustomDataset(DatasetBase):
+
+    def get_data(self, cache_path: Optional[Path] = None) -> DatasetData:
+        cache_path = cache_path or self.cache_path
+        if not cache_path.exists():
+            raise DataNotFoundError(f'Dataset {self} is missing by the path "{cache_path}".')
+        with open(cache_path, 'rb') as f:
+            dataset_data = pickle.load(f)
+        return dataset_data
+
+    def dump_data(self, dataset_data: DatasetData, cache_path: Optional[Path] = None) -> CustomDataset:
+        cache_path = cache_path or self.cache_path
+        with open(cache_path, 'wb') as f:
+            pickle.dump(dataset_data, f)
+        return self
diff --git a/meta_automl/data_preparation/dataset/dataset_base.py b/meta_automl/data_preparation/dataset/dataset_base.py
new file mode 100644
index 00000000..fd84dee5
--- /dev/null
+++ b/meta_automl/data_preparation/dataset/dataset_base.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union, Optional, List, Any
+
+import numpy as np
+import pandas as pd
+import scipy as sp
+
+from meta_automl.data_preparation.file_system import CacheOperator, get_dataset_cache_path
+
+DatasetIDType = Any
+
+
+@dataclass
+class DatasetData:
+    x: Union[np.ndarray, pd.DataFrame, sp.sparse.csr_matrix]
+    y: Optional[Union[np.ndarray, pd.DataFrame]] = None
+    categorical_indicator: Optional[List[bool]] = None
+    attribute_names: Optional[List[str]] = None
+
+
+class DatasetBase(ABC, CacheOperator):
+
+    def __init__(self, id_: DatasetIDType, name: Optional[str] = None):
+        self.id_ = id_
+        self.name = name
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}(id_={self.id_}, name={self.name})'
+
+    @abstractmethod
+    def get_data(self) -> DatasetData:
+        raise NotImplementedError()
+
+    @property
+    def cache_path(self) -> Path:
+        return get_dataset_cache_path(self)
diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py
new file mode 100644
index 00000000..08fc5c1d
--- /dev/null
+++ b/meta_automl/data_preparation/dataset/openml_dataset.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Union
+
+import openml
+
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.dataset.dataset_base import DatasetData
+from meta_automl.data_preparation.file_system import update_openml_cache_dir
+
+OpenMLDatasetIDType = int
+
+update_openml_cache_dir()
+
+
+class OpenMLDataset(DatasetBase):
+
+    def __init__(self, id_: OpenMLDatasetIDType):
+        if isinstance(id_, str):
+            raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id. '
+                             f'Otherwise, you can perform search by {self.__class__.__name__}.from_search().')
+        self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                           error_if_multiple=True)
+        id_ = self._openml_dataset.id
+        name = self._openml_dataset.name
+        super().__init__(id_, name)
+
+    @classmethod
+    def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset:
+        openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                     **get_dataset_kwargs)
+        return cls(openml_dataset.id)
+
+    def get_data(self, dataset_format: str = 'dataframe') -> DatasetData:
+        X, y, categorical_indicator, attribute_names = self._openml_dataset.get_data(
+            target=self._openml_dataset.default_target_attribute,
+            dataset_format=dataset_format
+        )
+        return DatasetData(X, y, categorical_indicator, attribute_names)
diff --git a/meta_automl/data_preparation/datasets_loaders/__init__.py b/meta_automl/data_preparation/datasets_loaders/__init__.py
index 3908c8e0..4b91c8aa 100644
--- a/meta_automl/data_preparation/datasets_loaders/__init__.py
+++ b/meta_automl/data_preparation/datasets_loaders/__init__.py
@@ -1,2 +1,2 @@
 from .datasets_loader import DatasetsLoader
-from .openml_datasets_loader import OpenMLDatasetsLoader, OpenMLDatasetID
+from .openml_datasets_loader import OpenMLDatasetsLoader
diff --git a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py
index 8faba6d0..ab6ffa6c 100644
--- a/meta_automl/data_preparation/datasets_loaders/datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/datasets_loader.py
@@ -1,25 +1,17 @@
 from __future__ import annotations
 
 from abc import abstractmethod
-from typing import List, Type
+from typing import List
 
-from meta_automl.data_preparation.data_manager import DataManager
-from meta_automl.data_preparation.dataset import Dataset, DatasetCache, NoCacheError
+from meta_automl.data_preparation.dataset import DatasetBase
 
 
 class DatasetsLoader:
-    data_manager: Type[DataManager] = DataManager
 
     @abstractmethod
-    def load(self, *args, **kwargs) -> List[DatasetCache]:
+    def load(self, *args, **kwargs) -> List[DatasetBase]:
         raise NotImplementedError()
 
     @abstractmethod
-    def load_single(self, *args, **kwargs) -> DatasetCache:
+    def load_single(self, *args, **kwargs) -> DatasetBase:
         raise NotImplementedError()
-
-    def cache_to_memory(self, dataset: DatasetCache) -> Dataset:
-        try:
-            return dataset.from_cache()
-        except NoCacheError:
-            return self.load_single(dataset.id).from_cache()
diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
index 7959ca61..11294c45 100644
--- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
@@ -1,57 +1,43 @@
 from __future__ import annotations
 
-import shutil
-from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Optional
 
-import openml
+from golem.core.log import default_log
 
-from meta_automl.data_preparation.dataset import DatasetCache, Dataset
+from meta_automl.data_preparation.dataset import OpenMLDataset, OpenMLDatasetIDType
 from meta_automl.data_preparation.datasets_loaders import DatasetsLoader
 
-OpenMLDatasetID = Union[str, int]
-
-
-def _clear_openml_cache():
-    cache_dir = openml.config.get_cache_directory()
-    cache_dir = Path(cache_dir)
-    shutil.rmtree(cache_dir)
-
 
 class OpenMLDatasetsLoader(DatasetsLoader):
+    def __init__(self, allow_names: bool = False):
+        self.dataset_ids = []
+        self._allow_names = allow_names
 
-    def __init__(self):
-        self.dataset_sources = []
-
-    def load(self, dataset_sources: List[OpenMLDatasetID]) -> List[DatasetCache]:
-        self.dataset_sources = dataset_sources
+    def load(self, dataset_ids: List[Union[OpenMLDatasetIDType, str]],
+             allow_names: Optional[bool] = None) -> List[OpenMLDataset]:
+        # Loaded ids are recorded in `self.dataset_ids` by `load_single`; iterate the argument, not that list.
+        allow_names = self._allow_names if allow_names is None else allow_names
 
         datasets = []
         # TODO: Optimize like this
         #  https://github.com/openml/automlbenchmark/commit/a09dc8aee96178dd14837d9e1cd519d1ec63f804
-        for source in self.dataset_sources:
-            dataset = self.load_single(source)
+        for dataset_id in dataset_ids:
+            dataset = self.load_single(dataset_id, allow_name=allow_names)
             datasets.append(dataset)
         return datasets
 
-    def load_single(self, source: OpenMLDatasetID):
-        try:
-            return self.get_openml_dataset(source)
-        finally:
-            _clear_openml_cache()
-
-    def get_openml_dataset(self, dataset_id: OpenMLDatasetID, force_download: bool = False) -> DatasetCache:
-        openml_dataset = openml.datasets.get_dataset(dataset_id, download_data=False, download_qualities=False)
-        name = openml_dataset.name.lower()
-        dataset_cache_path = self.data_manager.get_dataset_cache_path(name)
-        if dataset_cache_path.exists() and not force_download:
-            dataset_cache = DatasetCache(name, dataset_cache_path)
+    def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str],
+                    allow_name: Optional[bool] = None) -> OpenMLDataset:
+        allow_name = self._allow_names if allow_name is None else allow_name
+
+        if allow_name:
+            dataset = OpenMLDataset.from_search(dataset_id)
         else:
-            dataset_id = openml_dataset.id
-            X, y, categorical_indicator, attribute_names = openml_dataset.get_data(
-                target=openml_dataset.default_target_attribute,
-                dataset_format='array'
-            )
-            dataset = Dataset(name, X, y, categorical_indicator, attribute_names, _id=dataset_id)
-            dataset_cache = dataset.dump_to_cache(dataset_cache_path)
-        return dataset_cache
+            dataset = OpenMLDataset(dataset_id)
+
+        self.dataset_ids.append(dataset.id_)
+        return dataset
+
+    @property
+    def _log(self):
+        return default_log(self)
diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py
new file mode 100644
index 00000000..a228da6e
--- /dev/null
+++ b/meta_automl/data_preparation/file_system/__init__.py
@@ -0,0 +1,5 @@
+from meta_automl.data_preparation.file_system.file_system import PathType, get_project_root, get_data_dir
+from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_dataset_cache_path,
+                                                            get_dataset_cache_path_by_id, get_meta_features_cache_path,
+                                                            get_local_meta_features, update_local_meta_features,
+                                                            get_openml_cache_dir, update_openml_cache_dir)
diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py
new file mode 100644
index 00000000..99daf965
--- /dev/null
+++ b/meta_automl/data_preparation/file_system/cache.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+import pickle
+from pathlib import Path
+
+from typing import Type, Any, Dict, TYPE_CHECKING
+
+import openml
+
+from meta_automl.data_preparation.file_system.cache_properties import CacheProperties, CacheType
+from meta_automl.data_preparation.file_system.file_system import get_data_dir, ensure_dir_exists
+
+if TYPE_CHECKING:
+    from meta_automl.data_preparation.dataset import DatasetBase
+    from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor
+
+
+class CacheOperator:
+    pass
+
+
+def get_openml_cache_dir() -> Path:
+    return get_data_dir().joinpath('openml_cache')
+
+
+def get_full_openml_cache_dir() -> Path:
+    return get_data_dir().joinpath('openml_cache/org/openml/www')
+
+
+def update_openml_cache_dir():
+    openml_cache_path = str(get_openml_cache_dir())
+    openml.config.set_cache_directory(openml_cache_path)
+
+
+def _get_cache_path(object_class: Type[CacheOperator], object_id: str, _create_parent_dir: bool = True) -> Path:
+    cache_properties = get_cache_properties(object_class.__name__)
+    directory = cache_properties.dir_
+    path = cache_properties.template.format(id_=object_id)
+    path = directory.joinpath(path)
+    if _create_parent_dir:
+        ensure_dir_exists(directory)
+    return path
+
+
+def get_dataset_cache_path(dataset: DatasetBase) -> Path:
+    class_ = dataset.__class__
+    id_ = dataset.id_
+    return _get_cache_path(class_, str(id_))
+
+
+def get_dataset_cache_path_by_id(class_: Type[DatasetBase], id_: Any) -> Path:
+    return _get_cache_path(class_, str(id_))
+
+
+def get_meta_features_cache_path(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Path:
+    return _get_cache_path(extractor_class, str(dataset_id))
+
+
+def get_local_meta_features(extractor_class: Type[MetaFeaturesExtractor], dataset_id: Any) -> Dict[str, Any]:
+    meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id)
+    if not meta_features_file.exists():
+        return {}
+    with open(meta_features_file, 'rb') as f:
+        meta_features = pickle.load(f)
+    return meta_features
+
+
+def update_local_meta_features(extractor_class: Type[MetaFeaturesExtractor],
+                               dataset_id: Any, meta_features: Dict[str, Any]):
+    meta_features_file = get_meta_features_cache_path(extractor_class, dataset_id)
+    meta_features_old = get_local_meta_features(extractor_class, dataset_id)
+    with open(meta_features_file, 'wb') as f:
+        meta_features_old.update(meta_features)
+        pickle.dump(meta_features_old, f)
+
+
+def get_cache_properties(class_name: str) -> CacheProperties:
+    cache_properties_by_class_name = {
+        'OpenMLDataset': CacheProperties(
+            type_=CacheType.directory,
+            dir_=get_full_openml_cache_dir().joinpath('datasets'),
+            template='{id_}'),
+        'CustomDataset': CacheProperties(
+            type_=CacheType.file,
+            dir_=get_data_dir().joinpath('datasets/custom_dataset'),
+            template='{id_}.pkl'),
+        'PymfeExtractor': CacheProperties(
+            type_=CacheType.file,
+            dir_=get_data_dir().joinpath('metafeatures/pymfe'),
+            template='{id_}.pkl'),
+    }
+    try:
+        return cache_properties_by_class_name[class_name]
+    except KeyError as e:
+        raise KeyError(f'Cache properties for the class {class_name} are not defined.').with_traceback(e.__traceback__)
diff --git a/meta_automl/data_preparation/file_system/cache_properties.py b/meta_automl/data_preparation/file_system/cache_properties.py
new file mode 100644
index 00000000..7374df08
--- /dev/null
+++ b/meta_automl/data_preparation/file_system/cache_properties.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from meta_automl.data_preparation.file_system import PathType
+
+
+class CacheType(Enum):  # on-disk form of a cache entry
+    file = 'file'  # NOTE(review): presumably one file per cached item — confirm
+    directory = 'directory'  # NOTE(review): presumably one directory per cached item — confirm
+
+
+@dataclass
+class CacheProperties:  # describes how and where a class's cached items are stored; every field is optional
+    type_: Optional[CacheType] = None  # whether a cache entry is a file or a directory
+    dir_: Optional[Path] = None  # directory that holds the cache entries
+    template: Optional[PathType] = None  # entry name pattern; NOTE(review): presumably formatted with `id_` — confirm
diff --git a/meta_automl/data_preparation/file_system/file_system.py b/meta_automl/data_preparation/file_system/file_system.py
new file mode 100644
index 00000000..ff2c3743
--- /dev/null
+++ b/meta_automl/data_preparation/file_system/file_system.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from os import PathLike
+from pathlib import Path
+from typing import Union
+
+PathType = Union[PathLike, str]  # alias for values usable as a filesystem path
+
+DATA_SUBDIR = 'data'  # data directory relative to the project root; read by get_data_dir() on every call
+
+
+def ensure_dir_exists(dir_: Path) -> Path:
+    """Create ``dir_`` (or its parent, when ``dir_`` is an existing file) if missing and return that directory."""
+    if dir_.is_file():
+        dir_ = dir_.parent
+    dir_.mkdir(parents=True, exist_ok=True)  # exist_ok removes the exists()/mkdir() race between processes
+    return dir_
+
+
+def get_project_root() -> Path:
+    """Returns project root folder: the directory three levels above this module's own directory."""
+    return Path(__file__).parents[3]
+
+
+def get_data_dir() -> Path:
+    """Return the data directory: ``DATA_SUBDIR`` joined onto the project root."""
+    return get_project_root().joinpath(DATA_SUBDIR)
diff --git a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py
index dc7ccf5a..d81e8cbd 100644
--- a/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/meta_features_extractor.py
@@ -1,28 +1,28 @@
 from __future__ import annotations
 
-from abc import abstractmethod
-from typing import Optional, Iterable, Dict, Any, Type
+from abc import abstractmethod, ABC
+from typing import Optional, Iterable, Dict, Any
 
 import pandas as pd
 
-from meta_automl.data_preparation.data_manager import DataManager
+from meta_automl.data_preparation.dataset import DatasetIDType
+from meta_automl.data_preparation.file_system import (CacheOperator, get_local_meta_features,
+                                                      update_local_meta_features)
 
 
-class MetaFeaturesExtractor:
-    DEFAULT_PARAMS: Optional[Dict[str, Any]] = None
-    SOURCE: Optional[str] = None
-    data_manager: Type[DataManager] = DataManager
+class MetaFeaturesExtractor(ABC, CacheOperator):
+    default_params: Optional[Dict[str, Any]] = None
 
     @abstractmethod
     def extract(self, datasets) -> pd.DataFrame:
         raise NotImplementedError()
 
-    def _get_meta_features_cache(self, dataset_name: str, meta_feature_names: Iterable[str]):
-        cache = self.data_manager.get_meta_features_dict(dataset_name, self.SOURCE)
+    def _get_meta_features_cache(self, dataset_id: DatasetIDType, meta_feature_names: Iterable[str]):
+        cache = get_local_meta_features(self.__class__, str(dataset_id))
         if set(meta_feature_names) ^ cache.keys():
             return None
         else:
             return {mf_name: cache[mf_name] for mf_name in meta_feature_names}
 
-    def _update_meta_features_cache(self, dataset_name: str, meta_features_dict: Dict[str, Any]):
-        self.data_manager.update_meta_features_dict(dataset_name, self.SOURCE, meta_features_dict)
+    def _update_meta_features_cache(self, dataset_id: DatasetIDType, meta_features_dict: Dict[str, Any]):
+        update_local_meta_features(self.__class__, dataset_id, meta_features_dict)
diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index 8dbc728f..edfa6925 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -6,17 +6,16 @@
 from golem.core.log import default_log
 from pymfe.mfe import MFE
 
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType
 from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader
 from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor
 
 
 class PymfeExtractor(MetaFeaturesExtractor):
-    DEFAULT_PARAMS = {'groups': 'default'}
-    SOURCE = 'pymfe'
+    default_params = {'groups': 'default'}
 
     def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: DatasetsLoader = None):
-        self.extractor_params = extractor_params if extractor_params is not None else self.DEFAULT_PARAMS
+        self.extractor_params = extractor_params if extractor_params is not None else self.default_params
         self._datasets_loader = datasets_loader or OpenMLDatasetsLoader()
         self._extractor = MFE(**self.extractor_params)
         self._logger = default_log(self)
@@ -27,21 +26,21 @@ def datasets_loader(self) -> DatasetsLoader:
             raise ValueError("Datasets loader not provided!")
         return self._datasets_loader
 
-    def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: bool = False,
-                use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame:
+    def extract(self, datasets_or_ids: List[Union[DatasetBase, DatasetIDType]],
+                fill_input_nans: bool = False, use_cached: bool = True, update_cached: bool = True) -> pd.DataFrame:
         meta_features = {}
         meta_feature_names = self._extractor.extract_metafeature_names()
-        load_dataset = self.datasets_loader.cache_to_memory
-        for dataset in datasets:
-            if isinstance(dataset, str):
-                dataset = DatasetCache(dataset)
 
-            self._logger.info(f'Extracting meta features of the dataset {dataset.name}...')
+        for dataset in datasets_or_ids:
+            if not isinstance(dataset, DatasetBase):
+                dataset = self._datasets_loader.load_single(dataset)
+
+            self._logger.info(f'Extracting meta features of the dataset {dataset}...')
             if (use_cached and
-                    (mfs := self._get_meta_features_cache(dataset.name, meta_feature_names))):
-                meta_features[dataset.name] = mfs
+                    (mfs := self._get_meta_features_cache(dataset.id_, meta_feature_names))):
+                meta_features[dataset.id_] = mfs
             else:
-                loaded_dataset = load_dataset(dataset)
+                loaded_dataset = dataset.get_data(dataset_format='array')
                 cat_cols = [i for i, val in enumerate(loaded_dataset.categorical_indicator) if val]
                 x = loaded_dataset.x
                 y = loaded_dataset.y
@@ -51,8 +50,8 @@ def extract(self, datasets: List[Union[DatasetCache, str]], fill_input_nans: boo
                 feature_names, dataset_features = mfe.extract(out_type=tuple)
                 mfs = dict(zip(feature_names, dataset_features))
                 if update_cached:
-                    self._update_meta_features_cache(dataset.name, mfs)
-                meta_features[dataset.name] = mfs
+                    self._update_meta_features_cache(dataset.id_, mfs)
+                meta_features[dataset.id_] = mfs
         meta_features = pd.DataFrame.from_dict(meta_features, orient='index')
         return meta_features
 
diff --git a/meta_automl/data_preparation/model.py b/meta_automl/data_preparation/model.py
index 25de781c..d437ea24 100644
--- a/meta_automl/data_preparation/model.py
+++ b/meta_automl/data_preparation/model.py
@@ -3,13 +3,16 @@
 
 from golem.core.optimisers.fitness import Fitness
 
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.dataset import DatasetBase
+
+
+PredictorType = Any
 
 
 @dataclass
 class Model:
-    predictor: Any
+    predictor: PredictorType
     fitness: Fitness
     fitness_metric_name: str
-    dataset_cache: DatasetCache
+    dataset: DatasetBase
     metadata: Dict[str, Any] = field(default_factory=dict)
diff --git a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
index ae7f0b38..599056fa 100644
--- a/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
+++ b/meta_automl/data_preparation/models_loaders/fedot_pipelines_loader.py
@@ -14,8 +14,8 @@
 from golem.core.log import default_log
 from tqdm import tqdm
 
-from meta_automl.data_preparation.data_manager import PathType
-from meta_automl.data_preparation.dataset import DatasetCache
+from meta_automl.data_preparation.file_system import PathType
+from meta_automl.data_preparation.dataset import DatasetBase
 from meta_automl.data_preparation.datasets_loaders import DatasetsLoader, OpenMLDatasetsLoader
 from meta_automl.data_preparation.model import Model
 from meta_automl.data_preparation.models_loaders import ModelsLoader
@@ -29,10 +29,9 @@ def evaluate_classification_fedot_pipeline(pipeline, input_data):
     return fitness
 
 
-def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pipeline], datasets_loader: DatasetsLoader,
-                                n_best: int = 1) -> List[Model]:
-    loaded_dataset = datasets_loader.cache_to_memory(dataset_cache)
-    X, y_test = loaded_dataset.x, loaded_dataset.y
+def get_n_best_fedot_performers(dataset: DatasetBase, pipelines: List[Pipeline], n_best: int = 1) -> List[Model]:
+    data = dataset.get_data()
+    X, y_test = data.x, data.y
     input_data = InputData(idx=np.arange(0, len(X)), features=X, target=y_test, data_type=DataTypesEnum.table,
                            task=Task(TaskTypesEnum.classification))
     fitnesses = []
@@ -41,14 +40,14 @@ def get_n_best_fedot_performers(dataset_cache: DatasetCache, pipelines: List[Pip
     for pipeline in tqdm(pipelines, desc='Evaluating pipelines'):
         fitness = evaluate_classification_fedot_pipeline(pipeline, input_data)
         fitnesses.append(fitness)
-        models.append(Model(pipeline, fitness, metric_name, dataset_cache))
+        models.append(Model(pipeline, fitness, metric_name, dataset))
 
     best_models = [models.pop(np.argmax(fitnesses)) for _ in range(min(n_best, len(pipelines)))]
     return best_models
 
 
 class FEDOTPipelinesLoader(ModelsLoader):
-    def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Literal['auto']] = 'auto',
+    def __init__(self, datasets_to_load: Union[List[Union[DatasetBase, str]], Literal['auto']] = 'auto',
                  candidate_pipelines: Optional[List[List[Pipeline]]] = None,
                  candidate_pipeline_paths: Optional[List[List[PathType]]] = None,
                  launch_dir: Optional[PathType] = None,
@@ -56,12 +55,12 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter
 
         self.log = default_log(self)
 
-        self.datasets_loader = datasets_loader or OpenMLDatasetsLoader()
+        self.datasets_loader = datasets_loader or OpenMLDatasetsLoader(allow_names=True)
 
         self.launch_dir: Path = Path(launch_dir) if isinstance(launch_dir, str) else launch_dir
 
-        self._datasets: List[DatasetCache] = (self._define_datasets() if datasets_to_load == 'auto'
-                                              else self._dataset_names_to_cache(datasets_to_load))
+        self._datasets: List[DatasetBase] = (self._define_datasets() if datasets_to_load == 'auto'
+                                             else self._get_datasets_from_names(datasets_to_load))
 
         self.candidate_pipelines = candidate_pipelines
 
@@ -71,8 +70,8 @@ def __init__(self, datasets_to_load: Union[List[Union[DatasetCache, str]], Liter
 
     def load(self, datasets: Union[List[str], Literal['auto']] = 'auto', n_best: int = 1) -> List[List[Model]]:
         if datasets != 'auto':
-            datasets = self._dataset_names_to_cache(datasets)
-            difference = set(d.name for d in datasets) - set(self.dataset_names)
+            datasets = self._get_datasets_from_names(datasets)
+            difference = set(d.name for d in datasets) - set(self.dataset_ids)
             if difference:
                 raise ValueError(f'Results for these datasets are not available: {difference}.')
         else:
@@ -89,10 +88,10 @@ def _define_pipeline_paths(self) -> List[List[Path]]:
         if not self.launch_dir:
             raise ValueError('Launch dir or model paths must be provided!')
 
-        dataset_names = self.dataset_names
-        datasets_models_paths = dict(zip(dataset_names, [[]] * len(dataset_names)))
+        dataset_ids = self.dataset_ids
+        datasets_models_paths = dict(zip(dataset_ids, [[]] * len(dataset_ids)))
 
-        for dataset_name in tqdm(dataset_names, desc='Defining model paths', unit='dataset'):
+        for dataset_name in tqdm(dataset_ids, desc='Defining model paths', unit='dataset'):
             for model_path in self.launch_dir.joinpath(dataset_name).glob(r'FEDOT*\*\*\launch_*.json'):
                 datasets_models_paths[dataset_name].append(model_path)
 
@@ -104,28 +103,27 @@ def _import_pipelines(self, candidate_pipeline_paths: List[List[PathType]]):
                                    desc='Importing pipelines', unit='dataset'):
             candidates_for_dataset = [Pipeline.from_serialized(str(p)) for p in paths]
             if not candidates_for_dataset:
-                self.log.warning(f'No pipelines found for the dataset "{dataset.name}".')
+                self.log.warning(f'No pipelines found for the dataset "{dataset}".')
             candidate_pipelines.append(candidates_for_dataset)
         self.candidate_pipelines = candidate_pipelines
 
-    def _define_datasets(self) -> List[DatasetCache]:
+    def _define_datasets(self) -> List[DatasetBase]:
         if not self.launch_dir:
             raise ValueError('Launch dir or datasets must be provided!')
 
         datasets = list({p.parents[2].name for p in self.launch_dir.glob(r'*\FEDOT*\*\launch_0')})
         datasets.sort()
-        datasets = self._dataset_names_to_cache(datasets)
+        datasets = self._get_datasets_from_names(datasets)
         return datasets
 
     @property
-    def dataset_names(self):
-        return [d.name if isinstance(d, DatasetCache) else d for d in self._datasets]
+    def dataset_ids(self):
+        return [d.name if isinstance(d, DatasetBase) else d for d in self._datasets]
 
-    @staticmethod
-    def _dataset_names_to_cache(datasets: List[Union[str, DatasetCache]]) -> List[DatasetCache]:
+    def _get_datasets_from_names(self, datasets: List[Union[str, DatasetBase]]) -> List[DatasetBase]:
         new_list = []
         for dataset in datasets:
-            if isinstance(dataset, str):
-                dataset = DatasetCache(dataset)
+            if not isinstance(dataset, DatasetBase):
+                dataset = self.datasets_loader.load_single(dataset)
             new_list.append(dataset)
         return new_list
diff --git a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
index e26b896e..7c38b9d8 100644
--- a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
+++ b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
@@ -7,12 +7,13 @@
 from fedot.core.pipelines.pipeline import Pipeline
 from golem.core.optimisers.fitness import SingleObjFitness
 
-from meta_automl.data_preparation.data_manager import DataManager
-from meta_automl.data_preparation.dataset import DatasetCache
+
+from meta_automl.data_preparation.dataset import OpenMLDataset
+from meta_automl.data_preparation.file_system import get_data_dir
 from meta_automl.data_preparation.model import Model
 from meta_automl.data_preparation.models_loaders import ModelsLoader
 
-DEFAULT_KNOWLEDGE_BASE_PATH = DataManager.get_data_dir().joinpath('knowledge_base_0')
+DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir().joinpath('knowledge_base_0')
 
 
 class KnowledgeBaseModelsLoader(ModelsLoader):
@@ -21,21 +22,21 @@ def __init__(self, knowledge_base_path: Union[str, PathLike] = DEFAULT_KNOWLEDGE
         self.df_knowledge_base: Optional[pd.DataFrame] = None
         self.df_datasets: Optional[pd.DataFrame] = None
 
-    def load(self, dataset_names: Optional[Sequence[str]] = None,
+    def load(self, dataset_ids: Optional[Sequence[str]] = None,
              fitness_metric: str = 'f1') -> List[Model]:
         if self.df_knowledge_base is None:
             knowledge_base_split_file = self.knowledge_base_path.joinpath('knowledge_base.csv')
             self.df_knowledge_base = pd.read_csv(knowledge_base_split_file)
 
-        if dataset_names is None:
-            dataset_names = self.parse_datasets()['dataset_name']
+        if dataset_ids is None:
+            dataset_ids = self.parse_datasets()['dataset_id']
 
         df_knowledge_base = self.df_knowledge_base
-        df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_name'].isin(dataset_names)]
+        df_knowledge_base = df_knowledge_base[df_knowledge_base['dataset_id'].isin(dataset_ids)]
 
         cached_datasets = {}
-        for name in dataset_names:
-            cached_datasets[name] = DatasetCache(name)
+        for id_ in dataset_ids:
+            cached_datasets[id_] = OpenMLDataset(id_)
 
         models = []
         for _, row in df_knowledge_base.iterrows():
@@ -45,7 +46,7 @@ def load(self, dataset_names: Optional[Sequence[str]] = None,
             metric_value = row[fitness_metric]
             fitness = SingleObjFitness(metric_value)
             metadata = dict(row)
-            dataset_cache = cached_datasets[row['dataset_name']]
+            dataset_cache = cached_datasets[row['dataset_id']]
             model = Model(predictor, fitness, fitness_metric, dataset_cache, metadata)
             models.append(model)
         return models
diff --git a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
index 09720a1e..40008d00 100644
--- a/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
+++ b/meta_automl/meta_algorithm/datasets_similarity_assessors/model_based_similarity_assessors.py
@@ -1,10 +1,11 @@
 from abc import ABC
-from typing import Optional, Dict, Any, List, Iterable
+from typing import Optional, List, Iterable
 
 import numpy as np
 import pandas as pd
 from sklearn.neighbors import NearestNeighbors
 
+from meta_automl.data_preparation.dataset import DatasetIDType
 from meta_automl.meta_algorithm.datasets_similarity_assessors.datasets_similarity_assessor import \
     DatasetsSimilarityAssessor
 
@@ -13,7 +14,7 @@ class ModelBasedSimilarityAssessor(ABC, DatasetsSimilarityAssessor):
     def __init__(self, model, n_best: int = 1):
         self._inner_model = model
         self.n_best = n_best
-        self._datasets: Optional[Iterable[str]] = None
+        self._datasets: Optional[Iterable[DatasetIDType]] = None
 
 
 class KNeighborsBasedSimilarityAssessor(ModelBasedSimilarityAssessor):
@@ -21,7 +22,7 @@ def __init__(self, n_neighbors: int = 1, **model_params):
         model = NearestNeighbors(n_neighbors=n_neighbors, **model_params)
         super().__init__(model, n_neighbors)
 
-    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
+    def fit(self, meta_features: pd.DataFrame, datasets: Iterable[DatasetIDType]):
         meta_features = self.preprocess_meta_features(meta_features)
         self._datasets = np.array(datasets)
         self._inner_model.fit(meta_features)
@@ -30,7 +31,7 @@ def fit(self, meta_features: pd.DataFrame, datasets: Iterable[str]):
     def preprocess_meta_features(meta_features: pd.DataFrame) -> pd.DataFrame:
         return meta_features.dropna(axis=1, how='any')
 
-    def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[str]]:
+    def predict(self, meta_features: pd.DataFrame, return_distance: bool = False) -> Iterable[Iterable[DatasetIDType]]:
         dataset_indexes = self._inner_model.kneighbors(meta_features, return_distance=return_distance)
         if return_distance:
             distances, dataset_indexes = dataset_indexes
diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
index a9ca0d97..c653a173 100644
--- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
@@ -1,8 +1,9 @@
 from abc import abstractmethod
-from typing import List, Dict, Iterable, Optional
+from typing import List, Dict, Iterable
 
 import pandas as pd
 
+from meta_automl.data_preparation.dataset import DatasetIDType
 from meta_automl.data_preparation.model import Model
 from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor
 
@@ -17,13 +18,13 @@ def predict(self, *args, **kwargs) -> List[List[Model]]:
 class SimpleSimilarityModelAdvisor(ModelAdvisor):
     def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor):
         self.similarity_assessor = fitted_similarity_assessor
-        self.best_models: Dict[str, List[Model]] = {}
+        self.best_models: Dict[DatasetIDType, List[Model]] = {}
 
     @property
     def datasets(self):
         return self.similarity_assessor.datasets
 
-    def fit(self, dataset_names_to_best_pipelines: Dict[str, List[Model]]):
+    def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, List[Model]]):
         self.best_models.update(dataset_names_to_best_pipelines)
         return self
 
diff --git a/requirements.txt b/requirements.txt
index eca13d853ca1f8e55c583bd3790a78a679ffee4d..ad0a22332f176f2c866188116575624428ac1536 100644
GIT binary patch
delta 38
pcmZ3-e1>_$I!3t?h75*OhIEE}h8!TB%U}zH1`K)(MnG)9006<B2fF|O

delta 7
OcmX@ZypDOpIz|8ti~`^Q

diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 00000000..77cbabd1
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,40 @@
+import time
+from pathlib import Path
+
+import pytest
+
+from test import constants
+from meta_automl.data_preparation.file_system import file_system, get_data_dir, get_project_root
+from meta_automl.data_preparation.file_system import update_openml_cache_dir
+
+
+def pytest_configure():
+    """Crucial setup & checks, executed in this order, to avoid misplacing data during the tests."""
+    check_project_root()  # abort early if the project root is resolved incorrectly
+    set_data_dir()  # switch the package's data subdirectory to the test one
+    check_data_dir()  # verify that the switch above took effect
+    update_openml_cache_dir()  # NOTE(review): presumably re-points the OpenML cache at the new data dir — confirm
+
+
+def check_project_root():
+    """Abort the test session unless `get_project_root()` equals the directory that contains `test/`."""
+    actual_root = Path(__file__).parents[1]
+    if (root := get_project_root()) != actual_root:
+        pytest.exit(f'The function `get_project_root()` should point to "{actual_root}". '
+                    f'Got "{root}" instead', 1)
+
+
+def set_data_dir():  # overrides the module-level DATA_SUBDIR of file_system with the test data subdirectory
+    file_system.DATA_SUBDIR = constants.TEST_DATA_SUBDIR
+
+
+def check_data_dir():
+    """Abort the test session unless `get_data_dir()` is the test data subdirectory of the project root."""
+    if (data_dir := get_data_dir()).relative_to(get_project_root()) != Path(constants.TEST_DATA_SUBDIR):
+        pytest.exit(f'The function `get_data_dir()` should point to "test/data" (relative to project root). '
+                    f'Got "{data_dir}" instead', 1)
+
+
+@pytest.fixture(scope="session", autouse=True)
+def set_test_start_timestamp():  # session-scoped autouse fixture: stores the current time in test constants
+    constants.TEST_START_TIMESTAMP = time.time()
diff --git a/test/constants.py b/test/constants.py
index 2eea547c..9a9f9b12 100644
--- a/test/constants.py
+++ b/test/constants.py
@@ -1,2 +1,5 @@
-CACHED_DATASETS = ['australian', 'monks-problems-1']
-DATASETS_WITH_CACHED_META_FEATURES = ['australian', 'monks-problems-2']
+OPENML_CACHED_DATASETS = [40981, 333]  # australian, monks-problems-1
+DATASETS_WITH_CACHED_META_FEATURES = [40981, 334]  # australian, monks-problems-2
+OPENML_DATASET_IDS_TO_LOAD = [40981, 1464]  # australian, blood-transfusion-service-center
+TEST_DATA_SUBDIR = 'test/data'
+TEST_START_TIMESTAMP = None
diff --git a/test/data/datasets/australian.pkl b/test/data/datasets/australian.pkl
deleted file mode 100644
index be8a9ae3c92f0658eaecbb58451f6cb9787a1ed5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 41870
zcmd6w53HSMec#^%Qvyv}w+mgmrrjN+t=p#T(wdTPZg~?-8^|AAnzm`15<9WMX`SRQ
zI9uq_?bJ2Fp?AzHw^^E{>?}>0$1-Lige-*6QG}uhMaV)3MJPf?D2k#8RTheng`#wy
z&wHP*{p8_2*EZe1SA6gH_vicjJ%8TkJ?GfD=1eyI#h?ClH(aZi-nM7|uA6q<wf~OW
z_Pt^EF6@;%_pI!?bJzY`@3?&kyl>C`wbfT`C#Cg~wV(czwLf0lvbyQ^UAOI7TYb&0
zyWY3|&RzR%-F5rg>Unpst-kj5yKY-~{~K<(<IX*Axb3ce`)}QK=bgLWzqWeeO?U3O
z<&N7C-*t;^R$n;?X?^?db;(E8uI@gt_SW-mUi;wMyVkZ`ecrX7JpbzNzjtl#_pZKj
z_x|^<?6K<n_iSCe@6fv76|3iMTf6SM>)!EqT*Hs6yZvi7AG-3y{Vs0umi0B}XZ$fX
zHikugHqzGp7wOWs&9`%1q)T0!kKgu*J{(>AmXojhL^BpkPWEbfJ^Gz{9Br*1>$NDK
zc-=oeYp(6ojExgrH1S)mYd5k}*J|pxwpTQ<hhP;q?-gCk>)F%!RvW+OF}ocnnvAFK
z6+b){OD;SVHnoX`Yi;ast4X|OHMQT`b&oM^9b06#Js0_@57)Y}*jPTgu$tK6#MU$G
zKkHtt-PYCl<XVm<asBp6EZQ@1UEkK4JC|5H)<*T|oAGPEwd=lPJzITx*4o(V)%xi%
z#;N&kxE?2VxE?FL$8uw_=v!9n)%H%T)wh0hHOFqX@i)pRC+D)xE4=Q#{52!7T9@%s
z7k`l^`EYctpLn><$Cq=(-Y6HX-0Et7tXG>K^XtB2W7YXZ^{pm-YJD`}<XS)Rn#VL_
zvARaP){NP8Ec(RG#A9QnCOLgC{R3Ba-C^+kZ|%Ajco|H3a7EYs4Y3!m>?VH&zZu*O
zwv+$Ww|0|9us;l}+eB;&_z*Y`|3dg*!hZxV#$JY3;WxoQ3LeD%2G~aL2iE<`pTM60
zpCi5x{&!#%JcR#Y@GSUm_($NYz^6dR9KVm)gW%`D^~4<i0{B~ixp#Nq-@tq)V2b^x
zz#emI--m6VKZc#>Ua*9H0{#!iz=hy^umXMqJW1{cVb}1p@MG{*@N@LL9ex=9_uymj
zhu~iU2eH2mT;r?24lqN1BeC7sKLu9s*^ll$*f)cdrN*_mC;IoYUmqdoTHnoj{tR}F
z*T9#9Uj|p<yU%|PyM{Y~_v{w*{{`FiP3ZI2@T-X(f_IR2zSn?HgE`nw{3dV-_Fnin
zd>_07GmoyLhxcRoif;1X>Epeda&F!R=e~UqyCbIdFR?GA?kMa0UF^feyvNRYEA<E9
zD}a0S1RD2u6&%5S3b=mH8Fk$eG>^b<hgs9a`|o-Cw}t;1{6+Fd!4!Drej7ZEy%~N0
z-bv5z#r_^}CGfrt`%HWXm^1I6Cw@8c{lGna9Cn|64BLJF7qI8Uz49DB1hxVDd>yc^
z?w7zj!NvHW06&7g1-6&Zj_2_L@OrS7J-HWNBj-Ky-rD;=vqtyY=ZiH>9>u<z*uRH6
za258S!~YYwrk?rF`fl<u?3bu{Kl~`{S>KNBoILy6@P~VW?VkJ^yhN>g?3!ob*NM4D
z2eCcNYV^IA9|he1?lSlgVC};h+f3|@Xq=xl$MHV^d)6-n^qqVh{#U^B_*=j;y#;)R
zKKFpPVE;$pzU&0=1wJ!-VE5106rZ^X_vu#bONi|R&fR-|J!5$O-Nz+xH+T;0BDMy4
z>fDbV-~d|h-7)(8Cj9#V{lxKI_g}KE|BAhg?V7!x6Yw$YYvFyM!#)AtfPE3{YnHDE
z&gWt953oHiuJy;kW5iwnuFq$HIduOR)}Jvqd#T~sI@Q}XUcy|QKibLt@b$oF{~B`6
z>o0)w;!Jdpfb)Spe-b#a_rRX-dtvXB&&{jBSIN0Ij(Hxv{|ELe_TK{U^^?Fp#_k91
zgJ=Gy!8fosgKMze2hVd4ei8g9@OfhH%k}IjXSur!^u&G`_Vwl=U~kXB$FYA4{z334
z_6^{5*lVz7)${S|@SlO70sjcRNbIlRIs0b6e~SHDa4+`1hP|g>!u}!nGI$Bx&l->5
z`}*jacP&1<uKSOH>-;41_1yYwxE^1Vz6=hd-v<8#Jsu|hG2nT=2>-A?xn*iUjO{+T
zUhk1{pUrDv&y#EZ9QIydKlW*OAAAP;ugQIse%{a9z_*CGCeNfbKLGn2AIE+b?Af))
zAsBnIO3b<W9`C)m2RQGy5qk~TjqmgHk~R3gUpOCo+z6b*H^ILl=Iik*Vdv>{WH0u3
z@<C#*$=B##z&?!qR@k}y6xfdMYw-d29q<;Q#@+|9UC)ic^XTiB=gRqgfZl%te*yoG
zVBhopIk*u2Q(y`nMdKPdW0OCHy+_;ODL%c2`^Ia?@OtVQIZ6H^;A_q!@K2~^+&7`&
z`R<+|=2_tUP1yHgy`HZdi2n*2$9OmR9pKr#0sIqi9pEfaJ_JwDV*B}M{p`Y5;r}M=
z>%sHr?!$fnytlx4xvoFNe;4pM`~&zNuoL?-;Jx1m&L>8X;r-Ba<=(%7n6G!=1G|6j
z&oSV0>G|6PT$`_zEAXd*=lxmcV&1*D3%&{*!9GFmA7WpM?OZOv_I!N>TkY4V^O=4<
z_&h%QH_ZJ3>e##C-mb#86MqWc0@N~I_gUbxpw|2DS@7)eoDJvL@jQnQ0M;}-OE+M@
z9W84c=6e9!e%pz6;056PJzL+x-T_>@`{Did^~-TLfo}uzPlBbwAA+9)K2yGT8fT3|
zt@X~|nja!}xp7`&)?i;_4#LOLEMa?}9M}45;H`l3H|)P_@;S8LGw5@;6I?>x-W@%C
z<~+ZTz}~|tF>?<A>s*K9eF3}?oon#9w9cAC*smo%qi(qO#CqUssdIB)o)bTN9)o-I
z);X`Ou(tQldvEVez?|<*zRr9Z-~G50_I_Pabn2boRiI;RKVNy)K7qd(=GhtQ?B&^V
zzV7oja1h&P)3vETNq!0R^!ah(&f!wnGw7V`>sZ&~JJ+Z19e)RGz5DFGyBEG*4EGlM
zGsK;{b2F#TI`_n9$Y)sF{r4F>jBP)ko!mFi)=p}jtFak#bUr=)r-5tP4xGzT*fC~k
zjQjj_#Cgvi=I?vWar`&K?!|*>yeI0d{UUHaYFy6=?4O10<-E3mqvURY9p|m&T!-^=
zoxb+IgujH{1M7x;f-eBOfpzMS!>>f|c<#*v$Ucjn@q6LNz&>DXncNBBKDZ}7%a6nR
z!NtJ7KTMtX?C#=U4x9gNVl&q0{G8LXz%@FKdvA=-9>X}EIp?9?v*x*R-R_s~De9fG
z>vA710>^;;9)?$e&*jen*RU6OW|r~o<5)+)Q(zOAGLE>EynEuld4JXY1~L0x0-Vzh
z*m16b)w_?5|9RlPKLFGpM&o{n%fYPF9Ke1O_#Si*JO{qEKLZ~{<Gk(v5dM!~Tl*ET
z*&ML1^}hD*!1f$Hf$v;=@AW=?2Ka2bR_~SjZy)QP2A-o8YF&%#bPvo~cLYCkejPo&
zL2Sm}_`Erfj@Xyr-QY0rjM~p}T+jW$eR3bJ0`_uFK7Y%^kAfcGah;R*()cR2T0bjH
z@#nzT<#*zrz|OoJ$FuWM;2CuPjoIf$cnZw#pvL!(ye_{QTW!Z$JfEH)K0gefkJLLC
z-&-cII?u~7;GF#Y?jG#}?vwBFGy1#%c5TNP>nE{2+lR63H6`yD?uGa5ZRG6bp1ZC`
z!Exg5ou4JVS89y8r}njW0y=Eh=I6*8VEehgjxpQ=_hK`?&!fGEbz*xath0~L!WL}z
z;{?9X(q{Z$D;me|u`ht#Pj$Xlc{W{-xt==bBfE!J0r%tQ;fsLd`r7*#G1qw#*vIG8
zIqbx@x9r?ptLu;*SKIaY8UF?NAaES_$n)dp2YVTR1X$<$sQc=3<5<RD0{4Jpz%yuX
z>o&pOYsYpz?t!r>_F-@k+qGQ?F9Ut|dlk%y`&#u`Y@dZIvE5tua5#tDi@Vjq-gn2a
z&NVr{p9^}}wM@X}!10&KJC18OQtYdVe-q#RbRTEH_cPb~BJ7;4cTSGu^X(jb9Ww5E
zJVR^Pj&}n8Lj2zZu7792A^2Ki_rlKU>);jm*<<UxUw-C&2)-Eh{!WNHH|M<_*w6cO
z40eu7;2P`|*n6=AT%Y^)CD{3Tww{5n2Ty_(;JtbfxL=3Sx^|z#M~eSr)Le{hzfS`9
z-<<pG9IhwtJ+hzo*SS0g+>`Goz6rL!?=LUFJIJ|D#<m03bd;ETy%Tm0uOV;Tee~IK
zZ2Nr{_>8)DKZ?db139Mso+swm%b)|+`>gy7*h0>@=hZRXTi@e4&W$>eD}LB`?C5Gg
zcCD-JI^V|iy=qPDn#VMWh1Iv$q8@d8k=^!Pq&u5<-7ES!R@=$Tx$0Q#)P$o?zUEp}
z$73fi=Sr^S<mEQE$d5LxuD0Wk#gZH2I;P$F>!Y0gJkz7)c3v4L@tSM9?X!`ex>jS{
zs~t1C<l=9{W7@G;dYJQye<og}t>d-5s9!YUMY@gRXItCqT7PT|HC}bD_G?>B{BT`&
zE_PjC+h^0G)$4n0RNLkk`E5Ks>$tX;xo~tXuPeS&+t#&y+vo4&e>c5T6VCYh^;O4X
z%U(4v%B?G3e(&;LE_<c6w(H()Jo>Zc)VF-L^^4|WKd-jWdUX9~or^YH=VG5rF4~sU
zOHQtiCB7(EYu7uBuQ7dU{i0Yj8ME%2STq~;7}KV%&ee9CPp;*Rn|SIHU*xOzYITX#
zb=rNu{^uZh0;a|9VE=3o%?>{YgE?_+xd(n`o)+7@wn*KOBi<96H%K3Q%*c7E*H>%4
z2gaf`?l`j8T<jjZ1F~^7_L*WE*Pd1O5W`nz??G>#tH&QM8O;0jan$tK`m(ubU2jLs
zoKUN;#;fm7@5`w?xhe5k$ysBr2iEna*7*sw=Aw~1FfBZTt;u@K2OXF8&=buP(4PqA
zK7}_E^D^!gp7rateh+eG9P5NV?WL_&#I|k*)EMi)j-s<hZC5btC9e4?F);&k&=t0i
z`CL7=G5hvCfQLT%V5o)7Pt1cU=)k<f5_#ce-rNK%6Vo1YRS#n`&==@yd#Tm$fU)?-
z#0+$xFT5Fj57dbcxJI>J!+QI{9BQl|YK#GO!<ykK@alVitoz26OJDoX@zu{@p+5!5
zX*&nwqPG?v;w5HWy*hJZ1_o`3ImaPJd<tBLu{m}weJ^9?hx#FMI%dwe?7YQNv?Xp%
z*lz-x7ixQa>!(GduERIxHRSs;KHL>pYmM5}3`}fbQt}iG<DgqAFs|)<!p2;q`f!hJ
zTsTe-w5`*hVe5a#Tu7Un0(D+XuzkD!^uAB9)5rK6Oo3YM2{5m1%xl&UbZRb+Z7wyA
zVJvlWavil(;QZwtcuo5CG3KzbDH!J3t*_aB$)%V649Jd=ar!czzT+gnRO}x8&>L23
z-kdOQ&5YO-bl9?*4yZA&ogUgfm;&>|+PifdI#s7;4(zGUtCvA?#_eNV?Vv|uTzdwF
z3)Y?jwO-nDP}jybuU-uEr^cQ=erJ8bGHefh^BuP98FbjaF)z7jrv3U}ysVCN?a$Dt
zSG!~_Y+k!BzOh*qH}7T4T+WpK)L5>5z1JM}l7+q1+230IK?C>1XGJ4-g1GloSTiMN
zjXC`uHaCOk1?G)+aO%u?4NSc2PunwWVa!@<h92s?`qNr_&hdwJ!`A6f3+wl#M~7`6
zp~hNur+h-{HxpNDet<6i92j@3sWz9j&d;2-IqklR8SjbN*PI%Wx(Tt=blB4ZHTvef
zWMRL=V)w+SU{*Bx^VsM+Z0D3Qr})~=No_Fb@Mk3^FG63{oyoR`ns99Wz?hdj26bnO
z*P7T9>SFiU`f}`9lkd2UdCAeui=CM90O=`RV(KPEV_rWw<F%jo7^gP+aJ2C!^c3;?
z#<ouz)9-*RhBIdf+pFeiX8n*FerOw?7idokv|E^)R&o7Wqn}vERp)AZR_f$j+KErF
z`wGVESXbo|SJPMMxb%&8g|$Urq4sOL<BC72^tIg+k1xCbGg$Nmu{$p93GhmO-mlxn
zjm-*cd+ASrzL*xCRk7p}3nxCtW10y)h5F=&80#`tV_vjfxzu?@GwTO;_>tI@SZwoS
zwlQ;}126Lpm6$xKY*H`FwfWT5HM3H0ZS3g6UD4#;CO#uRDY?Yt#AYQoFC4#bYC5hu
z7GM1YB-ddZi(j+xuE`M@w6ghHKW0xFP3zky*Q`I?Gq$<DipMsuEhd$ncz8aN>#LY{
zw6W(+-gt6y?%7OxfwtpP>!n7mFc$8(^v4j5Ij>ryKk2_SR%?=1YkXSt#%IMgu3h`n
z##a+eirsM~*79rvO>)*ufPRpeER4%uwXYov$&!!8*rb1XjecV4v}Yq4b7CxKEEv<&
zF>NndObW-J6~AU}uf+QP<+YRIxA|5Vzh!$Hn}G>Pp71H#Tpic1W3{cW_VY{_uhDVU
zerr$Ad)4t-KZJ(<+M>?4_H;y(noF)+Kl>;7OzRx|{e_Fbgqnl!0q_KP82I}Z{{F>r
zuoHA>w5Qf$`+FM3Ciqi$2K>F1&k&zr_wY6_C-#sr#(M_73ivxJ9hw91*T9#7v7Xop
z`eA->hyMUPg|+Qx|D7e*6T1Q5-^noN@0QGn`Fk7o@b^aM_^#0!`759!rth4V@nvI|
zVXN_XU+g8O#E+wO9sXX&Py^4Z9@hAKC_ORrbGXw6d#_yYp=Ms<j&lh(0D5BnKFM4?
zwNKI8@r*xBeAo+YW9p{x18_%dGq%2C`1?8DGxxzh?vwNFqKAjM!Oq2g`aON*N#V!I
zZ3Z`jV`x4MoA-Az=S6dZSdZ<P>diX`drpaezQktO9k>_2ht02mCE(ih=futJ#_!+>
za1G`zfnAf@wPHKg7qFef9Id(&!1X%c9k4YMa&!Bk@8Cy??If??<2&EG!R8Y4p4iL0
zYq|$41NYxOoKiCf_E-V;lXD*(&zhy;JGT0d!QLnJ`mSjT>pOpIPQuq4r>}jU1)JDs
zb7~*P*LIw)#EpFkI0t)8sde7xlYfx7XLe4k1IMtRBj?)eZ=VTq>%NM87~TZWiHW}O
zVPY2p`x@_H_jZbH%(cv6?Vgx><rwa%^L-9>4Citw@OgT%<jh}!eGoXOj`}ac!`#R%
zQ8yv>M5%Wy&;3e?yGQms0L;06-Xr^3+kwNyf1%`^=MikjGxu%S`OO)}Irs3r@C06i
zy>DB<9N&KS?y((D+p&A%(-K?8?y$4R@>9UG&=a4Pv2re__>SwIn*Rj0YxpL-1SZ6s
z^91Y!9sbkC8T%nvzX$q`?f$qg?%BM=H{*9;GckJ_x7Vk!t#geTLqGfP+%k^$)Vc{!
z>l~NN3u1@iIq)^Yy_tiZ=I9rgn*pEC;T)EHNBve}jyuP;w`{%h)^<$K?>+eHdSYv^
zz3nw4m%bBn?&%!1#}wO`^Xg!0JdegY^26GR9VK=EOo@Actes%1UBbQqc5F3|g9$l%
zsdHV%J&!Me8-ckV&3<ghnPNYW{WP%Wywp3dN5HVJ@NhP$HE-?^aNLJMN6fj*Vb}O9
zSOSjKmDm^X-4pxu#HQc?zGF_UbuPehmSJ=D---QE*k0zW7mm}x6Z(A0JS-f;vFzo&
zk&okV2lluIJ_#NJy*0$F>9D_1*uLg=!p_0`9RD!Bdf^=83(0v_Uj)u+1}3G}*ptBc
zoY+tUPeBKK4rlb)1e<d&4uB2}<Dl_5-3QX+VeA=k^&Kpxz%^Urz46Q$v&LAD{SbBs
zf4y+_+5EG_T<4s;{5Y7D^-YP*fa@IQ1Ro}M1a^N;fK}`j_<7j9*a@c8n9n?AUr&rL
z!&#5^6u9<>!G!!0?EP9I-s2yIy_e4Iv*0pdo%7j`J;Oc@K7_5t_>{U&VQ(hxJe<GW
z6VJZPvBd<eT)AF%32Z-~_bG8}>^U#C>+7)X>pX_N!tcSoC5PhFHS8Jlk-ciJbBRY2
z|7^JK)7IDiM!Jln-mCT3qwDeH(yz{?w$`+<F~9DS+Qe#Jlsi-0e9(^H){f~C3ujKX
zuU+?!Uu$bSwY4sGa&^Dhb&uF>E`Hso)z-P#a<0VmV}}!qHr(cGzt*HadAYrki?-%y
z7irQj`EYu~kFM=^rhN427f!!$bn(--jgR?l%|^A!r6%(UC!cW=ula0p=@U+U#!Y;T
z>;83JYLkzj9%I_%!qJ4&BYw+u-dwK4>%Pg!ZM^kcP4eON($7`fi(++st4}WR`1Y#p
z_~ANN+igBMIk|Ofy?<iKCzp6Q`SeSC5l7e7#jopPw>4w&T9aPkS|3~X8mn8+oZg4l
zAFIh&buRr=-*T&sKE2}CT=!_}<42P|HMe$KGgh0Lb{?rszV;KVYuZ@+nze(C=F-+g
z+m6@jYQNQ{M>MT(Jh?Hh^Xc8{HtN~t#(Jb*axEuc&!x6AM(XNZTbo?sbxrKLU+n1X
z`ovO~+?ZeYsdaTcb~w6r9P`Q5+S*P{&9Pfu+bemwtx0YyKGvs>jn!m~<f3o69=Fc5
z@%S~@dhPmJlv|{0Yg@mqtMl3!FLs@;YinJti5+dYj;FTf<mJT2?C8=rT*niu`y^lY
ztn;x`TkGqZ#A|(9Q~R}D*T$}Ur)QmuJ(e4bjp^$-q-Iep`Sg+7*qQv)gfmvnvDM@n
zi<vV%maF5j7wOtDYJF=*TaVrLji1`ow0_;Awo_ln>K=(F7C&|MIEh7D=jzzG*cmUJ
zo*65CdbS*09p5OHT-&$yYkQ-4)ni7NSUXPLv$mrT*L8KQtr^p)Pds*A7rW++5w7dg
zqvqJPE_OJ*!p4&eC)f6?^L1UVPfcR2uI_OrZQY~Q*15W8#%p7>K6ZLWJI0CCHK|(^
z+o&eFmg}Ay<uc}&Cb@Iv^s4LHn0||%zqefh{(Y^vdEp5>1%n?h^*z2A`cvEX9Q@?r
z4mi%F#LVjt@e-TEqA&ibo+Z}Co%1P8J+IO?xoPRwHNE}&>S$)<cEm4w^TV7=Z?*b|
zKnLbUmmc9Ia<OY3(=Xer>C@v+ODtUX(XKTe+5u%=j_>?ZqdmmXtyFQ>x1;i-AGl~|
zu(~=|_j26i=9O)1$ThXC#@_W@)2rqweTQE3oMG3wm(7iw|2?J7*Y=_whpDme6sYNo
zUDrO~Jc%1KFSMtH6R%l2e%2_bPx7*{K_dfWM~k)|<7GAEC!&lqr>?h_*uW)jeyP%R
zrGM(=Hl{E7!nHPbH1Xv*`iZE<jy>qhJhT(<vFD(T8J_`pxthOqYFDb58e{3H-B<Y`
zUh=gz8vD<y`t+HO#LQV|pXkCfVqJ-w*RSUp-K1!SUd6V~`y`8bVeM0OjICyDzR~R@
zKdEXu{8{A>z3_W=^c&`fKdm(OvM%E~o_1ep;wNTJ>{+9mP;Wl{<l3KCxjJ6kb<Ga?
z&8V3cHfJnrs>g}lS7RjB_BS3~c*(w1t$ykpFMBieMCUvvaBAjNZN{x*UCGa2<MOF>
zU~je`^J<|kV<y**ZM+-N4e_e4nkk5OUhE#80R4;+?wVNqSrs4VUip2oyK3z8lZV=9
zs=8<#V_IrY?Gbf{jg9E7vnKV<r>^U(T&s02>)Q12{a?H8liZYk*2Q0PJYuzWxyq+k
z^mDWsGi<&2DO}g4X5gZ!W2q0<Iqf6NBe}lR&#FH2VxQ7f^PW^Xdqva6+cD3^ulFkY
z&Gp!2yo|}{r}kUBz0#v)b;jDOjm587yM=ug#geO8J6C$^*FEa|SiH^G^>uE{PORp!
zd6=)`wLW&8&pgkRt97w!UR0lWI5qLdI2ze2{<?R1eaT0Y{6>Cyr8a(DoBCE$YZv9t
zq-n>ApMLGw$&bY|R$Uu?a$|MpqDg&fT0i52ldt{MCKhhvZEdTIzSgy|_^Ayi-qzN6
z?K+=a8*er7hhlz=>1$1$PrQxQadXKV_ew0>#^TFf(WOuP_3Zl3#5U5@dhOK2uJd)S
z&ZmF0t?pd3ZLjnUCm&8cyiwd-^07Da?V0>o|2j8j8w=8_*444-!pWU!+%{j=x4HN=
z*LrQQnq#kL7O$t)#cugbJ?i|JZOon<^^GR=;nc(*)3v?hr*_P*`_(<`n#9{!;%CDf
zY0P=0|GKySvyQcP{BU~Ie%)&<ueR3Le&UIZ@mT*lmL3_q=Gf_5`?2LSU8(>7g`MyJ
zt-1Vt##&TsJbyl@*Re=nYyLj=M*TO+jg5J(`q3W!cizXd?K`F$i)GwJxx{6!=vyAU
zU#;f5_0OjFMsph*H~R3H=3HanO|L~YWAU+AJ;yWUYwiCl`@5Orxz?OXpL}@HUYsev
zsBTgGY|inb+C}mI-E&+t&PMTe4*w_CS&!e2mt4jj%a6rU7mg-<5^p)WHeUN@qp9_=
zt@Db1uAJKNxz>;A?d3I=Yh!1tYwK$NY<p~^+bCC$bvE`!z1m#-@J9VN(rlEo=egw1
zq)or>%JqNW`+3;dVgHU9TlVjD{kLiQp9B~2Z;ZFX>ijqU@>k*G=!{<qj)CjJ?!q&2
z>il<Y_kah1y}t}!1B^cbo(1aq{`7w@@Vg$2U0ib3FM*xZ`|sAw<=@)+Z~2^q|F&&2
z_*>xjaHw%k6KWjma_}gyr{58gb)Up{o%?~`7h#>SW*gA<dpEv@{Xp^U5uNMwyR__g
z0Hm*Lv#)cz07P$}?UkRn-z{dX-|ch^-a+5%Ve6wYZ%qFvdD-u7G44F<?U<9IalNPh
zJ3F>oYrGfg?B!Y<&$-)U3vB!uVEvbY|NjJAVaM@a4C5BowuklKDy-J;+i?$ve6iIz
zhTkXS_q~~WlGvf5v%mAY5ooLb2C&9|=kEPA_WDxmf9v-=J_XzBer(6i+T4%ybR6e%
zJ=hKQ0q5*`jBA?{es9zYJ++<7^T7VA@K&JK`M6gnfjvCi&P{t*Gwi<$b$r)rOq@XH
z+WoFFbBP}(Zg1x<_Ltm|!r9A@;a^x{+Y8&{VeChN`!V!{A1Jo-GA4J7V}HMMWeP{{
z9vgT5CxGi;2KxE_AM@&-gf9b*r|;Q4ik<y+9OtdxeNyi@{{JjY`_ud2{(2^}|9-cV
z^K?vO&d=WJj4#7q09nuZ<UGsf)V~V<80@&#I5%_pes9qZc$gdUtao#<ouBJ`4)(vd
zWlZa>(LWB{^NiWC7Wcrh)i_S(B0I)Ope@{o=Yi+z3E1z;ajo{weQ-_DyZ@gf_p>GD
zSech&`tSc8)7l;w^Sg?y_nAG3uiklF0B<e+VSM%K9)n*jcJ|A-&(sNUfSl*moNF}a
zxpiL0fMdEJj`K}01FlJ(xzy<UT}*k6@?1D~`};jbYCSU_1sTIy$G8zZ57ayjj9&_@
zzY6#sqR#(+k0s{m{QO?GhhTe|v$kHJ;~R56?yK`K_F35d6y7iA=70NL!M2zC_=#fM
z*Vtw7GobHZ{@;DwlY78n;8<!WXk^FoemHJ&>YoRL4$b4(C%{*MYZlJcd40XGw)K85
zo&8=3Td&sp>^ij7JO!L<)|PSX;n=yy?up++W!=sazXWz}*MQFebL!n|Yv-)%QP}T~
z@;tciCEyreg13RYi@z6kOy}#o)YwCI&d0Fr^<0_1<2v4}={+Tuc{{%2%t2!E4fxiZ
zcP?GYJH{7_opEL7GU!=@{T<JB*~{;olf7SAPsXy=@2&G5E>rh$_)}nxZEh2AJ?cG2
zvhjn!dAp{IfMaJ2=j*)nv$wuRz7E*Snq}ZT?}hE_y3+3@Y}ae;=YYM;T?nkze+0J1
z@5S;v3^U$QVp~D>>)Y7&5yqX<7WDQzS^OvHr+y7=1Fp~B+I~l@zWrY){4i`E;e7la
zG;P=IxwJODt;^gpC$-wv`X1`%jP#Qa5tn^k@Y#7DY!*fLQEWeBso7rG@uvOy_abIK
z>G=h6+V1%uqMtG+zaR4;{v6wJj)5D1=O}f)4*Q*b`diEwd(Jq~$llX!C1)?cH%(0k
zPLT7p&GYedMPtpB+-Het=Ui!DW(;^%^?iQi4)!yu`=B5AU57UT_x^$=*D;2=d0}nm
z_Ze^;=x?U)my5kz#!7GJEjsFNWQ?64xv=L@+wYT;{a!cc_)gf*@NWcf2hm?u>`$Xv
z#=aZ&&n6vX_}&wpY)tkw!|xZ>-cdMt&p}U}-y7(;GcN9^bkV+-m_0pvv(np~?+x;5
zsdq2+f4%hZGl{Q%#@vskW}l|?I$p)@FWOHQ&V9B%xQLwdh#&TO*ay5X`%2#L&oqA<
z+%cAO&|ic7{Jf+1#(ljoH$!)6vGx5d;p?XNC->vGu!r+TUQI`htmX<Z*DtXN<9K!+
z178E`g1u<{JmmK#K3;Ms(0A14IgR$~<o6ei?{^1_oxFUM+S>R1=w5V(!R^?|JyX^C
z`ueHj58u5=pEna*p;zvwb$waSk!p--ZIk~teU?hEDgHIs?ydFTDx5jUzCOyc(%aX-
ztl@a+y?vz4d-pW)oR6@*9;o!o=zI;Ud-ROeff?|AXxDRceJ6;g=U1^^L(iUESM8xS
ze*bi8H=)s<)6dVg$IATO6VF*+#@UIT8rKo~+S02pv3D2$O|WzNHQ2u!uupnf`xasy
zG2ffgOLneVL*o8<!q=AH0Xxe$z6W;L_7G}cyz=F}QtxNKUFhS7f1B8Qz~{-|SZe+A
z&O^oav#<L&Fa3S*9cuN#3c2%3%ya(@{ELBmr>;lyV6pXmE!<J&-;w*(^dpx!<+ID*
zQui+Mz8_B<4~^^fv!dri-#=qKRcv3=wEZ*U3)I?Q%@f%2lkgsUVb4oUJG$Q`_fz0z
z@cN?jJoz3Jjrt3T%^2shqVex#R*L=B!tRCpHe(KTZIAE$+d|G>!N)3J{ch@I&#!-u
zi7tFO@frE}vaeP0m6iVYv0rwU82islt$*&AG(9dM_Xos}lwS7B>zVg)j{a8ieW~l3
z`g%UvN2yVhIb2PxXW$;<wN}k95>M<Q@>8&_=zfo!n&<F+&2>)pIau=6IX}N^Ronba
z_)iu6oZ9rhp4?{QzYH&_uX27azQq_la>hJ!K9ip9kAtq#tMkts2da2q@=p^pU%&2V
z4o}eYb<}x(jh)1oXQlRG;vX)#t+0Q_KURGIj5uhEpPsVouJ>eEAMw47*7c|EFk@R6
z|8151)7ZPw=CgojUtK@UcWwXL&8z3Ve{Jt;_I}^h-3Qk8{=f&<-nF*n>hrGs<oS3P
zt)74D*0uW%tp{JR_u}iWyY3x-$2I)8x}*BOE1!4`J6c`mVF!k_bB)Cli!Pj+aN@N-
zF*U6{rfW5IU7gFA$tRbd$uEkhF1q;9pKbh1b;-B=60hgm+I3Cz;b?1)KCx)Sbv)Xd
zlb7o&e)7q+zVXEBn&|6%8(ZY3Px9&C`mHW`xy`jbYE86bn&_g@_ZssP3$H8Je;TWi
z+*r(99oKHJI-gv0buO_*nl|40>6N%V7E3<4cHFwI&BYI=S88gmbBWh=vE@Zr)FZXY
z#}C)>je18PPF*<h^hmtsjWnr?#=6AX9?>Kpj;{3+52wEN6AQPwIv!gN)F<BBtsh;@
zwXNQ(j>TTjoc?LN&eeY6W4tJq_~{YWt;EC0>$fnMSafwPx$s!7u4`@Ua-~ONVe`qw
zPEE_HskyE_Q+@KOKb!v1WSq09-AI!@nM=5?i5+e=iOV(AdA0dPu|a$WeKg^^cVaEq
zn%J#I-&~#F$WE@UY5Ugs+OBo!Wh{2BjV%wKhhKZkuKj!7d&ix(-m+`oO}E~@8{U7%
zoom0pw&l=$_Z>QP2>8GI_N@NEuKoM(y!Gb0_V2mr_FcE_d7qlq^RIa08b5yCAHU#_
zt^RnEKYr04Z}!LK{`jS})mL2c#^vEBHP{sKlPZ30UE8zznp<|=vUkr-E4%jZUAu1e
S6*t|wd+plSPVc(;4gU)*%LZrw

diff --git a/test/data/datasets/monks-problems-1.pkl b/test/data/datasets/monks-problems-1.pkl
deleted file mode 100644
index 58061c482d71f3bd84e61d8c42cd1b7eb45666e7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16009
zcmeHOO>fgc5Ow+isDR3WN*p<GC=!hX6ix`N!ilIka7U4o7!uLMk>g0@04fKdmG;)<
zfA9}sXXEE=oORgBHQ01VTEE>lZ)V5qaoUTIH-3D6?nq<z27ctVyh#)cj_O@c%txU=
z@<K1_2SZ7W{fM)xuY`+cXZ-#b{)z9hjiER2IlDCoh9AazqcCV6`GfJ^L(Xa+Iolad
z2BT@c6NG+!Fgc3)UKn~)&Th0qzY`3{Q8?*PGPWs&VuN9KCOP8`=PQ3%YxD2?4c~3l
z4j-&Fu6*OY>uj?dO-DXOt-aglrzbPPI;%b7&1UoUFJbc1a3bD5Ik>w&yT|HXk6$=W
z6^7%Avn|`G-s83vm*@nl@ZQ-P%lrFSy5GaH{XW}zR{za3_$|DM_y0os)S%{7l05fw
zuKQS3gyJ);=UjWcVgub!_O0U7^sLIN(>@d3b1ExW#imheE^A+y)@A!gd6@rTYWkeY
zs{7yiM|aMqZJ~<=XOaWjKVjZndE&ETbv5g>COVylI1|f?1#glgj|Te`g?VZ-i7l#b
z<Ta@wj|Te`g?VZ-i7l#b<Ta@wj|Te`g?VZ-i7l#b<Ta@wj|Te`g?VZ-i7l#b<Ta@w
zj|Te`g-LC$n9+z0`>9;ox1e!nUT}m9|I5X}@Gtl%mWD9+Rb0}}q*`sVZ>&*sCV5i6
zgm;AlL($Mw&5PO+EyV}}uWD6Z>ZNk7)pD^$f-9~Vl9*kvSK`xXq!K((lGzvJ&BCSq
zkQbPpT6(hbg17nSy@4}P(VmYzpH;7z+AF%b?H}7emF^!`R4)egHV|`{e6cNYu5gG!
z8u9_bscYg?4f)8_oQYn;LrWSNE6|}4TRKmY%bn}N0TGjxN2ZSB5kr_5<^m;;wGbq^
z#T-VQ<OPw;-8qNcKp7|eWTXe5%-uyU;VB^YsWITlO{UfWAN1C5&&T#h{BFA9Z4iUr
z4f}qGvxv?1hwTsQ0eUz7_WL3C72Bp!Q1icQv=9aj=2#PA=zPSfrfP}DT1D{bIF*MU
z8d|6QSi^^P8j%aSG?HF5)vwM;qg$&S%`u?Yt=0W7hYqO5V-AD{2;QRp8QF8O=R)s_
z?E~8fwhzko9&quc5UWjjZ>x8czB}q|&%ZhntJ`d?uY76LyR-bA(dog@YmQSBH)b+s
z_dmOmKCflv(D6Es_9s5ex2itG<w5MP?t^^buqK45^E3w^;>ZsUbNDVh;_T9hb4l}9
zTmNED;aq^1<$Q3TRYg3nG~#~JJl584`yV|}MU?;d>~y@ye;<VXj(61R54!@RAmnfP
z?uj&?ZF^A^_S=)lZ_(%8WAd?0BE@&!5+B9*IL7@LpTzi-`)sS@b$Whl<V8K+Wb3Vd
LmmeP1Chhuf;nPiR

diff --git a/test/data/pymfe/monks-problems-2.pkl b/test/data/metafeatures/pymfe/334.pkl
similarity index 100%
rename from test/data/pymfe/monks-problems-2.pkl
rename to test/data/metafeatures/pymfe/334.pkl
diff --git a/test/data/pymfe/australian.pkl b/test/data/metafeatures/pymfe/40981.pkl
similarity index 100%
rename from test/data/pymfe/australian.pkl
rename to test/data/metafeatures/pymfe/40981.pkl
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset.arff b/test/data/openml_cache/org/openml/www/datasets/333/dataset.arff
new file mode 100644
index 00000000..44600f16
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/333/dataset.arff
@@ -0,0 +1,651 @@
+% 
+% 1. Title: The Monk's Problems
+% 
+% 2. Sources: 
+%     (a) Donor: Sebastian Thrun
+% 	       School of Computer Science
+% 	       Carnegie Mellon University
+% 	       Pittsburgh, PA 15213, USA
+% 
+% 	       E-mail: thrun@cs.cmu.edu
+% 
+%     (b) Date: October 1992
+% 
+% 3. Past Usage: 
+% 
+%    - See File: thrun.comparison.ps.Z
+% 
+%    - Wnek, J., "Hypothesis-driven Constructive Induction," PhD dissertation, 
+%      School of Information Technology and Engineering, Reports of Machine 
+%      Learning and Inference Laboratory, MLI 93-2, Center for Artificial 
+%      Intelligence, George Mason University, March 1993.
+% 
+%    - Wnek, J. and Michalski, R.S., "Comparing Symbolic and 
+%      Subsymbolic Learning: Three Studies," in Machine Learning: A 
+%      Multistrategy Approach, Vol. 4., R.S. Michalski and G. Tecuci (Eds.), 
+%      Morgan Kaufmann, San Mateo, CA, 1993.
+% 
+% 4. Relevant Information:
+% 
+%    The MONK's problem were the basis of a first international comparison
+%    of learning algorithms. The result of this comparison is summarized in
+%    "The MONK's Problems - A Performance Comparison of Different Learning
+%    algorithms" by S.B. Thrun, J. Bala, E. Bloedorn, I.  Bratko, B.
+%    Cestnik, J. Cheng, K. De Jong, S.  Dzeroski, S.E. Fahlman, D. Fisher,
+%    R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J.  Kreuziger, R.S.
+%    Michalski, T. Mitchell, P.  Pachowicz, Y. Reich H.  Vafaie, W. Van de
+%    Welde, W. Wenzel, J. Wnek, and J. Zhang has been published as
+%    Technical Report CS-CMU-91-197, Carnegie Mellon University in Dec.
+%    1991.
+% 
+%    One significant characteristic of this comparison is that it was
+%    performed by a collection of researchers, each of whom was an advocate
+%    of the technique they tested (often they were the creators of the
+%    various methods). In this sense, the results are less biased than in
+%    comparisons performed by a single person advocating a specific
+%    learning method, and more accurately reflect the generalization
+%    behavior of the learning techniques as applied by knowledgeable users.
+% 
+%    There are three MONK's problems.  The domains for all MONK's problems
+%    are the same (described below).  One of the MONK's problems has noise
+%    added. For each problem, the domain has been partitioned into a train
+%    and test set.
+% 
+% 5. Number of Instances: 432
+% 
+% 6. Number of Attributes: 8 (including class attribute)
+% 
+% 7. Attribute information:
+%     1. class: 0, 1 
+%     2. a1:    1, 2, 3
+%     3. a2:    1, 2, 3
+%     4. a3:    1, 2
+%     5. a4:    1, 2, 3
+%     6. a5:    1, 2, 3, 4
+%     7. a6:    1, 2
+%     8. Id:    (A unique symbol for each instance)
+% 
+% 8. Missing Attribute Values: None
+% 
+% 9. Target Concepts associated to the MONK's problem:
+% 
+%    MONK-1: (a1 = a2) or (a5 = 1)
+% 
+%    MONK-2: EXACTLY TWO of {a1 = 1, a2 = 1, a3 = 1, a4 = 1, a5 = 1, a6 = 1}
+% 
+%    MONK-3: (a5 = 3 and a4 = 1) or (a5 /= 4 and a2 /= 3)
+%            (5% class noise added to the training set)
+% 
+%
+% Information about the dataset
+% CLASSTYPE: nominal
+% CLASSINDEX: first
+%
+
+@relation monks-problems-1
+
+@attribute 'class' {0,1}
+@attribute 'attr1' {1,2,3}
+@attribute 'attr2' {1,2,3}
+@attribute 'attr3' {1,2}
+@attribute 'attr4' {1,2,3}
+@attribute 'attr5' {1,2,3,4}
+@attribute 'attr6' {1,2}
+
+@data
+1,1,1,1,1,3,1
+1,1,1,1,1,3,2
+1,1,1,1,3,2,1
+1,1,1,1,3,3,2
+1,1,1,2,1,2,1
+1,1,1,2,1,2,2
+1,1,1,2,2,3,1
+1,1,1,2,2,4,1
+1,1,1,2,3,1,2
+1,1,2,1,1,1,2
+0,1,2,1,1,2,1
+0,1,2,1,1,3,1
+0,1,2,1,1,4,2
+1,1,2,1,2,1,1
+0,1,2,1,2,3,1
+0,1,2,1,2,3,2
+0,1,2,1,2,4,2
+0,1,2,1,3,2,1
+0,1,2,1,3,4,2
+0,1,2,2,1,2,2
+0,1,2,2,2,3,2
+0,1,2,2,2,4,1
+0,1,2,2,2,4,2
+0,1,2,2,3,2,2
+0,1,2,2,3,3,1
+0,1,2,2,3,3,2
+0,1,3,1,1,2,1
+0,1,3,1,1,4,1
+0,1,3,1,2,2,1
+0,1,3,1,2,4,1
+1,1,3,1,3,1,2
+0,1,3,1,3,2,2
+0,1,3,1,3,3,1
+0,1,3,1,3,4,1
+0,1,3,1,3,4,2
+0,1,3,2,1,2,2
+1,1,3,2,2,1,2
+0,1,3,2,2,2,2
+0,1,3,2,2,3,2
+0,1,3,2,2,4,1
+0,1,3,2,2,4,2
+1,1,3,2,3,1,1
+0,1,3,2,3,2,1
+0,1,3,2,3,4,1
+0,1,3,2,3,4,2
+0,2,1,1,1,3,1
+0,2,1,1,1,3,2
+1,2,1,1,2,1,1
+1,2,1,1,2,1,2
+0,2,1,1,2,2,2
+0,2,1,1,2,3,1
+0,2,1,1,2,4,1
+0,2,1,1,2,4,2
+0,2,1,1,3,4,1
+0,2,1,2,1,2,2
+0,2,1,2,1,3,1
+0,2,1,2,1,4,2
+0,2,1,2,2,3,1
+0,2,1,2,2,4,2
+0,2,1,2,3,2,2
+0,2,1,2,3,4,1
+1,2,2,1,1,2,1
+1,2,2,1,1,2,2
+1,2,2,1,1,3,1
+1,2,2,1,2,3,2
+1,2,2,1,3,1,1
+1,2,2,1,3,1,2
+1,2,2,1,3,2,2
+1,2,2,1,3,3,2
+1,2,2,1,3,4,2
+1,2,2,2,1,1,1
+1,2,2,2,1,3,2
+1,2,2,2,1,4,1
+1,2,2,2,1,4,2
+1,2,2,2,2,2,1
+1,2,2,2,3,4,1
+1,2,3,1,1,1,1
+1,2,3,1,2,1,1
+0,2,3,1,2,3,1
+1,2,3,1,3,1,2
+0,2,3,1,3,3,1
+0,2,3,1,3,4,2
+0,2,3,2,1,3,2
+1,2,3,2,2,1,1
+1,2,3,2,2,1,2
+0,2,3,2,2,2,1
+0,2,3,2,3,3,2
+1,3,1,1,1,1,1
+1,3,1,1,1,1,2
+1,3,1,1,2,1,1
+0,3,1,1,2,2,2
+0,3,1,1,3,2,2
+1,3,1,2,1,1,1
+0,3,1,2,1,2,2
+0,3,1,2,2,2,2
+0,3,1,2,2,3,2
+0,3,1,2,3,2,2
+1,3,2,1,1,1,1
+0,3,2,1,1,4,2
+1,3,2,1,2,1,2
+0,3,2,1,2,4,2
+1,3,2,2,1,1,1
+1,3,2,2,1,1,2
+0,3,2,2,1,3,2
+1,3,2,2,3,1,1
+0,3,2,2,3,2,1
+0,3,2,2,3,4,1
+1,3,3,1,1,1,1
+1,3,3,1,1,2,1
+1,3,3,1,1,4,2
+1,3,3,1,2,3,2
+1,3,3,1,2,4,2
+1,3,3,1,3,1,2
+1,3,3,1,3,2,1
+1,3,3,1,3,2,2
+1,3,3,1,3,4,2
+1,3,3,2,1,1,1
+1,3,3,2,1,3,2
+1,3,3,2,1,4,1
+1,3,3,2,1,4,2
+1,3,3,2,3,1,2
+1,3,3,2,3,2,2
+1,3,3,2,3,3,2
+1,3,3,2,3,4,2
+1,1,1,1,1,1,1
+1,1,1,1,1,1,2
+1,1,1,1,1,2,1
+1,1,1,1,1,2,2
+1,1,1,1,1,3,1
+1,1,1,1,1,3,2
+1,1,1,1,1,4,1
+1,1,1,1,1,4,2
+1,1,1,1,2,1,1
+1,1,1,1,2,1,2
+1,1,1,1,2,2,1
+1,1,1,1,2,2,2
+1,1,1,1,2,3,1
+1,1,1,1,2,3,2
+1,1,1,1,2,4,1
+1,1,1,1,2,4,2
+1,1,1,1,3,1,1
+1,1,1,1,3,1,2
+1,1,1,1,3,2,1
+1,1,1,1,3,2,2
+1,1,1,1,3,3,1
+1,1,1,1,3,3,2
+1,1,1,1,3,4,1
+1,1,1,1,3,4,2
+1,1,1,2,1,1,1
+1,1,1,2,1,1,2
+1,1,1,2,1,2,1
+1,1,1,2,1,2,2
+1,1,1,2,1,3,1
+1,1,1,2,1,3,2
+1,1,1,2,1,4,1
+1,1,1,2,1,4,2
+1,1,1,2,2,1,1
+1,1,1,2,2,1,2
+1,1,1,2,2,2,1
+1,1,1,2,2,2,2
+1,1,1,2,2,3,1
+1,1,1,2,2,3,2
+1,1,1,2,2,4,1
+1,1,1,2,2,4,2
+1,1,1,2,3,1,1
+1,1,1,2,3,1,2
+1,1,1,2,3,2,1
+1,1,1,2,3,2,2
+1,1,1,2,3,3,1
+1,1,1,2,3,3,2
+1,1,1,2,3,4,1
+1,1,1,2,3,4,2
+1,1,2,1,1,1,1
+1,1,2,1,1,1,2
+0,1,2,1,1,2,1
+0,1,2,1,1,2,2
+0,1,2,1,1,3,1
+0,1,2,1,1,3,2
+0,1,2,1,1,4,1
+0,1,2,1,1,4,2
+1,1,2,1,2,1,1
+1,1,2,1,2,1,2
+0,1,2,1,2,2,1
+0,1,2,1,2,2,2
+0,1,2,1,2,3,1
+0,1,2,1,2,3,2
+0,1,2,1,2,4,1
+0,1,2,1,2,4,2
+1,1,2,1,3,1,1
+1,1,2,1,3,1,2
+0,1,2,1,3,2,1
+0,1,2,1,3,2,2
+0,1,2,1,3,3,1
+0,1,2,1,3,3,2
+0,1,2,1,3,4,1
+0,1,2,1,3,4,2
+1,1,2,2,1,1,1
+1,1,2,2,1,1,2
+0,1,2,2,1,2,1
+0,1,2,2,1,2,2
+0,1,2,2,1,3,1
+0,1,2,2,1,3,2
+0,1,2,2,1,4,1
+0,1,2,2,1,4,2
+1,1,2,2,2,1,1
+1,1,2,2,2,1,2
+0,1,2,2,2,2,1
+0,1,2,2,2,2,2
+0,1,2,2,2,3,1
+0,1,2,2,2,3,2
+0,1,2,2,2,4,1
+0,1,2,2,2,4,2
+1,1,2,2,3,1,1
+1,1,2,2,3,1,2
+0,1,2,2,3,2,1
+0,1,2,2,3,2,2
+0,1,2,2,3,3,1
+0,1,2,2,3,3,2
+0,1,2,2,3,4,1
+0,1,2,2,3,4,2
+1,1,3,1,1,1,1
+1,1,3,1,1,1,2
+0,1,3,1,1,2,1
+0,1,3,1,1,2,2
+0,1,3,1,1,3,1
+0,1,3,1,1,3,2
+0,1,3,1,1,4,1
+0,1,3,1,1,4,2
+1,1,3,1,2,1,1
+1,1,3,1,2,1,2
+0,1,3,1,2,2,1
+0,1,3,1,2,2,2
+0,1,3,1,2,3,1
+0,1,3,1,2,3,2
+0,1,3,1,2,4,1
+0,1,3,1,2,4,2
+1,1,3,1,3,1,1
+1,1,3,1,3,1,2
+0,1,3,1,3,2,1
+0,1,3,1,3,2,2
+0,1,3,1,3,3,1
+0,1,3,1,3,3,2
+0,1,3,1,3,4,1
+0,1,3,1,3,4,2
+1,1,3,2,1,1,1
+1,1,3,2,1,1,2
+0,1,3,2,1,2,1
+0,1,3,2,1,2,2
+0,1,3,2,1,3,1
+0,1,3,2,1,3,2
+0,1,3,2,1,4,1
+0,1,3,2,1,4,2
+1,1,3,2,2,1,1
+1,1,3,2,2,1,2
+0,1,3,2,2,2,1
+0,1,3,2,2,2,2
+0,1,3,2,2,3,1
+0,1,3,2,2,3,2
+0,1,3,2,2,4,1
+0,1,3,2,2,4,2
+1,1,3,2,3,1,1
+1,1,3,2,3,1,2
+0,1,3,2,3,2,1
+0,1,3,2,3,2,2
+0,1,3,2,3,3,1
+0,1,3,2,3,3,2
+0,1,3,2,3,4,1
+0,1,3,2,3,4,2
+1,2,1,1,1,1,1
+1,2,1,1,1,1,2
+0,2,1,1,1,2,1
+0,2,1,1,1,2,2
+0,2,1,1,1,3,1
+0,2,1,1,1,3,2
+0,2,1,1,1,4,1
+0,2,1,1,1,4,2
+1,2,1,1,2,1,1
+1,2,1,1,2,1,2
+0,2,1,1,2,2,1
+0,2,1,1,2,2,2
+0,2,1,1,2,3,1
+0,2,1,1,2,3,2
+0,2,1,1,2,4,1
+0,2,1,1,2,4,2
+1,2,1,1,3,1,1
+1,2,1,1,3,1,2
+0,2,1,1,3,2,1
+0,2,1,1,3,2,2
+0,2,1,1,3,3,1
+0,2,1,1,3,3,2
+0,2,1,1,3,4,1
+0,2,1,1,3,4,2
+1,2,1,2,1,1,1
+1,2,1,2,1,1,2
+0,2,1,2,1,2,1
+0,2,1,2,1,2,2
+0,2,1,2,1,3,1
+0,2,1,2,1,3,2
+0,2,1,2,1,4,1
+0,2,1,2,1,4,2
+1,2,1,2,2,1,1
+1,2,1,2,2,1,2
+0,2,1,2,2,2,1
+0,2,1,2,2,2,2
+0,2,1,2,2,3,1
+0,2,1,2,2,3,2
+0,2,1,2,2,4,1
+0,2,1,2,2,4,2
+1,2,1,2,3,1,1
+1,2,1,2,3,1,2
+0,2,1,2,3,2,1
+0,2,1,2,3,2,2
+0,2,1,2,3,3,1
+0,2,1,2,3,3,2
+0,2,1,2,3,4,1
+0,2,1,2,3,4,2
+1,2,2,1,1,1,1
+1,2,2,1,1,1,2
+1,2,2,1,1,2,1
+1,2,2,1,1,2,2
+1,2,2,1,1,3,1
+1,2,2,1,1,3,2
+1,2,2,1,1,4,1
+1,2,2,1,1,4,2
+1,2,2,1,2,1,1
+1,2,2,1,2,1,2
+1,2,2,1,2,2,1
+1,2,2,1,2,2,2
+1,2,2,1,2,3,1
+1,2,2,1,2,3,2
+1,2,2,1,2,4,1
+1,2,2,1,2,4,2
+1,2,2,1,3,1,1
+1,2,2,1,3,1,2
+1,2,2,1,3,2,1
+1,2,2,1,3,2,2
+1,2,2,1,3,3,1
+1,2,2,1,3,3,2
+1,2,2,1,3,4,1
+1,2,2,1,3,4,2
+1,2,2,2,1,1,1
+1,2,2,2,1,1,2
+1,2,2,2,1,2,1
+1,2,2,2,1,2,2
+1,2,2,2,1,3,1
+1,2,2,2,1,3,2
+1,2,2,2,1,4,1
+1,2,2,2,1,4,2
+1,2,2,2,2,1,1
+1,2,2,2,2,1,2
+1,2,2,2,2,2,1
+1,2,2,2,2,2,2
+1,2,2,2,2,3,1
+1,2,2,2,2,3,2
+1,2,2,2,2,4,1
+1,2,2,2,2,4,2
+1,2,2,2,3,1,1
+1,2,2,2,3,1,2
+1,2,2,2,3,2,1
+1,2,2,2,3,2,2
+1,2,2,2,3,3,1
+1,2,2,2,3,3,2
+1,2,2,2,3,4,1
+1,2,2,2,3,4,2
+1,2,3,1,1,1,1
+1,2,3,1,1,1,2
+0,2,3,1,1,2,1
+0,2,3,1,1,2,2
+0,2,3,1,1,3,1
+0,2,3,1,1,3,2
+0,2,3,1,1,4,1
+0,2,3,1,1,4,2
+1,2,3,1,2,1,1
+1,2,3,1,2,1,2
+0,2,3,1,2,2,1
+0,2,3,1,2,2,2
+0,2,3,1,2,3,1
+0,2,3,1,2,3,2
+0,2,3,1,2,4,1
+0,2,3,1,2,4,2
+1,2,3,1,3,1,1
+1,2,3,1,3,1,2
+0,2,3,1,3,2,1
+0,2,3,1,3,2,2
+0,2,3,1,3,3,1
+0,2,3,1,3,3,2
+0,2,3,1,3,4,1
+0,2,3,1,3,4,2
+1,2,3,2,1,1,1
+1,2,3,2,1,1,2
+0,2,3,2,1,2,1
+0,2,3,2,1,2,2
+0,2,3,2,1,3,1
+0,2,3,2,1,3,2
+0,2,3,2,1,4,1
+0,2,3,2,1,4,2
+1,2,3,2,2,1,1
+1,2,3,2,2,1,2
+0,2,3,2,2,2,1
+0,2,3,2,2,2,2
+0,2,3,2,2,3,1
+0,2,3,2,2,3,2
+0,2,3,2,2,4,1
+0,2,3,2,2,4,2
+1,2,3,2,3,1,1
+1,2,3,2,3,1,2
+0,2,3,2,3,2,1
+0,2,3,2,3,2,2
+0,2,3,2,3,3,1
+0,2,3,2,3,3,2
+0,2,3,2,3,4,1
+0,2,3,2,3,4,2
+1,3,1,1,1,1,1
+1,3,1,1,1,1,2
+0,3,1,1,1,2,1
+0,3,1,1,1,2,2
+0,3,1,1,1,3,1
+0,3,1,1,1,3,2
+0,3,1,1,1,4,1
+0,3,1,1,1,4,2
+1,3,1,1,2,1,1
+1,3,1,1,2,1,2
+0,3,1,1,2,2,1
+0,3,1,1,2,2,2
+0,3,1,1,2,3,1
+0,3,1,1,2,3,2
+0,3,1,1,2,4,1
+0,3,1,1,2,4,2
+1,3,1,1,3,1,1
+1,3,1,1,3,1,2
+0,3,1,1,3,2,1
+0,3,1,1,3,2,2
+0,3,1,1,3,3,1
+0,3,1,1,3,3,2
+0,3,1,1,3,4,1
+0,3,1,1,3,4,2
+1,3,1,2,1,1,1
+1,3,1,2,1,1,2
+0,3,1,2,1,2,1
+0,3,1,2,1,2,2
+0,3,1,2,1,3,1
+0,3,1,2,1,3,2
+0,3,1,2,1,4,1
+0,3,1,2,1,4,2
+1,3,1,2,2,1,1
+1,3,1,2,2,1,2
+0,3,1,2,2,2,1
+0,3,1,2,2,2,2
+0,3,1,2,2,3,1
+0,3,1,2,2,3,2
+0,3,1,2,2,4,1
+0,3,1,2,2,4,2
+1,3,1,2,3,1,1
+1,3,1,2,3,1,2
+0,3,1,2,3,2,1
+0,3,1,2,3,2,2
+0,3,1,2,3,3,1
+0,3,1,2,3,3,2
+0,3,1,2,3,4,1
+0,3,1,2,3,4,2
+1,3,2,1,1,1,1
+1,3,2,1,1,1,2
+0,3,2,1,1,2,1
+0,3,2,1,1,2,2
+0,3,2,1,1,3,1
+0,3,2,1,1,3,2
+0,3,2,1,1,4,1
+0,3,2,1,1,4,2
+1,3,2,1,2,1,1
+1,3,2,1,2,1,2
+0,3,2,1,2,2,1
+0,3,2,1,2,2,2
+0,3,2,1,2,3,1
+0,3,2,1,2,3,2
+0,3,2,1,2,4,1
+0,3,2,1,2,4,2
+1,3,2,1,3,1,1
+1,3,2,1,3,1,2
+0,3,2,1,3,2,1
+0,3,2,1,3,2,2
+0,3,2,1,3,3,1
+0,3,2,1,3,3,2
+0,3,2,1,3,4,1
+0,3,2,1,3,4,2
+1,3,2,2,1,1,1
+1,3,2,2,1,1,2
+0,3,2,2,1,2,1
+0,3,2,2,1,2,2
+0,3,2,2,1,3,1
+0,3,2,2,1,3,2
+0,3,2,2,1,4,1
+0,3,2,2,1,4,2
+1,3,2,2,2,1,1
+1,3,2,2,2,1,2
+0,3,2,2,2,2,1
+0,3,2,2,2,2,2
+0,3,2,2,2,3,1
+0,3,2,2,2,3,2
+0,3,2,2,2,4,1
+0,3,2,2,2,4,2
+1,3,2,2,3,1,1
+1,3,2,2,3,1,2
+0,3,2,2,3,2,1
+0,3,2,2,3,2,2
+0,3,2,2,3,3,1
+0,3,2,2,3,3,2
+0,3,2,2,3,4,1
+0,3,2,2,3,4,2
+1,3,3,1,1,1,1
+1,3,3,1,1,1,2
+1,3,3,1,1,2,1
+1,3,3,1,1,2,2
+1,3,3,1,1,3,1
+1,3,3,1,1,3,2
+1,3,3,1,1,4,1
+1,3,3,1,1,4,2
+1,3,3,1,2,1,1
+1,3,3,1,2,1,2
+1,3,3,1,2,2,1
+1,3,3,1,2,2,2
+1,3,3,1,2,3,1
+1,3,3,1,2,3,2
+1,3,3,1,2,4,1
+1,3,3,1,2,4,2
+1,3,3,1,3,1,1
+1,3,3,1,3,1,2
+1,3,3,1,3,2,1
+1,3,3,1,3,2,2
+1,3,3,1,3,3,1
+1,3,3,1,3,3,2
+1,3,3,1,3,4,1
+1,3,3,1,3,4,2
+1,3,3,2,1,1,1
+1,3,3,2,1,1,2
+1,3,3,2,1,2,1
+1,3,3,2,1,2,2
+1,3,3,2,1,3,1
+1,3,3,2,1,3,2
+1,3,3,2,1,4,1
+1,3,3,2,1,4,2
+1,3,3,2,2,1,1
+1,3,3,2,2,1,2
+1,3,3,2,2,2,1
+1,3,3,2,2,2,2
+1,3,3,2,2,3,1
+1,3,3,2,2,3,2
+1,3,3,2,2,4,1
+1,3,3,2,2,4,2
+1,3,3,2,3,1,1
+1,3,3,2,3,1,2
+1,3,3,2,3,2,1
+1,3,3,2,3,2,2
+1,3,3,2,3,3,1
+1,3,3,2,3,3,2
+1,3,3,2,3,4,1
+1,3,3,2,3,4,2
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 b/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3
new file mode 100644
index 0000000000000000000000000000000000000000..5c9a3b2e2e26154295fad229865852523cae2b78
GIT binary patch
literal 5724
zcmeHLZEqVz5cb{K3ADTwM4%|DkU%Jviq+7xs)Ph02}ty)5taBxb=o_xy;ja=>t57G
zq7r-vO{M*oh2O#-;{8X!%<TF0?&9K$j6y19E#J=0J~O*Je#TzUcby-<T5D00m!>T0
zv!okLV%{CZY|NE>ewU@}Gm?+g)$i3KwISQyco-{rr3@cMDUTx-CfzZM*pSDGk}qtB
zlVI<z9;+j;T+2<oaO5Z5!bZuLy)2rJg1wM?J{kd9zE%OVIA;6cblvl&`wtDgxBCgs
zY_njG_tC7fEp62eSjvZ!coeWu$%{Ls(UH3I^EY3=*n6z*DS5ekqJFwR<w;j}O1@IE
z+QlhYE04#8nR^cVd$a6g6!rN-aN@HBVtK)f_yg~A)Pb>hOLk^5MYRu+R`T^I8&CK3
zXE4sfbVNshV0tkRCQ*{cS&-s6I|Nj66+SbJ0O;J-f!eWrg@@U2E!R0v;$7LHBcO@>
z<!$xkP|rHj`d0ONy*qzHqlf!(s{AMS;bXEkm9##9_YK8CC)*Knp_l!DL0X-}eI7%s
z4>MoBTG}m$96}5M_6G4}>}P`kk73!3pRU=OVJCW8jho%1ooMY-05?9}DFK_r@H8<>
zyf5w1rs1pDtg^KQ%)EQeJ^-5Ts1%s_2zc+y=lpC0Srb7jc9L)uV4k`yNCpde@7nkj
z7sX5Bl1Y5ArZZVw7VF{)B%^p+yd&PjbP^loWV(*&A-2R#aSIbe+}1fEK7?ElAI%~9
zu@7%f2<v34*_12{(qKx{S~Q?$l5y2cI=0{fMYCNPRI^<e)Pm)YO`DaPNoJ*H(y;{>
zD3sVH8jDX05{oZ@ulCZ6h1?JYV;Az3PG>@rAGT1K!`D@{u^t+4g`_>y(3VvwYJyp?
zD%D`cu!=IH0#FkSqAJy3#juJpqXJM945BL4V8yVCGNS@e6AYp%)nLW2iZY`DP!kNI
zD$VDs?K(^I6|XA=Ul)GE+Hk3`EV^0X24P4~t;L+fG3qstWH=%rp+mhkMj*-v&Rymb
zz%_^fgq~Spu+Swo)L}y%HmyC^pOrYwXRPD4mu76c%2;V_(S`Y`_X69tY~Y21{BwX?
zj^%&=ZyN*$m5xQvmgP9aThJhckO0dKK?gqU7EGXqIEO5@S`NHm0-b>o=wKO#JR>h*
zMO^#5e+J?(pS5kbvovc@a4!Hh!V4G!Z|WeYaYVn`q{SqsAwEfiy^-`Z;kBA($%po(
zvH91g_KiMsn09vlJo^!t&)1H-^55}QTGy4>`q!H*QAhKKD2u<xrp;&wnq1Ufjxff#
zM5vtRxe4(SA3)Wmn_d?e!1n+5Cq2+v+4pQ8C)@vg#;&-lWyYoo|MEY*EU-Ge0>}<a
z(-<me)I;@*`Wtz_mG?LE{ucB>$daUbVOX!$AZPW%m_<V@Al}2iR5XN(22?px7N=0-
zknJR$Ocnf@M2Sz8Yf;6HQ`xTVLD3TGxmabK)porxPZ^aR@8{b}po$__yg|r@B)-AI
vaPlDW`q>m}DNr}{Lgw=@QHM$9LtqYd;~1k~#INEv@w@m#{3-t8k5%^{MmfBy

literal 0
HcmV?d00001

diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq b/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq
new file mode 100644
index 0000000000000000000000000000000000000000..34aeff80b70eaa803ee1c0a02805fe86430545a7
GIT binary patch
literal 6016
zcmd5=U1%HG6`mP;9mmbq+r^{GU^cGU3~A!n8vS_fg*?p2)?Xw=maX4zab`w8mNFV?
zrIBSh@os3LloFPOX17oZH7$j#NoiP0-dchm_MwmcflyfbL-wVxB~K;ngXuYUMzSo~
z>$DELJBhFFJ@?*ozVn@P@3}ZC#CUd)eTB7!*dccCWr`Y|qbZ8wv2kGIWGQx<bxg63
zhGF3lJ2Wv%ou{2vnup16rUsyWG)yxYYK0Cm+YGhM(Ax~d-KQtmGvn?-YG~a0f~4y;
zo@FeLhArE)<u1*6x#4{^<5Qq`kYy;aguf{Kbqt~u`^MpjrtY;7y-e9?j+vxhp)Jf9
zHAdT*Yt%LR+JTna8|Z=QFkt?Hw)~Oi{>5<jm}7vBvDOgc2kJ&%MZyf~z?V(d+k??z
zuy$EDG1A|9LO&gNdiLSjhod*2{NvP<(esp@wr>tn-2I^*U`_+rKWWQ{H22{P+}<Do
zMpXVk4Blhd+jI{;Gpg^?_uC}Cy&Yv%zWv_FuP($t5aXxsFdvM(dnb7APE@2XM0cni
zii*c~_J*kZ5H&YyjM^>t;B=1*F2jXi3|M}^aQ{BV-9L2<x98cZ5DRGs+>V9zxy7Cn
z8yBz)e*Cc450}C7`}Ez8A3y!%-Cs7}`fK>D$A7x`_~$o%BEIvt$>~?4clhtSKK=94
z$I;KCdkm#9;KgUq@BE{<-7`I2xD7Av4Onh7+};THV1#%<Y#thjqcdciW}`hJFjFL&
zZKufF7k>Ql?#-p;@bA8t`1Iqy|N1vO4`^y)bccK++^x|bc(Vq)A2F8S)7*bv<aS>?
zCQtg{*$LA??K?D^>Vd~wX*&?_+i1fZqX&kq)M;UO=+-2CT8`ZP<Q7ezdnkgAYY~<j
ze1qmb`bHN-MV0Dx1L9NK@)6Db_9gDtO9Ud(eF%fh_-!|39jYs9xw6E5onp_NW6V3{
z92J@XIQtSupTyZGarH^ueUjM&5(3(kz$G?5F@E!87u-qjos(V<Cq0`dJryTS+mq&$
z<!7T1M{d}%`?WEO8lSQbRV6Je)vdcDH*7^Mo7-4bDr%#w)org&e&>emI>?}Bt4UfR
zXPcU|X;L`{0<EE{FskcPO^2bK^ea{9y4|y6ki!AL^<5Yk8dr(>oT>Pro_sN<W>*i&
z3}CBztBM9~4XvnDvId4mx!USXROC`l(GgoYr%PE$mtgq~TTX-JMXi8)<15}vbGEH6
zFb7fCNS#I%2Y~F0d$1^yo-0&pMMY8xCkOBpHGQrRjn*iyws#Q<oM5a{%jRl1<X5jX
za$8$d0Ndn%n06eFh={)&M9#wz5%Kd!<ah>-T!$kf;^&X(S)%4X91#&ee?+cl;Ar-6
zL`3}j5givbQd9a7b;Mr`B62U)a*|%Dk?TS&$~CFh!rP+Sl4`X|6CtkWYV~48L+R|a
zeVVty8MT=JyXN?5$F$S7MOn`XvDnq*>qf=4K4NF^+b9GCL0Ezx{H_RsOArf!R}cz<
zr$A()cwoMX!o|+KzW{4-3?I=#=0RsufPL^Am556s%)uB|24KDTTA-{pGQNZ><%q1y
z{)8@T;o{YzDCb(inv`^350ol}fEHYn%ULy0Y!(7#_nMrH<C;b`xls>!gree%ttr|P
z$b_ajUROMU*}y`akKi6HV$Y-moZrYN?Dc>rSdtwZyqt9B(MBe<rYbF1@6`cMzy3^F
zRb^i+FPFV~W*OG`+`Q~-qMl|N_NP;keAcfv$($$1%Vibx$J%&#IVHMN&X`;AA#R~4
zJA!X!k`cS%53go@UeFO+LtnvfRW3)?Gd|!2F?oWC#9~^%>Iu&GVBf^`NoT#-!Q~0A
zW&N>McsYRB%2F~>&v@q>=_Icb9?M>P)*sj)b~O<=1>X31doCrK^O3czFQLiKU^VUV
z)=j?q@B?S|s~%!Q&j9X>&udR3FMhzHC3J{oM^7e&@f0>OE}$2<4yqZ94dBcs6Z84y
zCi>iqv1QoKR5Qtq>nTS&CPEQ-vMZ%Pk;K{Oh1~IOc5@53?U`hDU4a;+lTES?F)4nv
zJ4RJmTXG%2Kjamk@6u!B#zGi51-&s<DJMu?X8l1vlj7$?9?aujjA3HCFL&B8Y7zbp
z%bybXsAe4VEh!mUlMS8`Q!|(7j+NTTE))#^k_me{83WE2fop<=mlN|{dJ-g8!sPtS
zx8S_BGD$oyqFvDvYCZ*L0%O<CZxdt0uzge>8TtFw&LjTw^C<o$<k8aqm`CZ)&7(;1
zE1gFr8O|r%tw-EPkviO+n<PeGY#z0aypPQK0B0P|(|pUklj~A4*h;5@ZgP)mq8s`x
z*{P7&WXp+`;!xM+BHq^lN2nw;!plLp7oawE?x>8aXpySyaFH4iUUs!YrEoEi^$<&=
z-@K2ZY>HSq#ezo=JWZ^n38Sn++4R)0bSAMeifnvleP#v*q8P)XODe12!|6^b2PsbZ
z46bpex&GSB3@OixIAhqyj`?|_oq2o{2m+2BD;H;G=3ubs#X@gZ`ZRV8tJuT$f*=@o
zG?`k)rv=J{7-`8o>5)cEoA?2)@jk$@-nPFKkK?i~kjDFh?8PaO)Vm1rF9enlpCEQU
zLdZ(eTq&9O1rdma7tR8X7fpQ6z#pldcHCMAjNSaf6X(Epv5g-dIXE$gV}8ZN*X9rX
zC6A&b_zUB^3rK7VfY&bw6<m*>>7NlALj<jNq2oXCZ?Od9i*;za^Mg=h>?Ev`?M8kO
z|5|7Vf)qw*kcaRNG`}U{<bJ5HwEZ#p3~r+RB}^u;4=Jb!3m7_3-U*r+&t+6eK<-Zo
z!hSr%OK6wa1E@;-{Rx{E0T<cY#<x!b$sg1QCUKE)`7&iaTa{{WHgftUrCPm2-piBb
agLsm>l*ezv->3TjQ-6U!X-461@Bal0yk|%N

literal 0
HcmV?d00001

diff --git a/test/data/openml_cache/org/openml/www/datasets/333/description.xml b/test/data/openml_cache/org/openml/www/datasets/333/description.xml
new file mode 100644
index 00000000..4c00296e
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/333/description.xml
@@ -0,0 +1,33 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>333</oml:id>
+  <oml:name>monks-problems-1</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: Sebastian Thrun (Carnegie Mellon University)  
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/MONK's+Problems) - October 1992  
+**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)   
+
+**The Monk's Problems: Problem 1**  
+Once upon a time, in July 1991, the monks of Corsendonk Priory were faced with a school held in their priory, namely the 2nd European Summer School on Machine Learning. After listening more than one week to a wide variety of learning algorithms, they felt rather confused: Which algorithm would be optimal? And which one to avoid? As a consequence of this dilemma, they created a simple task on which all learning algorithms ought to be compared: the three MONK's problems.
+
+The target concept associated with the 1st Monk's problem is the binary outcome of the logical formula:  
+MONK-1: (a1 == a2) or (a5 == 1)
+
+In this dataset, the original train and test sets were merged to allow other sampling procedures. However, the original train-test splits can be found as one of the OpenML tasks. 
+
+### Attribute information: 
+* attr1: 1, 2, 3 
+* attr2: 1, 2, 3 
+* attr3: 1, 2 
+* attr4: 1, 2, 3 
+* attr5: 1, 2, 3, 4 
+* attr6: 1, 2 
+
+### Relevant papers  
+The MONK's Problems - A Performance Comparison of Different Learning Algorithms, by S.B. Thrun, J. Bala, E. Bloedorn, I. Bratko, B. Cestnik, J. Cheng, K. De Jong, S. Dzeroski, S.E. Fahlman, D. Fisher, R. Hamann, K. Kaufman, S. Keller, I. Kononenko, J. Kreuziger, R.S. Michalski, T. Mitchell, P. Pachowicz, Y. Reich H. Vafaie, W. Van de Welde, W. Wenzel, J. Wnek, and J. Zhang. Technical Report CS-CMU-91-197, Carnegie Mellon University, Dec. 1991.  </oml:description>
+  <oml:description_version>1</oml:description_version>
+  <oml:format>ARFF</oml:format>
+  <oml:creator>Sebastian Thrun</oml:creator>     <oml:collection_date>1992-10-01</oml:collection_date>  <oml:upload_date>2014-08-26T17:11:18</oml:upload_date>
+  <oml:language>English</oml:language>  <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/52236/monks-problems-1.arff</oml:url>
+  <oml:parquet_url>http://openml1.win.tue.nl/dataset333/dataset_333.pq</oml:parquet_url>  <oml:file_id>52236</oml:file_id>  <oml:default_target_attribute>class</oml:default_target_attribute>        <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation>  <oml:tag>artificial</oml:tag><oml:tag>mythbusting_1</oml:tag><oml:tag>OpenML100</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_123</oml:tag><oml:tag>study_135</oml:tag><oml:tag>study_14</oml:tag><oml:tag>study_144</oml:tag><oml:tag>study_15</oml:tag><oml:tag>study_20</oml:tag><oml:tag>study_34</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>uci</oml:tag>  <oml:visibility>public</oml:visibility>  <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/MONK's+Problems</oml:original_data_url>  <oml:paper_url>https://link.springer.com/article/10.1023/A:1022622132310</oml:paper_url>  <oml:minio_url>http://openml1.win.tue.nl/dataset333/dataset_333.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2020-11-20 18:58:56</oml:processing_date>      <oml:md5_checksum>6cd008dccee6a34420c091dfe7cdb457</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml b/test/data/openml_cache/org/openml/www/datasets/333/features.xml
new file mode 100644
index 00000000..6cca4738
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/333/features.xml
@@ -0,0 +1,84 @@
+<oml:data_features xmlns:oml="http://openml.org/openml">
+    <oml:feature>
+    <oml:index>0</oml:index>
+    <oml:name>class</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>true</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>1</oml:index>
+    <oml:name>attr1</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>2</oml:index>
+    <oml:name>attr2</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>3</oml:index>
+    <oml:name>attr3</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>4</oml:index>
+    <oml:name>attr4</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>5</oml:index>
+    <oml:name>attr5</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+          <oml:nominal_value>4</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>6</oml:index>
+    <oml:name>attr6</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+  </oml:data_features>
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..03189bf8369f06b88fd0cd4bd39fe6a701908ab6
GIT binary patch
literal 509
zcmZ|Ky-ve05C`zMiBnZbMGR%)0br@<2Rs2{38+tiB9|H*DR!&aMlc}3ejCS&@&p_q
zrXGT)lka~2v){7MFP60D6;F!<PPRK~bUob&C#-a~Ey@QeoNpw;(Tdu)H&@iX?(RPD
z>>UXMugpe1BZXu}Xd34CwXhap^b=OPmmS>=%~n@N)Cl9QDxXB{rNujnVCM*Ih7u;m
z>$Pl3U0GY1$1qbQeqAt?{%%w`)kEOIxn_oHv=3JYl%~eO=9ITUsSaB?h~yy1d7o4h
tbsSV0sWhlXzX~RcOcu<t-|RQ>Nb#U9{!_q~e?QA3nFo2<FS|GT={GujtWp2~

literal 0
HcmV?d00001

diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff b/test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff
new file mode 100644
index 00000000..f86981d5
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff
@@ -0,0 +1,707 @@
+@relation 'aus4'
+@attribute 'A1' {'0','1'}
+@attribute 'A2' numeric
+@attribute 'A3' numeric
+@attribute 'A4' {'1','2','3'}
+@attribute 'A5' {'1','2','3','4','5','6','7','8','9','10','11','12','13','14'}
+@attribute 'A6' {'1','2','3','4','5','7','8','9'}
+@attribute 'A7' numeric
+@attribute 'A8' {'0','1'}
+@attribute 'A9' {'0','1'}
+@attribute 'A10' numeric
+@attribute 'A11' {'0','1'}
+@attribute 'A12' {'1','2','3'}
+@attribute 'A13' numeric
+@attribute 'A14' numeric
+@attribute 'A15' {'0','1'}
+@data
+"1",65,168,"2","4","4",39,"0","0",1,"1","2",32,161,"0"
+"0",72,123,"2","8","4",5,"0","0",1,"0","2",53,1,"0"
+"0",142,52,"1","4","4",31,"0","0",1,"1","2",98,1,"0"
+"0",60,169,"1","5","3",1,"1","1",12,"1","2",1,1,"1"
+"1",44,134,"2","6","4",46,"1","1",15,"0","2",18,68,"1"
+"0",4,20,"2","8","8",37,"1","1",3,"0","2",32,1,"1"
+"1",16,119,"2","3","4",4,"0","0",1,"0","2",18,50,"0"
+"0",330,94,"2","11","8",66,"1","1",7,"0","2",12,120,"1"
+"1",126,33,"1","2","8",65,"0","0",1,"0","2",61,117,"0"
+"0",315,125,"2","4","8",100,"1","1",4,"1","2",32,36,"0"
+"1",183,52,"2","14","8",81,"1","1",5,"1","2",89,144,"1"
+"1",248,102,"2","11","8",85,"1","1",7,"1","2",148,1,"1"
+"1",49,40,"1","8","8",34,"1","1",4,"1","2",45,79,"0"
+"1",198,102,"2","14","8",104,"1","1",7,"1","2",1,150,"1"
+"1",329,69,"2","8","4",56,"0","0",1,"1","2",107,1,"0"
+"1",284,116,"2","4","4",2,"0","0",1,"0","2",1,196,"1"
+"1",142,95,"2","9","4",104,"1","1",3,"1","2",109,1,"1"
+"0",33,139,"2","6","4",20,"1","1",3,"0","2",26,124,"1"
+"1",42,40,"1","4","4",4,"0","0",1,"0","2",45,5,"0"
+"0",69,111,"2","11","4",60,"1","1",8,"0","2",41,202,"1"
+"0",129,20,"2","6","4",2,"0","0",1,"0","2",92,151,"0"
+"0",35,20,"1","6","4",16,"1","0",1,"1","2",53,1,"0"
+"1",246,42,"2","2","4",5,"0","0",1,"0","2",57,1,"0"
+"1",250,52,"2","4","4",6,"1","0",1,"0","2",53,1,"0"
+"1",38,146,"2","6","4",21,"0","0",1,"0","2",24,101,"0"
+"1",176,46,"2","13","8",93,"1","1",4,"1","2",1,1,"1"
+"1",70,5,"1","4","4",4,"0","0",1,"0","2",69,44,"0"
+"1",181,74,"1","8","8",48,"1","1",2,"1","2",63,235,"1"
+"0",155,175,"2","8","4",47,"1","1",2,"0","2",77,20,"1"
+"1",77,66,"2","8","4",28,"1","1",12,"1","2",18,186,"1"
+"1",118,26,"2","8","8",78,"1","1",4,"1","2",106,66,"1"
+"0",47,160,"1","14","8",1,"0","0",1,"1","2",51,28,"0"
+"1",305,43,"1","8","8",114,"1","0",1,"1","2",69,50,"0"
+"1",77,169,"2","9","8",50,"1","1",12,"1","2",100,92,"1"
+"1",259,40,"2","7","4",123,"0","1",2,"1","2",115,54,"0"
+"1",348,204,"1","1","1",2,"0","1",3,"0","2",1,102,"0"
+"1",95,180,"2","6","4",65,"1","0",1,"1","1",3,1,"1"
+"1",236,191,"2","9","4",111,"1","1",7,"1","2",20,1,"1"
+"0",282,132,"2","8","4",106,"1","1",7,"1","2",1,164,"1"
+"0",280,73,"2","14","4",123,"1","1",3,"1","2",155,178,"1"
+"1",78,1,"2","13","4",3,"1","0",1,"0","2",1,1,"1"
+"1",71,46,"1","6","4",15,"0","0",1,"1","2",38,42,"0"
+"1",115,37,"2","14","8",31,"1","0",1,"0","2",1,216,"1"
+"1",338,18,"2","8","4",16,"1","1",4,"1","2",63,1,"0"
+"1",84,14,"1","8","4",2,"0","1",3,"0","2",40,7,"0"
+"0",50,156,"2","11","4",19,"1","1",3,"1","2",14,1,"1"
+"0",90,52,"1","8","4",5,"0","0",1,"0","2",43,1,"0"
+"1",8,2,"2","8","4",2,"0","0",1,"0","2",1,1,"1"
+"0",141,55,"1","10","8",47,"0","0",1,"0","2",91,18,"0"
+"0",308,196,"2","8","4",93,"1","1",15,"0","2",1,188,"1"
+"1",173,82,"2","4","4",14,"0","0",1,"1","2",84,1,"0"
+"1",53,90,"1","3","8",2,"0","0",1,"0","2",45,50,"0"
+"1",129,5,"1","4","4",3,"0","0",1,"0","2",76,185,"0"
+"1",34,52,"1","8","4",54,"0","0",1,"1","2",36,7,"0"
+"1",123,78,"1","11","8",87,"0","1",3,"1","2",119,2,"0"
+"1",126,46,"2","9","4",47,"1","1",12,"1","2",140,30,"1"
+"1",239,119,"2","6","5",73,"1","1",2,"0","2",1,115,"1"
+"0",216,66,"2","3","8",6,"0","0",1,"0","2",92,86,"0"
+"1",257,101,"1","9","4",69,"1","0",1,"1","2",16,173,"1"
+"1",321,177,"2","7","4",31,"1","1",5,"1","2",69,1,"1"
+"1",262,102,"2","3","5",52,"0","0",1,"1","2",46,1,"0"
+"0",84,25,"2","9","4",7,"0","1",2,"1","2",86,5,"0"
+"1",28,55,"2","3","4",37,"1","1",3,"0","2",38,94,"1"
+"0",243,82,"2","3","5",14,"0","0",1,"0","1",170,1,"0"
+"0",90,17,"2","11","8",37,"1","0",1,"0","2",98,141,"1"
+"1",253,149,"2","14","8",107,"1","1",9,"0","2",1,1,"1"
+"0",38,6,"2","11","4",2,"0","0",1,"1","2",124,1,"0"
+"1",58,169,"2","3","4",14,"1","0",1,"1","2",32,43,"0"
+"1",160,71,"2","1","1",1,"0","1",6,"0","2",61,64,"0"
+"1",120,48,"2","13","8",44,"1","1",13,"1","2",160,131,"1"
+"1",290,214,"1","1","1",1,"1","0",1,"1","2",1,1,"0"
+"0",152,43,"2","9","8",2,"0","1",4,"0","2",1,29,"0"
+"1",140,40,"2","9","4",43,"0","0",1,"0","2",69,1,"0"
+"1",130,103,"1","8","5",37,"1","1",9,"1","2",47,8,"1"
+"1",240,208,"2","10","9",131,"1","1",12,"0","2",1,158,"1"
+"1",211,92,"2","11","4",73,"0","0",1,"0","2",144,36,"0"
+"1",102,12,"2","4","8",73,"0","0",1,"1","2",111,1,"0"
+"1",145,82,"2","8","4",5,"0","0",1,"0","2",76,1,"0"
+"1",77,1,"2","4","4",26,"0","1",12,"0","1",1,1,"0"
+"0",171,45,"2","9","4",28,"1","1",17,"0","2",38,184,"1"
+"1",97,82,"2","13","4",17,"1","1",8,"0","2",1,223,"1"
+"0",200,85,"2","1","1",1,"0","1",7,"0","2",1,77,"0"
+"0",29,151,"2","2","4",11,"0","0",1,"0","2",24,34,"0"
+"1",238,102,"2","3","5",6,"0","0",1,"0","2",158,1,"0"
+"1",81,33,"2","8","4",14,"0","0",1,"1","1",98,1,"0"
+"1",219,1,"2","8","4",1,"0","0",1,"0","3",65,1,"1"
+"0",92,189,"2","11","8",37,"0","0",1,"0","2",98,2,"0"
+"0",276,185,"2","3","5",89,"1","1",10,"1","2",1,1,"1"
+"1",190,106,"2","9","4",3,"0","0",1,"1","2",100,7,"1"
+"1",66,20,"1","1","1",1,"0","0",1,"0","2",32,1,"0"
+"1",125,41,"2","4","8",7,"0","0",1,"1","1",45,1,"0"
+"1",258,89,"2","6","4",2,"0","0",1,"0","2",34,50,"0"
+"1",135,193,"2","2","4",4,"0","0",1,"0","2",1,93,"0"
+"1",208,102,"2","8","5",58,"1","1",7,"0","2",1,104,"1"
+"0",24,151,"1","11","8",5,"0","0",1,"0","2",111,1,"0"
+"1",55,46,"2","9","4",37,"0","0",1,"0","2",49,9,"1"
+"0",228,50,"2","6","4",7,"0","0",1,"0","2",1,106,"0"
+"1",167,2,"1","7","4",2,"0","0",1,"0","2",1,1,"0"
+"0",15,144,"2","6","4",43,"0","1",11,"1","2",1,11,"0"
+"0",47,29,"2","11","4",39,"1","1",2,"0","2",1,1,"1"
+"0",230,87,"2","8","4",65,"0","0",1,"0","2",149,1,"0"
+"1",226,9,"2","4","4",3,"1","0",1,"1","2",98,1,"1"
+"1",124,190,"2","9","4",95,"1","0",1,"1","2",150,115,"1"
+"1",125,20,"1","13","4",7,"1","1",3,"0","2",92,115,"1"
+"1",34,1,"1","1","1",1,"0","1",5,"0","2",13,2,"0"
+"0",95,30,"2","14","8",27,"1","0",1,"1","2",53,220,"1"
+"1",124,55,"2","14","8",26,"1","1",5,"0","2",45,224,"1"
+"1",67,139,"2","6","4",3,"0","0",1,"0","2",1,1,"0"
+"0",295,188,"2","4","8",110,"1","0",1,"1","2",1,1,"1"
+"1",173,66,"2","8","4",31,"0","0",1,"1","2",98,1,"0"
+"1",222,154,"1","4","4",4,"0","0",1,"0","2",53,1,"0"
+"1",279,119,"2","8","4",26,"0","0",1,"1","2",1,82,"0"
+"1",126,87,"1","3","8",95,"1","1",3,"1","2",22,1,"0"
+"0",204,31,"2","6","4",20,"1","1",5,"0","2",1,175,"1"
+"1",184,10,"2","3","5",76,"0","0",1,"1","1",136,1,"0"
+"1",189,3,"1","7","5",2,"1","1",2,"1","2",98,180,"1"
+"0",50,158,"2","13","8",9,"1","1",2,"1","2",24,36,"1"
+"1",181,33,"2","14","4",20,"1","1",8,"1","2",111,207,"1"
+"1",73,164,"2","11","4",58,"1","1",8,"1","2",32,140,"1"
+"1",290,136,"2","8","8",119,"1","1",10,"0","2",64,177,"1"
+"1",242,102,"2","8","4",85,"1","1",8,"0","2",1,201,"1"
+"1",49,29,"1","8","4",47,"0","0",1,"1","1",86,1,"0"
+"0",227,46,"2","1","1",1,"0","0",1,"0","2",23,1,"0"
+"0",323,205,"2","8","4",93,"1","1",8,"0","2",1,200,"1"
+"0",160,85,"2","13","8",17,"1","1",10,"1","2",64,1,"1"
+"1",64,27,"2","9","4",8,"0","1",2,"0","2",136,91,"0"
+"0",327,151,"2","11","4",76,"1","1",15,"0","2",1,176,"1"
+"1",137,13,"2","8","4",8,"0","0",1,"0","2",77,62,"0"
+"0",273,87,"2","5","3",1,"1","0",1,"0","2",32,148,"1"
+"1",52,73,"1","2","4",28,"1","1",9,"1","2",53,2,"1"
+"0",92,180,"2","6","4",37,"1","1",13,"1","2",38,121,"1"
+"0",51,17,"1","10","2",26,"0","0",1,"0","2",92,1,"0"
+"1",91,187,"1","1","1",1,"0","0",1,"0","2",65,1,"0"
+"0",112,69,"1","8","4",3,"0","0",1,"0","1",24,1,"0"
+"1",244,29,"2","1","1",1,"1","0",1,"0","2",42,2,"0"
+"0",223,93,"2","8","4",4,"0","0",1,"0","2",53,1,"0"
+"1",39,20,"2","1","1",1,"0","1",4,"0","2",114,137,"0"
+"1",232,144,"2","7","4",99,"1","1",15,"0","2",86,211,"1"
+"1",104,17,"2","8","4",36,"1","1",6,"1","2",106,1,"1"
+"0",274,73,"2","8","4",55,"1","1",9,"1","2",129,208,"1"
+"1",40,151,"1","4","8",23,"1","0",1,"1","2",45,1,"0"
+"0",67,40,"1","1","1",71,"0","0",1,"0","2",98,1,"0"
+"1",32,83,"1","1","1",1,"0","0",1,"1","2",63,2,"0"
+"1",339,206,"2","14","8",129,"1","1",10,"1","2",1,150,"1"
+"0",11,40,"2","11","4",7,"0","1",2,"0","2",34,48,"0"
+"0",343,196,"2","10","9",1,"1","1",15,"0","2",1,204,"1"
+"1",349,210,"2","10","9",120,"1","1",2,"1","2",1,53,"1"
+"0",5,72,"2","11","4",3,"0","0",1,"0","2",38,1,"0"
+"1",197,87,"2","2","5",119,"1","0",1,"1","2",65,1,"0"
+"0",280,132,"2","10","5",99,"1","1",7,"0","2",123,239,"1"
+"1",78,1,"2","8","4",1,"0","0",1,"0","3",65,1,"1"
+"1",270,46,"2","8","4",58,"1","0",1,"1","2",45,1,"0"
+"1",2,123,"2","10","4",26,"0","0",1,"0","2",161,1,"0"
+"1",32,14,"1","8","4",5,"0","1",2,"0","2",69,2,"0"
+"1",307,119,"2","4","4",98,"1","1",16,"0","2",1,231,"1"
+"1",35,1,"1","7","5",1,"0","0",1,"1","1",152,2,"1"
+"1",22,6,"2","11","7",6,"0","0",1,"0","2",69,32,"1"
+"1",218,29,"2","10","4",2,"0","0",1,"0","2",38,6,"0"
+"1",72,48,"1","9","4",67,"1","1",7,"0","2",24,1,"1"
+"1",283,91,"2","14","5",3,"0","0",1,"1","2",156,1,"0"
+"1",189,66,"2","8","4",26,"0","0",1,"0","2",146,17,"0"
+"0",180,49,"2","2","4",15,"0","0",1,"1","2",1,1,"0"
+"1",261,13,"1","8","4",10,"1","1",9,"1","2",102,70,"1"
+"1",193,88,"1","3","5",110,"1","1",8,"1","2",68,1,"1"
+"1",258,73,"2","3","5",26,"1","0",1,"0","2",1,77,"0"
+"1",25,6,"2","2","4",7,"0","0",1,"1","1",98,1,"0"
+"1",77,66,"2","1","1",3,"0","0",1,"1","2",32,209,"0"
+"0",70,135,"1","14","4",57,"0","0",1,"0","2",55,1,"1"
+"1",21,8,"2","6","4",2,"0","0",1,"0","2",98,135,"0"
+"1",27,159,"1","6","4",4,"1","0",1,"0","2",38,105,"0"
+"0",124,46,"2","7","4",47,"1","0",1,"0","1",118,1,"0"
+"0",33,141,"1","8","4",26,"1","1",5,"1","2",24,115,"1"
+"1",72,26,"2","3","4",39,"0","1",2,"1","2",131,10,"0"
+"1",336,183,"1","8","8",85,"1","0",1,"0","2",36,1,"0"
+"0",82,139,"2","11","4",110,"1","1",6,"1","2",38,1,"1"
+"1",201,77,"2","14","8",74,"1","0",1,"1","2",163,1,"0"
+"0",322,92,"1","1","1",85,"0","0",1,"1","2",1,5,"0"
+"0",309,6,"2","1","1",1,"0","0",1,"1","1",19,26,"0"
+"1",246,88,"2","13","8",101,"1","1",9,"0","2",107,1,"1"
+"1",255,103,"2","11","8",120,"1","0",1,"1","2",27,1,"1"
+"1",246,40,"1","9","4",7,"0","0",1,"0","2",1,75,"0"
+"0",186,26,"2","4","5",26,"1","1",4,"1","2",75,1,"0"
+"1",103,180,"2","13","4",30,"1","1",23,"1","2",45,89,"1"
+"1",88,120,"1","2","4",93,"1","0",1,"1","1",32,1,"1"
+"1",80,171,"1","9","4",23,"1","0",1,"1","2",53,94,"1"
+"1",155,66,"2","13","8",52,"0","0",1,"1","1",111,1,"0"
+"1",215,87,"2","8","5",85,"1","0",1,"1","1",98,1,"0"
+"1",110,47,"2","9","4",4,"0","0",1,"0","2",32,1,"0"
+"0",144,22,"2","9","4",7,"0","0",1,"1","2",102,1,"0"
+"0",76,54,"2","5","3",1,"0","1",2,"0","2",69,37,"0"
+"1",14,73,"2","4","4",2,"0","0",1,"1","2",53,32,"0"
+"1",139,195,"2","6","4",86,"1","1",6,"1","2",57,1,"1"
+"0",134,83,"2","3","5",14,"1","0",1,"1","2",59,1,"0"
+"1",194,1,"2","8","4",1,"0","0",1,"0","3",65,1,"0"
+"1",81,20,"2","8","8",3,"1","0",1,"0","2",63,1,"0"
+"1",95,167,"2","8","4",58,"1","1",18,"0","2",69,159,"1"
+"0",31,127,"2","11","4",62,"1","1",6,"0","2",65,236,"1"
+"1",13,10,"2","11","4",9,"0","1",5,"0","2",53,9,"0"
+"0",7,12,"2","1","1",1,"0","1",2,"0","2",53,59,"0"
+"1",178,66,"2","6","4",43,"0","1",3,"1","2",165,1,"0"
+"0",46,151,"2","8","8",26,"1","1",5,"0","2",15,174,"1"
+"0",61,52,"1","5","3",1,"0","0",1,"0","2",53,1,"0"
+"1",181,34,"2","12","8",99,"1","0",1,"1","2",55,237,"1"
+"0",99,57,"2","8","8",63,"1","0",1,"1","2",117,2,"0"
+"0",92,73,"2","11","8",44,"1","1",19,"0","2",1,115,"1"
+"1",157,1,"2","9","4",31,"1","1",2,"0","2",70,1,"1"
+"1",50,104,"1","5","4",8,"0","0",1,"0","2",45,74,"0"
+"0",241,133,"1","4","4",5,"0","1",3,"0","2",65,19,"0"
+"1",140,40,"2","8","8",7,"0","1",3,"1","2",131,52,"0"
+"1",244,17,"1","7","4",14,"0","0",1,"1","2",42,1,"0"
+"0",141,16,"2","4","4",15,"1","1",5,"0","2",124,115,"1"
+"1",312,17,"1","4","8",75,"1","0",1,"0","2",63,95,"1"
+"1",188,109,"1","8","4",37,"0","0",1,"1","2",18,1,"0"
+"1",95,175,"2","4","4",52,"1","1",3,"1","2",38,6,"0"
+"0",113,67,"1","1","1",1,"0","0",1,"1","2",63,41,"0"
+"0",180,97,"2","11","8",40,"1","1",3,"0","2",1,1,"1"
+"1",185,59,"2","8","4",37,"0","0",1,"0","3",38,1,"0"
+"1",207,109,"2","3","5",85,"0","0",1,"0","2",73,129,"0"
+"1",38,11,"2","4","4",8,"0","0",1,"0","2",98,103,"0"
+"1",87,30,"2","11","4",83,"1","1",3,"1","2",156,180,"1"
+"0",153,119,"2","8","5",76,"1","1",8,"1","2",1,201,"1"
+"1",42,123,"2","8","4",14,"0","0",1,"0","2",1,1,"0"
+"1",99,19,"2","8","4",8,"1","1",8,"1","2",30,214,"1"
+"1",142,99,"2","7","4",47,"0","1",2,"1","2",146,43,"0"
+"1",262,61,"2","3","5",20,"1","0",1,"0","2",159,1,"0"
+"1",136,38,"2","4","4",14,"1","0",1,"0","1",98,1,"0"
+"1",163,73,"1","3","5",101,"0","0",1,"0","2",1,2,"0"
+"1",45,150,"2","10","2",1,"1","0",1,"0","2",1,1,"1"
+"1",269,136,"2","13","8",124,"1","1",2,"1","2",26,180,"1"
+"0",74,62,"2","11","8",53,"1","1",8,"1","2",45,193,"1"
+"0",49,73,"2","11","4",5,"1","1",4,"0","2",32,7,"1"
+"1",126,46,"2","9","4",52,"0","1",2,"1","2",32,4,"0"
+"1",244,61,"1","14","8",115,"1","0",1,"1","2",61,1,"0"
+"1",318,215,"1","8","4",132,"1","1",22,"0","2",1,16,"1"
+"0",339,6,"2","1","1",1,"1","1",2,"0","2",84,50,"1"
+"0",69,167,"1","14","8",20,"1","1",5,"0","2",1,97,"1"
+"0",88,66,"1","3","5",81,"0","0",1,"0","2",69,111,"0"
+"0",345,115,"2","1","1",1,"0","0",1,"0","1",1,1,"0"
+"1",203,26,"2","4","4",37,"0","0",1,"1","2",83,1,"0"
+"1",287,175,"2","7","4",128,"1","0",1,"0","1",35,1,"1"
+"0",128,196,"1","10","9",1,"1","0",1,"0","2",1,232,"1"
+"1",200,66,"2","4","4",81,"1","1",8,"0","2",49,165,"1"
+"1",293,129,"2","3","5",105,"1","1",16,"1","2",1,213,"1"
+"1",350,109,"2","8","4",15,"1","0",1,"0","2",1,99,"0"
+"1",139,185,"2","2","8",14,"0","0",1,"0","2",81,1,"0"
+"0",12,17,"2","3","4",5,"0","1",7,"1","2",86,30,"0"
+"0",6,6,"2","6","4",26,"0","1",3,"1","2",107,2,"0"
+"1",333,193,"2","1","1",130,"1","1",16,"1","2",1,150,"1"
+"1",127,55,"2","4","8",77,"1","1",3,"1","2",64,1,"1"
+"1",249,47,"2","3","5",73,"0","0",1,"0","2",76,1,"1"
+"1",86,139,"2","6","4",7,"0","0",1,"1","2",1,1,"0"
+"1",90,186,"1","6","4",2,"0","0",1,"1","2",38,113,"0"
+"1",196,196,"2","12","7",92,"1","1",10,"1","2",1,61,"1"
+"1",185,40,"2","9","4",29,"0","0",1,"0","2",38,1,"0"
+"0",243,151,"2","11","8",43,"1","0",1,"0","2",8,143,"1"
+"1",47,54,"2","8","4",52,"1","1",2,"0","2",32,66,"1"
+"1",218,37,"1","2","4",37,"0","0",1,"1","2",138,1,"1"
+"1",288,92,"2","7","4",4,"1","0",1,"1","2",80,1,"1"
+"1",76,26,"2","7","4",14,"0","0",1,"1","1",107,1,"0"
+"1",199,81,"2","8","8",109,"0","0",1,"1","2",1,1,"0"
+"1",10,70,"2","6","4",18,"0","1",2,"0","2",24,22,"0"
+"1",72,160,"2","11","8",33,"1","0",1,"0","2",32,1,"1"
+"0",61,173,"2","8","4",7,"0","0",1,"1","2",63,1,"0"
+"0",96,67,"1","6","4",7,"1","0",1,"1","2",120,1,"1"
+"1",209,58,"1","9","4",3,"1","1",2,"0","2",15,157,"1"
+"0",134,85,"2","8","4",7,"0","1",2,"1","2",11,67,"0"
+"1",66,176,"2","8","4",72,"0","1",3,"1","2",63,72,"0"
+"1",190,47,"2","13","4",38,"1","1",2,"1","2",156,238,"1"
+"1",199,66,"2","3","4",26,"0","0",1,"1","2",73,1,"0"
+"1",134,84,"2","6","4",7,"0","0",1,"1","2",32,1,"0"
+"0",35,108,"2","3","8",8,"0","0",1,"0","2",24,114,"0"
+"1",79,33,"2","8","4",23,"1","0",1,"0","1",102,1,"1"
+"1",11,5,"2","8","4",5,"0","0",1,"0","2",43,1,"0"
+"0",43,5,"2","11","4",26,"0","1",2,"0","2",86,136,"1"
+"1",68,164,"2","9","4",47,"1","1",2,"0","2",24,90,"1"
+"1",188,104,"1","3","5",28,"0","0",1,"1","2",149,1,"0"
+"0",323,12,"2","3","5",26,"1","0",1,"1","2",88,187,"0"
+"1",9,29,"2","7","4",3,"1","0",1,"0","1",69,1,"0"
+"1",177,66,"2","13","8",63,"1","1",7,"0","2",53,183,"1"
+"1",286,211,"2","9","4",43,"1","1",4,"0","2",38,15,"1"
+"1",190,70,"2","3","5",58,"0","0",1,"1","2",84,77,"0"
+"1",26,39,"1","10","2",1,"0","0",1,"0","2",32,1,"0"
+"1",266,17,"2","7","4",116,"1","0",1,"0","1",131,1,"0"
+"1",228,52,"2","4","4",14,"0","0",1,"1","2",102,3,"0"
+"1",164,26,"1","6","4",73,"0","0",1,"1","2",107,1,"0"
+"1",96,51,"2","14","4",41,"1","1",2,"1","2",128,21,"1"
+"1",341,164,"2","10","9",131,"1","1",8,"1","2",5,1,"1"
+"1",247,1,"2","8","5",126,"1","0",1,"0","2",1,1,"1"
+"1",160,37,"2","1","1",1,"0","1",2,"0","2",30,20,"0"
+"0",214,98,"2","1","1",1,"0","0",1,"0","2",53,1,"0"
+"1",173,127,"2","10","5",39,"1","0",1,"1","1",136,1,"0"
+"0",168,76,"2","1","1",66,"0","1",3,"1","2",69,5,"0"
+"1",93,70,"2","8","4",52,"1","1",7,"0","2",65,125,"1"
+"1",63,18,"1","14","4",2,"1","1",2,"1","2",167,40,"1"
+"1",191,73,"2","13","8",103,"1","0",1,"1","2",1,1,"1"
+"1",302,73,"1","1","1",37,"0","0",1,"0","2",63,5,"0"
+"0",163,46,"2","1","1",1,"0","1",3,"1","2",69,51,"0"
+"1",224,24,"2","8","4",10,"0","1",3,"0","2",80,115,"0"
+"1",231,51,"2","14","4",4,"1","1",6,"1","2",149,1,"1"
+"1",24,65,"2","8","7",25,"0","1",3,"1","2",53,123,"0"
+"1",229,102,"2","13","4",73,"1","1",11,"1","2",1,1,"1"
+"1",42,165,"2","8","4",47,"0","0",1,"1","2",44,1,"0"
+"1",176,63,"2","2","8",95,"0","0",1,"1","2",101,1,"0"
+"1",29,112,"2","2","4",15,"0","0",1,"0","2",38,1,"0"
+"0",4,130,"2","11","4",4,"0","1",2,"1","2",1,69,"0"
+"1",233,114,"2","13","8",115,"1","1",15,"1","2",130,1,"1"
+"0",39,22,"2","9","4",41,"0","0",1,"0","2",77,6,"0"
+"1",56,160,"2","8","4",65,"1","0",1,"1","2",1,1,"1"
+"0",72,26,"2","8","4",47,"0","1",3,"1","2",69,107,"0"
+"1",103,78,"2","8","8",53,"0","1",2,"1","2",135,22,"0"
+"1",53,153,"1","10","8",31,"0","0",1,"0","2",92,1,"0"
+"0",291,46,"2","5","3",1,"1","0",1,"1","2",32,26,"0"
+"1",49,107,"2","11","4",10,"1","1",2,"0","2",53,1,"0"
+"0",40,9,"2","11","8",8,"1","1",12,"0","2",24,49,"1"
+"1",62,10,"2","2","8",18,"1","0",1,"1","2",1,1,"1"
+"1",116,18,"2","4","1",1,"0","0",1,"0","2",32,1,"0"
+"0",225,102,"2","13","4",122,"1","0",1,"1","2",169,1,"0"
+"0",296,181,"2","6","4",53,"1","1",4,"1","2",52,1,"1"
+"0",3,13,"2","8","4",26,"0","0",1,"0","2",38,19,"0"
+"0",90,183,"2","8","5",84,"1","1",3,"0","2",21,110,"1"
+"1",52,100,"1","9","4",52,"1","1",2,"1","2",24,94,"1"
+"1",65,164,"2","13","4",18,"1","0",1,"0","2",32,1,"1"
+"1",313,143,"2","1","1",125,"1","1",12,"1","2",9,94,"1"
+"1",23,109,"2","4","4",14,"1","0",1,"0","2",24,1,"1"
+"1",213,93,"1","4","4",7,"1","1",11,"1","2",107,1,"1"
+"1",221,154,"2","14","4",58,"1","1",7,"0","2",156,76,"1"
+"1",62,47,"2","4","4",3,"0","0",1,"1","2",116,1,"0"
+"0",303,119,"2","3","5",67,"0","0",1,"1","2",21,1,"0"
+"1",66,61,"2","3","4",4,"0","0",1,"0","2",53,11,"0"
+"1",306,46,"2","2","4",74,"0","0",1,"1","2",1,101,"0"
+"0",335,102,"2","6","4",76,"1","1",5,"0","2",1,49,"1"
+"1",44,110,"2","9","4",42,"1","0",1,"0","1",38,1,"1"
+"1",79,87,"2","8","5",7,"1","0",1,"1","2",53,1,"1"
+"1",207,15,"1","9","4",8,"0","0",1,"1","2",104,3,"0"
+"0",256,52,"1","8","4",1,"0","0",1,"1","2",49,2,"0"
+"1",83,29,"2","3","8",3,"0","0",1,"1","2",77,6,"0"
+"0",205,33,"2","8","4",47,"1","1",12,"0","2",1,111,"1"
+"1",236,102,"2","1","1",1,"0","1",3,"0","2",2,2,"0"
+"0",32,93,"1","8","8",65,"1","0",1,"0","2",86,1,"1"
+"1",120,21,"2","6","4",12,"1","0",1,"1","2",69,1,"0"
+"1",174,73,"2","2","4",5,"0","0",1,"1","2",38,1,"0"
+"1",37,46,"1","13","4",47,"1","0",1,"1","2",32,21,"0"
+"0",71,162,"2","11","4",11,"1","1",6,"1","2",1,120,"1"
+"1",238,118,"2","11","4",2,"1","1",2,"0","2",69,94,"1"
+"1",72,6,"2","8","3",52,"0","0",1,"1","1",1,1,"1"
+"0",81,27,"1","11","4",37,"1","1",3,"1","2",24,109,"1"
+"0",147,107,"2","10","2",52,"1","1",6,"1","2",31,115,"1"
+"0",241,128,"1","11","8",108,"1","1",15,"0","2",1,192,"1"
+"0",214,105,"2","10","4",85,"1","0",1,"1","2",1,206,"1"
+"1",158,57,"2","8","4",3,"0","0",1,"0","2",102,1,"0"
+"0",97,72,"2","14","8",24,"1","0",1,"0","2",117,1,"1"
+"1",122,180,"2","6","5",7,"0","0",1,"1","2",165,1,"0"
+"1",163,87,"2","14","4",85,"1","1",4,"1","2",100,190,"1"
+"1",189,119,"2","6","4",4,"1","0",1,"1","2",142,1,"0"
+"1",65,63,"2","4","4",20,"0","0",1,"0","2",63,1,"0"
+"1",159,75,"2","8","4",58,"0","1",3,"1","2",53,33,"0"
+"1",163,17,"2","8","5",23,"1","0",1,"1","1",107,1,"0"
+"1",6,76,"2","9","4",3,"0","1",2,"0","2",1,7,"0"
+"1",120,50,"2","13","8",87,"1","1",10,"0","2",130,142,"1"
+"1",97,73,"2","8","4",31,"0","1",2,"0","2",1,23,"0"
+"1",342,109,"2","10","9",121,"1","1",2,"1","2",1,1,"1"
+"0",32,144,"2","9","4",40,"1","1",7,"1","2",11,125,"1"
+"0",120,11,"2","7","8",4,"0","1",2,"1","2",95,52,"0"
+"0",39,22,"1","8","4",26,"0","1",2,"0","2",171,3,"0"
+"1",156,48,"2","2","4",16,"0","0",1,"1","1",1,1,"0"
+"0",297,29,"2","6","4",14,"0","0",1,"1","2",86,56,"0"
+"1",159,46,"1","9","4",2,"0","0",1,"0","1",53,1,"0"
+"1",301,196,"2","8","4",110,"1","1",10,"0","2",1,1,"1"
+"0",212,11,"2","1","1",1,"0","1",11,"0","2",69,19,"0"
+"1",184,70,"2","7","4",78,"1","1",7,"0","2",71,1,"1"
+"0",51,73,"2","6","4",2,"1","0",1,"0","2",32,1,"1"
+"1",207,202,"2","9","4",3,"0","0",1,"0","2",107,205,"0"
+"1",298,20,"2","1","1",1,"0","0",1,"0","2",48,1,"0"
+"0",83,28,"2","11","4",11,"0","1",2,"1","2",69,12,"0"
+"1",200,95,"2","14","8",95,"0","0",1,"1","1",164,1,"1"
+"0",328,207,"2","3","5",115,"1","1",14,"0","2",1,222,"1"
+"1",130,30,"2","7","4",25,"1","1",4,"1","2",129,1,"1"
+"1",163,21,"2","4","4",7,"0","0",1,"0","2",124,181,"0"
+"0",166,73,"1","5","3",1,"0","0",1,"0","2",53,21,"0"
+"1",37,119,"2","9","8",36,"1","1",8,"0","2",24,199,"1"
+"1",230,115,"2","7","4",32,"1","1",6,"1","2",34,155,"1"
+"1",13,79,"2","3","4",9,"0","0",1,"1","2",45,3,"0"
+"1",268,123,"1","8","4",40,"0","0",1,"0","2",53,3,"0"
+"1",9,1,"1","6","4",7,"0","0",1,"0","2",18,1,"0"
+"0",246,119,"2","11","4",14,"1","1",4,"1","2",48,1,"1"
+"1",137,196,"2","8","8",91,"1","1",12,"0","2",1,191,"1"
+"0",109,55,"2","5","3",1,"0","0",1,"1","2",97,2,"0"
+"1",225,80,"2","9","4",76,"1","1",15,"0","2",126,168,"1"
+"1",97,115,"2","8","4",26,"1","1",4,"0","2",1,1,"1"
+"0",129,13,"2","11","4",16,"1","1",5,"0","2",24,1,"1"
+"1",347,201,"2","1","1",1,"1","0",1,"1","2",1,1,"1"
+"1",70,169,"1","7","4",37,"0","0",1,"1","2",1,206,"0"
+"0",119,40,"2","1","1",1,"0","1",2,"0","2",27,94,"0"
+"0",35,137,"2","13","8",20,"1","1",8,"0","2",30,1,"1"
+"0",102,1,"2","8","4",1,"0","0",1,"0","3",65,1,"1"
+"1",163,2,"1","2","4",78,"0","0",1,"1","2",146,1,"0"
+"0",70,136,"2","11","4",43,"1","1",11,"0","2",24,149,"0"
+"0",75,40,"2","11","4",7,"0","0",1,"1","2",38,140,"0"
+"1",79,182,"2","8","4",4,"0","1",3,"0","2",1,217,"0"
+"0",141,36,"1","14","4",26,"0","0",1,"0","2",98,14,"0"
+"0",169,115,"2","2","4",31,"0","0",1,"0","2",95,1,"0"
+"1",118,46,"1","9","4",10,"1","0",1,"1","2",92,154,"1"
+"1",36,163,"2","8","5",16,"0","1",3,"1","2",69,8,"0"
+"1",204,64,"2","9","4",4,"0","1",3,"0","2",77,2,"0"
+"1",32,1,"2","11","4",18,"0","0",1,"0","2",53,2,"0"
+"0",136,85,"2","8","4",28,"1","1",2,"1","2",121,1,"1"
+"1",192,92,"2","3","5",71,"1","1",3,"0","2",96,126,"1"
+"0",182,73,"1","6","4",47,"0","0",1,"0","2",63,1,"0"
+"1",310,147,"2","10","4",112,"1","1",6,"0","2",1,1,"1"
+"1",98,33,"2","6","4",14,"0","0",1,"0","2",69,1,"0"
+"1",174,59,"1","4","1",1,"0","0",1,"0","2",38,1,"0"
+"0",53,102,"1","1","1",1,"0","0",1,"0","2",1,1,"0"
+"0",319,198,"2","5","1",1,"1","1",16,"0","2",1,87,"1"
+"1",172,6,"1","8","8",71,"1","1",2,"1","2",139,225,"1"
+"0",48,174,"2","8","8",96,"1","0",1,"0","2",111,1,"1"
+"0",150,109,"2","4","4",93,"0","0",1,"1","1",32,1,"0"
+"0",91,23,"2","6","8",43,"1","0",1,"0","2",131,1,"0"
+"1",78,166,"2","14","8",13,"1","1",2,"0","2",32,1,"1"
+"0",163,167,"2","1","1",1,"0","0",1,"0","2",65,215,"0"
+"0",173,18,"2","13","4",2,"1","0",1,"0","2",141,230,"1"
+"1",85,46,"2","2","8",45,"1","1",7,"0","2",69,98,"1"
+"1",214,5,"1","8","4",37,"0","0",1,"1","2",84,55,"1"
+"1",145,40,"1","4","4",7,"0","0",1,"0","2",79,1,"0"
+"0",86,17,"2","11","8",31,"1","1",2,"0","2",1,128,"1"
+"1",54,10,"1","8","8",7,"0","0",1,"0","2",98,78,"0"
+"1",162,197,"2","8","4",14,"1","0",1,"0","2",38,1,"0"
+"1",74,73,"2","7","4",32,"1","1",2,"0","2",92,139,"1"
+"1",218,52,"1","8","5",7,"1","0",1,"1","2",55,109,"0"
+"0",93,95,"2","9","4",26,"0","0",1,"1","2",117,7,"0"
+"1",209,86,"2","9","4",29,"1","0",1,"1","2",69,1,"0"
+"0",278,26,"2","11","8",63,"1","1",2,"0","2",110,145,"1"
+"1",170,87,"1","13","4",37,"0","0",1,"1","2",38,1,"0"
+"0",154,161,"2","11","8",3,"0","1",13,"1","2",41,4,"0"
+"1",126,47,"2","9","4",74,"1","1",6,"1","2",32,4,"1"
+"1",314,197,"2","10","9",1,"1","1",20,"0","2",50,60,"0"
+"1",7,26,"2","8","4",43,"1","1",6,"1","2",115,130,"1"
+"1",91,180,"2","9","4",24,"1","0",1,"1","2",92,1,"0"
+"1",175,109,"2","11","8",93,"1","1",13,"1","2",132,150,"1"
+"1",149,17,"2","8","4",43,"1","1",12,"0","2",10,118,"1"
+"1",67,16,"2","4","4",4,"0","0",1,"1","2",98,38,"0"
+"1",251,32,"2","14","4",58,"1","0",1,"0","2",153,125,"1"
+"1",213,55,"2","3","4",7,"0","0",1,"1","2",78,1,"0"
+"1",105,184,"2","13","4",14,"0","0",1,"0","2",1,3,"0"
+"1",146,54,"2","8","8",80,"1","0",1,"0","2",92,77,"1"
+"1",43,10,"2","11","4",4,"0","0",1,"0","2",69,1,"0"
+"1",40,13,"2","11","4",47,"1","1",3,"1","2",24,1,"1"
+"1",300,2,"2","14","8",2,"1","0",1,"0","2",1,200,"1"
+"1",337,123,"2","10","9",1,"0","0",1,"0","2",1,13,"0"
+"1",242,79,"2","7","4",73,"0","0",1,"1","1",131,1,"0"
+"0",10,9,"2","6","4",4,"0","0",1,"0","2",69,2,"0"
+"1",141,19,"2","9","4",8,"0","1",2,"0","2",111,198,"0"
+"1",188,95,"2","6","4",26,"1","0",1,"1","2",86,1,"0"
+"0",346,1,"2","8","4",1,"0","0",1,"0","3",65,1,"1"
+"1",253,8,"2","3","8",88,"1","0",1,"0","2",131,1,"1"
+"0",70,164,"1","11","4",65,"1","0",1,"1","2",94,1,"0"
+"1",89,55,"2","10","2",5,"0","1",3,"0","2",107,166,"0"
+"1",267,17,"2","3","8",85,"1","0",1,"1","2",107,1,"1"
+"0",76,173,"2","14","8",14,"1","1",3,"1","2",102,119,"1"
+"1",192,42,"2","3","5",4,"0","0",1,"1","2",141,210,"0"
+"1",77,169,"2","3","4",73,"1","1",10,"0","2",17,134,"1"
+"1",106,30,"2","4","4",10,"0","1",3,"1","2",60,4,"0"
+"0",18,139,"2","6","4",34,"1","0",1,"1","2",1,1,"1"
+"1",47,36,"2","11","4",37,"0","0",1,"0","2",34,8,"0"
+"1",165,199,"2","2","4",65,"1","1",10,"0","2",87,133,"1"
+"0",325,55,"2","1","1",99,"0","1",2,"0","2",1,11,"0"
+"1",332,47,"2","11","4",4,"1","0",1,"1","2",92,1,"1"
+"0",340,192,"2","1","1",1,"1","1",12,"1","2",1,171,"1"
+"1",148,34,"1","3","5",14,"1","1",11,"1","2",43,27,"0"
+"1",289,119,"2","11","8",96,"1","0",1,"1","2",114,1,"1"
+"0",83,169,"1","4","8",65,"0","0",1,"1","2",3,17,"0"
+"1",44,141,"2","8","4",41,"1","1",4,"1","2",11,27,"1"
+"1",285,130,"2","9","8",127,"1","1",13,"0","2",1,138,"1"
+"1",79,46,"2","11","4",55,"1","1",4,"1","2",1,122,"1"
+"1",172,192,"1","1","1",1,"0","1",3,"0","2",53,2,"0"
+"1",184,12,"1","13","4",3,"0","0",1,"0","2",63,1,"0"
+"1",83,16,"1","9","4",61,"1","1",7,"1","2",72,100,"0"
+"1",12,12,"1","4","4",8,"0","0",1,"0","1",69,1,"0"
+"0",51,136,"2","8","4",5,"0","0",1,"0","2",1,102,"0"
+"0",334,200,"2","11","4",117,"1","0",1,"1","2",4,229,"1"
+"1",163,102,"1","6","4",110,"1","0",1,"0","2",1,1,"0"
+"0",95,178,"2","13","8",73,"1","1",7,"0","2",131,112,"1"
+"1",58,148,"2","8","4",7,"1","0",1,"0","2",45,1,"0"
+"1",47,1,"2","8","4",1,"0","0",1,"0","3",65,1,"0"
+"0",98,187,"1","1","1",47,"0","1",2,"1","2",69,2,"0"
+"1",259,97,"2","11","4",82,"1","0",1,"0","1",1,1,"1"
+"1",114,52,"1","8","4",26,"1","1",6,"1","2",53,218,"1"
+"1",326,124,"2","7","4",124,"1","1",7,"1","2",117,167,"1"
+"1",316,169,"2","1","1",85,"1","1",6,"0","2",1,226,"1"
+"1",299,151,"2","3","5",1,"1","1",12,"0","2",1,163,"1"
+"1",237,17,"2","7","4",7,"1","0",1,"0","1",99,1,"0"
+"1",123,56,"1","6","4",47,"1","1",4,"1","2",120,120,"1"
+"1",149,119,"2","13","4",68,"1","1",9,"0","2",109,158,"1"
+"1",302,56,"1","1","1",37,"0","0",1,"0","2",38,2,"0"
+"0",95,164,"1","6","4",81,"1","0",1,"0","2",38,1,"0"
+"1",143,26,"1","8","4",2,"0","0",1,"0","2",86,1,"0"
+"0",185,13,"2","13","4",10,"0","0",1,"0","2",102,35,"1"
+"1",104,26,"2","8","5",7,"1","0",1,"0","2",113,24,"1"
+"0",135,34,"2","8","4",58,"1","1",6,"1","2",102,172,"1"
+"0",23,13,"3","13","1",115,"0","0",1,"1","1",102,1,"1"
+"1",47,123,"2","8","4",40,"1","1",4,"0","2",69,170,"1"
+"1",17,209,"3","1","7",1,"0","0",1,"1","3",143,240,"1"
+"1",23,122,"1","7","4",2,"0","0",1,"0","2",45,1,"0"
+"1",281,66,"2","7","5",58,"1","1",13,"1","2",133,195,"1"
+"1",324,136,"2","10","8",101,"1","1",4,"0","2",1,1,"1"
+"1",1,87,"1","9","4",43,"1","1",3,"1","2",38,150,"1"
+"1",36,144,"2","11","4",26,"1","0",1,"1","2",18,109,"1"
+"0",63,172,"2","4","8",3,"0","0",1,"0","2",107,6,"0"
+"1",18,151,"2","9","8",5,"0","1",2,"0","2",38,2,"0"
+"1",197,66,"1","9","4",65,"0","0",1,"0","1",69,1,"0"
+"1",138,82,"2","9","4",73,"1","1",4,"1","2",108,1,"0"
+"1",57,26,"1","12","7",20,"0","0",1,"1","2",43,3,"0"
+"1",24,156,"2","8","8",28,"0","0",1,"0","2",107,14,"0"
+"1",127,73,"2","9","4",20,"0","0",1,"1","2",102,42,"0"
+"0",272,160,"2","11","4",85,"1","1",8,"1","2",1,1,"1"
+"1",73,169,"2","3","4",11,"0","0",1,"0","2",1,1,"0"
+"0",163,82,"2","2","4",65,"1","0",1,"1","2",102,1,"0"
+"0",47,127,"2","4","4",37,"1","1",2,"0","2",53,83,"1"
+"0",55,63,"2","3","5",14,"1","1",5,"0","1",24,1,"1"
+"1",114,92,"2","13","4",79,"1","1",2,"1","2",38,1,"1"
+"0",121,50,"2","1","1",1,"0","0",1,"0","2",111,2,"0"
+"1",254,34,"2","9","4",85,"1","1",7,"1","2",152,228,"1"
+"0",25,151,"2","9","4",26,"0","1",2,"0","2",38,2,"0"
+"0",133,33,"2","11","4",26,"1","1",3,"1","2",56,115,"0"
+"0",58,115,"2","6","4",58,"1","1",4,"0","2",24,146,"1"
+"1",62,164,"2","14","4",8,"1","1",7,"0","2",39,1,"1"
+"1",63,17,"2","8","4",4,"0","0",1,"0","2",117,1,"0"
+"1",175,139,"1","9","8",90,"1","0",1,"1","2",51,1,"1"
+"0",271,33,"2","11","4",4,"0","0",1,"1","2",93,1,"0"
+"0",83,20,"1","1","1",4,"0","0",1,"0","2",38,46,"0"
+"0",21,18,"2","8","4",43,"0","1",2,"1","2",24,6,"0"
+"1",179,66,"1","9","4",101,"0","0",1,"1","2",98,1,"0"
+"0",72,12,"2","11","4",20,"0","0",1,"0","1",53,1,"0"
+"1",195,35,"2","7","4",29,"0","0",1,"0","1",7,1,"0"
+"1",143,44,"2","9","8",20,"1","1",2,"0","2",86,50,"1"
+"1",167,66,"2","6","4",104,"1","0",1,"1","2",157,1,"0"
+"0",49,54,"2","11","4",49,"1","1",6,"0","2",77,194,"1"
+"1",186,70,"2","3","5",1,"0","0",1,"0","2",63,1,"0"
+"1",65,28,"2","8","8",51,"0","0",1,"1","2",40,1,"1"
+"0",202,175,"2","11","8",124,"1","1",9,"0","2",1,221,"1"
+"1",168,94,"2","13","8",97,"1","1",4,"0","2",105,94,"1"
+"0",72,27,"2","3","4",3,"0","0",1,"0","2",47,1,"0"
+"1",114,194,"2","3","5",1,"0","0",1,"1","2",62,1,"0"
+"1",264,55,"2","7","4",43,"1","1",3,"1","2",1,16,"1"
+"1",149,36,"1","8","4",2,"0","0",1,"0","2",58,73,"0"
+"1",220,123,"2","11","8",118,"1","1",8,"1","2",102,6,"0"
+"1",13,4,"1","8","4",2,"0","0",1,"0","2",45,132,"0"
+"1",292,62,"2","1","1",8,"0","0",1,"0","2",69,4,"0"
+"1",75,7,"2","7","4",3,"0","0",1,"0","1",1,1,"0"
+"0",197,40,"1","3","8",14,"0","0",1,"1","2",53,1,"0"
+"1",266,164,"1","2","4",37,"1","0",1,"0","1",1,1,"0"
+"1",331,70,"2","9","4",43,"1","1",6,"1","2",18,39,"1"
+"1",101,13,"2","7","4",7,"1","1",4,"0","2",92,234,"1"
+"0",250,34,"2","6","4",18,"0","0",1,"0","2",86,84,"0"
+"1",294,204,"2","1","1",1,"1","1",2,"0","2",29,1,"0"
+"0",187,48,"1","1","1",1,"1","0",1,"0","2",107,1,"0"
+"1",35,87,"1","3","4",26,"0","0",1,"1","2",117,150,"0"
+"0",90,64,"1","8","4",1,"0","0",1,"0","2",38,1,"0"
+"1",80,46,"2","8","8",35,"1","0",1,"0","2",137,77,"1"
+"1",344,139,"2","1","1",76,"0","1",2,"0","2",20,7,"0"
+"1",281,11,"2","8","5",126,"1","1",20,"0","2",1,233,"1"
+"0",182,66,"1","8","4",58,"0","0",1,"1","2",1,3,"0"
+"1",263,73,"2","11","8",96,"1","1",12,"0","2",24,1,"1"
+"1",76,21,"1","6","4",4,"1","0",1,"0","2",63,2,"0"
+"1",196,66,"2","13","5",14,"0","0",1,"0","2",112,1,"0"
+"1",320,203,"2","2","5",126,"1","1",18,"1","2",1,1,"1"
+"0",265,121,"2","11","4",102,"1","1",4,"1","2",1,1,"1"
+"0",267,1,"2","8","4",58,"1","0",1,"0","2",1,1,"1"
+"1",191,52,"2","9","5",7,"1","0",1,"1","2",54,1,"0"
+"0",269,96,"2","4","8",26,"0","0",1,"1","1",86,1,"0"
+"1",154,69,"1","7","4",4,"0","0",1,"1","1",24,1,"0"
+"1",124,26,"2","11","8",5,"0","0",1,"1","2",77,88,"0"
+"1",37,126,"2","7","4",2,"0","1",2,"0","2",32,2,"0"
+"0",247,33,"2","3","5",52,"1","0",1,"1","2",1,94,"1"
+"0",82,46,"2","9","4",24,"0","0",1,"1","2",53,1,"0"
+"1",165,28,"2","14","4",33,"1","1",9,"1","2",103,203,"1"
+"0",20,164,"2","14","8",26,"1","1",12,"0","2",1,200,"1"
+"0",181,61,"1","13","4",73,"0","0",1,"1","2",69,63,"0"
+"0",307,123,"2","6","8",65,"0","0",1,"0","2",1,1,"0"
+"0",100,37,"2","11","4",32,"1","1",3,"0","2",69,1,"0"
+"1",242,46,"2","3","5",1,"0","0",1,"0","1",102,1,"0"
+"1",85,20,"1","13","8",4,"0","0",1,"0","2",86,2,"0"
+"1",317,180,"2","4","8",108,"1","0",1,"1","2",6,182,"1"
+"0",133,74,"1","14","8",59,"1","1",2,"0","2",20,1,"1"
+"1",170,87,"2","7","4",58,"1","0",1,"1","2",117,1,"1"
+"1",54,30,"1","8","8",7,"0","0",1,"0","2",98,78,"0"
+"1",190,140,"2","8","4",81,"1","1",13,"1","2",1,80,"1"
+"1",263,213,"2","11","8",6,"1","1",2,"0","2",166,47,"0"
+"0",222,115,"2","4","4",26,"1","0",1,"1","2",1,1,"1"
+"1",217,56,"2","9","4",2,"1","0",1,"1","2",131,219,"1"
+"1",145,56,"1","14","8",2,"0","0",1,"0","2",40,2,"0"
+"1",216,68,"2","13","4",5,"0","0",1,"1","2",1,116,"0"
+"1",161,205,"2","8","4",101,"1","1",17,"0","2",1,213,"1"
+"1",206,67,"2","1","1",1,"0","0",1,"0","2",1,150,"0"
+"1",48,64,"2","8","4",47,"1","1",12,"1","2",69,200,"1"
+"1",277,113,"2","9","4",93,"0","0",1,"0","2",147,66,"0"
+"1",64,131,"1","3","5",5,"0","0",1,"1","2",65,1,"0"
+"1",48,151,"1","8","4",58,"1","0",1,"0","1",11,1,"1"
+"1",198,66,"2","9","4",1,"1","0",1,"1","2",85,77,"1"
+"1",86,30,"2","7","4",3,"0","1",5,"0","2",90,179,"0"
+"1",216,119,"2","7","8",78,"1","1",13,"1","2",28,1,"1"
+"0",50,145,"2","3","4",2,"0","0",1,"0","2",69,150,"0"
+"1",213,78,"2","11","8",113,"1","0",1,"1","2",33,127,"1"
+"1",71,152,"2","14","4",2,"1","1",10,"0","2",18,108,"1"
+"0",117,187,"2","11","8",85,"1","1",3,"0","2",1,213,"1"
+"0",275,16,"2","13","8",11,"1","1",12,"1","2",141,7,"1"
+"0",304,1,"1","1","1",1,"0","0",1,"0","2",1,1,"0"
+"1",231,66,"1","3","8",115,"0","0",1,"1","1",69,1,"0"
+"1",231,49,"2","8","4",37,"1","1",11,"0","2",66,212,"1"
+"0",124,56,"2","9","4",7,"0","0",1,"1","2",63,36,"0"
+"1",35,144,"2","9","4",37,"1","0",1,"0","2",38,189,"1"
+"1",235,92,"2","8","5",99,"1","1",17,"0","2",37,160,"1"
+"1",210,26,"1","2","4",16,"0","0",1,"0","2",86,4,"0"
+"1",109,10,"2","3","5",1,"1","0",1,"1","2",1,1,"1"
+"1",234,51,"1","7","4",5,"0","0",1,"0","1",131,1,"0"
+"1",252,15,"2","8","8",6,"1","1",7,"0","2",77,147,"1"
+"1",56,127,"2","6","4",35,"1","1",2,"0","2",24,227,"1"
+"0",43,40,"2","8","4",1,"0","0",1,"0","2",1,1,"0"
+"0",123,73,"2","7","4",64,"0","1",2,"1","2",98,11,"0"
+"1",59,27,"1","13","4",18,"0","0",1,"0","2",53,1,"0"
+"1",130,105,"2","14","4",84,"1","1",3,"0","2",136,8,"1"
+"1",117,61,"2","3","5",14,"0","0",1,"1","2",162,206,"0"
+"0",27,141,"2","11","4",30,"1","1",5,"0","2",18,118,"1"
+"0",79,114,"2","11","4",70,"1","1",11,"0","2",38,85,"1"
+"1",83,53,"2","8","4",15,"0","0",1,"1","2",44,2,"0"
+"1",131,102,"2","9","4",117,"1","0",1,"1","2",20,1,"1"
+"1",103,60,"1","6","4",76,"1","0",1,"0","2",67,1,"0"
+"0",134,50,"2","11","4",56,"1","0",1,"1","2",141,1,"0"
+"1",75,77,"1","8","4",5,"0","0",1,"0","2",53,152,"0"
+"0",90,34,"1","1","1",14,"1","1",4,"0","2",63,65,"0"
+"1",92,18,"2","7","4",26,"0","0",1,"1","2",38,2,"0"
+"1",169,52,"1","10","8",2,"1","0",1,"1","2",127,1,"1"
+"1",284,85,"2","3","5",26,"0","0",1,"0","2",32,3,"0"
+"0",132,82,"2","9","4",23,"1","0",1,"0","1",98,1,"1"
+"1",100,18,"2","9","4",5,"0","1",2,"0","2",95,110,"0"
+"1",29,157,"2","1","1",11,"0","0",1,"0","2",24,1,"0"
+"1",60,38,"1","4","4",58,"1","1",2,"0","2",63,21,"0"
+"1",91,40,"2","8","4",7,"0","0",1,"0","2",35,1,"0"
+"1",10,89,"2","3","8",11,"0","0",1,"1","2",38,1,"0"
+"1",115,55,"2","2","4",20,"0","0",1,"1","2",24,1,"0"
+"1",285,82,"2","6","4",73,"1","0",1,"0","1",82,1,"1"
+"1",82,70,"2","1","1",81,"0","0",1,"0","2",53,25,"0"
+"0",245,56,"1","11","8",4,"1","1",21,"1","2",145,162,"1"
+"1",200,212,"2","14","8",40,"1","1",2,"1","2",154,115,"1"
+"1",84,175,"2","8","4",49,"0","0",1,"0","1",24,1,"0"
+"1",30,102,"2","11","4",10,"1","1",3,"0","2",1,31,"0"
+"1",151,17,"2","2","8",3,"0","0",1,"1","1",88,1,"0"
+"0",41,26,"2","8","4",22,"1","1",6,"1","2",45,6,"0"
+"1",109,29,"2","13","4",29,"0","0",1,"0","2",32,1,"0"
+"1",136,46,"1","8","4",37,"1","0",1,"1","2",1,81,"0"
+"1",135,142,"2","11","8",94,"1","1",7,"0","2",125,71,"1"
+"1",107,33,"2","11","4",43,"1","0",1,"1","2",98,1,"1"
+"1",82,77,"1","4","4",11,"0","1",2,"1","2",98,45,"0"
+"1",111,185,"2","10","2",1,"0","0",1,"1","2",45,156,"0"
+"1",311,122,"2","8","8",61,"1","1",12,"1","2",1,92,"1"
+"1",163,13,"2","2","4",24,"1","0",1,"1","1",168,1,"0"
+"0",98,180,"2","2","4",26,"0","0",1,"1","2",63,153,"0"
+"1",181,77,"1","14","4",69,"1","1",4,"1","2",124,1,"1"
+"1",235,49,"2","8","4",37,"0","0",1,"0","2",1,96,"0"
+"0",19,1,"1","5","1",1,"0","0",1,"0","2",25,1,"0"
+"1",85,22,"2","8","4",5,"0","0",1,"0","2",32,1,"1"
+"1",114,69,"1","13","4",90,"1","1",2,"0","2",74,1,"1"
+"0",70,14,"2","3","4",9,"0","0",1,"1","1",47,1,"0"
+"1",238,18,"1","6","4",14,"1","1",4,"0","2",69,150,"1"
+"0",108,138,"2","6","4",35,"1","0",1,"0","2",53,66,"1"
+"1",42,1,"2","2","4",14,"0","0",1,"0","2",47,1,"0"
+"1",163,87,"1","3","4",3,"0","0",1,"1","2",134,1,"0"
+"1",115,95,"1","8","5",58,"0","0",1,"0","2",69,160,"0"
+"0",94,40,"2","1","1",1,"1","0",1,"0","2",24,1,"0"
+"0",172,46,"2","8","4",7,"0","0",1,"1","2",122,58,"0"
+"1",19,94,"2","8","4",7,"0","0",1,"0","1",24,1,"0"
+"0",220,109,"2","11","4",4,"1","0",1,"1","2",81,1,"1"
+"1",72,67,"1","8","8",60,"1","0",1,"0","2",1,1,"1"
+"0",21,155,"2","1","1",1,"0","0",1,"0","2",1,36,"0"
+"1",89,179,"2","11","8",39,"1","0",1,"1","2",38,1,"1"
+"0",104,17,"2","8","8",24,"1","0",1,"1","2",151,1,"1"
+"1",109,180,"1","4","8",31,"0","0",1,"1","2",1,18,"0"
+"0",73,117,"2","6","4",5,"0","0",1,"0","2",77,150,"0"
+"1",76,26,"2","7","4",14,"1","0",1,"1","1",107,1,"0"
+"1",103,11,"1","8","4",37,"0","0",1,"1","2",53,1,"0"
+"1",289,8,"1","4","4",7,"1","1",12,"0","2",124,197,"1"
+"1",54,1,"2","8","4",14,"0","0",1,"1","1",1,1,"0"
+"1",201,200,"1","8","4",76,"1","0",1,"0","2",24,1,"0"
+"0",75,170,"2","13","4",2,"1","0",1,"0","2",24,169,"1"
+"0",285,42,"2","3","7",9,"0","0",1,"0","2",1,57,"0"
+"1",260,11,"1","13","8",43,"1","1",9,"0","2",32,105,"1"
+"1",163,160,"2","14","4",99,"1","0",1,"0","2",1,1,"1"
+"1",49,14,"2","8","4",4,"0","0",1,"0","2",1,35,"0"
+"0",32,145,"2","6","4",3,"1","0",1,"0","2",32,1,"1"
+"0",122,193,"2","14","8",67,"1","1",2,"0","2",38,12,"1"
+"1",245,2,"2","10","4",2,"0","1",2,"0","1",159,1,"1"
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 b/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3
new file mode 100644
index 0000000000000000000000000000000000000000..b217d55fa4c6faf615758774435b53d045448be1
GIT binary patch
literal 17678
zcmch9iF;G$mG?dO>gr0eBrh?bh)obI92_rzg^3el$sj-&J3z3V#b8^;LXOv1c1!{!
zO330ArzMF4UDDQVX;a$igy}HTb~=kPO=r@Xw!<u)eerzD%(s8}55Di5_dQ1zgy}Q$
zbe_3J?{b#kIqx~|a_^NM^7h+S(ookz<<#KBsli;s(B$-RLuPt#e0XueyKiu2aKCE5
zwRrCfi{}?>7hL`0r>7SeHk68wOw0^VPYjOb8pa1F22T%9=N1?I?PHTeqn*0H_!ere
zW|jW2k#w%1sI$0G-ak7rH8M0hHr$^kC6r#MnwTA*dPaML&Ee^hp~Z!={>=2`czQOI
z8J;G{%a>MZ?^PK;64$+Bx-JZIzl-brTvu@2r0b;_UFZ9CeLbh^<&3V4N#-Babs?+k
zORQITis@`GAJ=^&sq6SrT}K9VoloexFu{DrH}>kj!FKbkx5WK{A(nqc*ZC7%AJBCL
z`yFR~)0CzcX1VTV`Egw@vEO;AS+Br$3w>HH!um$H?i(q_>EV7v*GoM9IQy|Q%yNfx
zE#o(2{3n^uabIS?j8RRmVE-#P&I8;p`thKa8{jzR*`KA`<Jqa@j8j~*-N-ceIj(u}
zpYvoe?!c2;F8`RWBWYbroCkQE@kce?7~y)4t_wV__;KAI;PDxpk2uG_a9Z>8M|2(M
zxI}o|%i<UNW9(-64z9;^y)5HwU^>r_0`ucs8;mb@ig{tbgkNBPrZ`@b*8-1YO6Cva
z<z@Ulo+Ykj-pG88^E#D0#3hfs&IZ`eCeD8`f8r0*Bb>Ji_QT-$Ch^Vld@aa)<$0ZF
zd&@kIysST&hmY&{MFe-B?n^w3xb8Q}`iScO6!&imy;{?kc;C2vy)AQo4MWQnad{od
zeqrp={J6{?j^B0mPvV^C{o)AYM%b>*BjbL?;q@;2j1gn|4{$B{<Z&8o&){+8c|J!t
z-x1Eo5%#+v@#Xa>{W#Bu0<R}S^1^ts&n?TkdxHH*bIpDm_v!us=fA+~BhKTM^;VGi
z+{XIbxL&90ILFK2b&~JWbeX3GS=WrS#QTNJ=ZMT(_Cw~C?1y>Dql{bDz05<lA7TEo
ztRJ3_g>LOnMC|gpB7POvZgGE<KJNqh{aXJf&xbtcG0*EPFL{=HvA;5}<@}O7UE=%?
zu%92v`r!B*Og}QM?JkQv`(@N>x)EZ(c-|D)kBFSNJZ{<7^D>V)P6HBm&RcO^uw8@a
z$ujF1TiC9QpYv>Rej_~JWFE;rT#&plPLZF`cm<h{QQgmTevG5sXZ_+nwpr67ocCpp
zpV%>4HGfI+#rt4h_I3962*-7q;~@KEfyW{Hv?1pc&%6A?8h=^F#qk;te73jDwVX3c
z(wFhE{|29PMg@<H_s`;fdRWtC|B!W`KcwkRY+vT1v5)21PlNAsB5Xe|`SoZynIE$6
z$B$_GC7#Ddi|#K;Tx9=b{Ex&w>qS`aBc7KAuOG>CaeZ=r3cPPE@x0H=`62rz-)|H+
z5BYZOPw_rW<d12(>{|nA-Ix7E@><}1K<+p4j4%0H;=D>6mRVl-vhEDl6Py8_PjdbU
zU-Fn|oJ$-p!M!Q*<M<kEU(Q{L*AmM~ykwuqbDZP6e~6z2o{z<N$^6^K@#FX#o4L>H
zK<)?Pd~V3Rk#R=Y&UGHIoPPzmpXPNf_j}?`QI7K@^Fj9ch~$@XjCSo;UiS02?i;cX
z%6`J@Xn^Nkp4WGAzVJHvNce2WILPBpbIs>y{t?|5zn54(&ikp%qX_HenQnAwy|~1k
z$0_@2ocE6i=Xrqlt+?Rx{E+ixiTySBoGq|_%Phad_T_$RfcJe_r*Xdjii@8~?Vs!q
z1)fh8oX5EAgIlzm<R#DZaf#*R`9Sg+XTPq?eB$^SoS!)N^X!kYL*wTUupN%q5jn?X
zKC``iTFX_izs32_`^rt(|HTf+N7hkMzLxE#biK@WB+teBoKBWw|71Qj@%Zw5-!UNm
za-OcUe@na{6?lJ;JS~g=JP(!!wVepZ*Wmap^ZJr~SK?%B*YZNYB=d*;Fg|KC)YY!#
z<eZXtFG)VwZeGrn-CADm9}K}grs)zN@gvXkT;dRy{h8;V+#fH=I5?gLpBLhn%;S%4
zI|t-GdO*%c@q_)BxGl>%JE;APa~x&gj?2Cw^MP^1?>yU;{N&|a<NYx%`C>hZpWIhm
zmwC$f;c*`S0P78~Kl1!hJZFz{et3RM9t%9)soU03p2uyly-Uo$&h;|exyk)F?<0oX
zm$qwra-WlD`%4@@qgnIie!Yq3vmxW+eL?oWo6MK|%Kjz$UxEFRaTGYN5vkMKo*{kq
zqagX>d>-L>XdGdArWZIK#eG-u#_L-4Ke<1M^Lf%Fc<jI2pGjSeS3=tx;B_45{aVh$
zIMeeo-+3L$eXFedJkR?$uLJSJ;O9d*|73p}U^%(JD@b4Ftvu(+Il}Y0z~joxdXwkE
zJsh_#UCa3)_wn+6>Za(+ypVH1^3lyW59xY2t?T?gT}xc#TwWHxc|Vi+R^a=IDUN?}
zU*Ypw=AXpf*va^8zqoFXYI=e5V94_=pKpRI<I8ir^0E&wZgIVHd}ZH}^Gx<*8K1mw
zF&<(29A`O~Wd27uZUx4b_{MpCihsrX^|;2%%Xu*oUtF-&`&+C}%{((TOmB3}%<je4
z=3iYrwpiO?Ex?0|U0q#={vTed=rCp$(;d+HZS$4I1=zbdgK`U=^z6vk%*X`3PPTJn
zBSVDK0Uc%s5KM<jWwtCEtiAkfz);uM&o*CsrF!n<rt{62+Dvk$|6=WAb@jI9Hyi9j
z@3n`Q>aNxP_{Q(Nl!Atus^B}#X3Gc19=Dsj8s^%Ee<gEw%dyJ0-PaB@{dryO>6?B2
zV^Dvr`$}dL{MPrLtiJcL_cK=-c9@se_j-T38aA|C{LS{y&h>8eSzp+6bw0Dc;V0c|
zuLOEiH!8oMTz{haQf2B&?IS;}{oqdy_PzuSXM=O#Uf=XY`}V-2DR=v;o-a1M9{$e7
z&OY~}r)$GE&U~}Iw{i8Z)DIqk>UW>_`P;8FZ@xbT*_Uf8D=Np|+dKHcc-e-zvv&>c
ziKZ^kKa^_N^B3-S+jHUfK4^bo{Ckz|_gfk-{>BHjS9aC(g^%uE`qtjo?(2>1$Hz9=
z_SLL^efyv8-&)xm-gy;j($62>UzJKVgmxw}Uu~OuaeDPqxa%_wqh4QC=GOw>Id%0@
z<0n4bIJc)EGtpb#yna*W#Hv)om!Ed89nFv2fAB`{Tw6!w;T@^*OJ%E@);D}mwm<a;
z)$98IXv3=1T<+nsF!w>rkzC+M*J@t_bH4W1x8xeW5{K@-)LWY#$o!uBm&ez9?cJ&d
zySCEz!&GyBPq_C>jXw<E7_Y2|w0w7$FJPX1*n3w$c&@foerwyc4HvikeoJFv^W3%9
zUu-D&EBx~xX1=`$-ydu~ckN2$?z!Dp!-4STz}Y{3wtc?$(l1?UKNfx`wRG*+^KE@Q
zHecD4yIOg1Q*~9l|DOBWckN&I-pRGU+p^`X>mjH1>i)pyA2;>J8!nc$r&hK9;WN-Y
zkPJV!sbSsKJ!fkgs&5?X-LUYp-0t1`ezG_I)P*bU8|L2V`t<iJpnd!9OMPMA&hqDL
zQ)@@|ygnVvJh-ECb5(Qqv8vBUYaeSlwtx4Zy_cyBuW$I|rq1s@8@SQA9ablURiA*~
z{@R9<<9k2%tCcrq!(|@$@EhC8mV$4*Z*H|W;$(ESCr%yh-kjZSW~*HfniaNt)d^Qq
z<GL=G-f`AUOg?$mD&JUP-esGOCxWK6tG6-bPHt>=YJxSRW)SWJv##bsDrJY)RYI9N
z2sMvc;FzvfyT&o=wkE882iAtIiH)vsfAwM0TirU>-3TWtJ%>H8t|kl?xL{~Y%bEl5
zf&*u)xglr`+i7r{d#!M}ZzAki_nWT9@Ye2$L)|cZ#tCmYv8K7(<$&1}1grHSFVux$
zsO`Ru9@`0)chodPZB4i5k;6exV|8#C&c*EJ+)Ge>6l(Htbfi3JhhOYWIcDGby5<^J
zaJ*`h<5<qYXxQ<rvBQn)!d6>dE6fB__g;7oU~7A`dAbICuDer>{pGIQy@|%yM(}y6
z;MrLCcyrbHpwnvYbsajq&GCTM2noO4SY|fYG>>&#A*b6_-MYr=-S|wcD_jvYC!ou;
zpsYW4w*y-u-dZR41RM@mKn+wwSDoVuZ}e5Ts@5H7oC*TKfq@fs0Wj-poQ+N};)K_?
z)mVq1E_9;K)#?O~xEf7JxjbQbG+}oKT`Aw@n#7O`f&+<rpKk$ckH?;!NIB4WEPSpT
zoW^}MM@^s8nyUq~b+|5U*@;y3y6Wz-L{0Ev=njWlJ+8-HuDaps_0`$!u=CuY1$E}r
zrpF4_)HziEVYuh;bFBle;B%Q)C+xD#=Fp~aDiHzK*aNN3QFquh?O|v<VYh<SWj<wg
zmqD(^sTryO+kFqjtBwaZ>@S}^yxR_j&F~t#tko&2P91gCZueQM*Vmk^D}z0uaIyyM
z$GyROf>tXyj_X0#8ro<FYn&iXo4O&Fwb#VB_{(v6fy;Jq3V>yS2WfzF$Ta;f<YI&|
zp+rF*KshljKPuXQEbs!j11RnFBi&!=K@7Zns#NVVvSHaMP8T*z)8p}>5s%MAFmFK7
zUFB{>0wke0>6sN+P{XVJL3<{+sgItl!5S}3sB+sT!UEi7lKa@Q@DEE5c*r)G9+wHL
zP3?dOaUH9I1kg#%CmGXHZERrA!GROeLjsmVh2^%&h=eYaq)EedQPU4)YMdJ0Pr$g=
zz;dfdA&>>8&omtc>{GHP1d;Bc(YX9j?m|^G1!dZ8l(*DKE65f6AUs)Aoq)UA4FNKa
zm$F54Y+2yLt`Ge|H(dc2<rFWa>vr6Ja8Z2GHS`*Yjes;tv}<By9n+%piy3syt6bN?
zcjIhO%xWAiRFR&Nq@<G*$P6ffO5$UxFsVKZo9Ig=_A!DO5wIzO6gIqwLf0x{pgG)r
zNSAopF~yZaq0#_MGZITfRiw(lC-T^0J&eM5xc=V0*!{5(B-=Wxp4^!{l>@69J_k`~
zs}0t{6L7r#@Rqp+=&MLJ^hCdrhI5fUO9076xHNefr6BZ$bRrF>BK4aACbyHxx6_yV
z0<n64bPQI*d^$84i9^pCh|fc3Bo^&Ugto<|d!BruKHUbW6iu8>)PE@npH4<Y@4N|_
zeUOf{q@z!VE^LW@1@0PM3mx^5*kn4ANyhGfAkmgg#A079kG0Ig{`$m)ZRgsmqHPdO
zrQZSA@a7lOxH;5EB75LCB;k>J06jX|GYWC?HR_3c;>ak3B5h~N%i7*aMM9anp3vKm
zCS%d#kqep8ybT*-^U1`!(3U*%b`0(tj>3zbkbJc-f&q$LYmTl8MZBTWcV<?_(C=7w
zdna!D$!G*kg*F0YX1{vslWnn4coB2g|4rzLLGnOkv?+NNUN{#EZLW`1HNoI>5CT}c
zA@OD?3hDm7WM}M|fi_5DC@)S%E<_TqCyzf2>1bceY;q!cZ)8WV?<HUQGhNV^M7;~i
z$)3c8lcS@V*DD`{&AreEb<?!AVr>VLnEwm)i6q2k0fx?uN25KNr!tx3r|MsZXw_&m
zIaW8CK`&#W=;WuPyF*)Ncb<rb(lMv!^PO$AFw*!Cv}Wp29jf|z){o9cU#rISwoPG5
zclCriBM>`{15IXn+|{#b_!?ZOMNaaqL|OOa>yn$Q;B`p0w?%7u;6i5zb3gy(s_1>y
zdtZ;WMW2tP`$KK_L_^17Z+tcckJd-vd^%kZUvE1HcSjC2CS#4SBty@Jj$%}=K<8-e
z9E3370Fmeyzl28i!HzStTQ0%Fm*E3AXaB(!qz%J;|Filx+Sl#q+l7O=`fb?umuk^g
z{U+@Ao?~vv^?VDL(n<6R>f#q*J$Syb1=YU$51lyYaFyVC-Ekrd-Y039K`v+h?K0MI
z4BmqqTmER_SAGQWAv|&Yp+)l<CI8x=!cEk>-1KA2{r@G>KbsLZt<e3@34aW)jzEq!
zZaWOn0gu5+Y(ou>0pN@O%liQQGW>5d2Z!!zhFm|m?*(|_^o<|EpVyHuH{QoRvtq~K
zBntnxul{TZHkg;O`P3fxR67Bz#?R;X9{fK5(*X02!}sAk<(;&7tO7i&!9HIL_Rbuf
zCD|nI2cxU;gj8$su8EW6$zhz3!$`19_oRtJtInK+@m%4j*!*pPP2hZT4V-VbU^kuv
z06)RK<nQ6vR{j3x=+OEd^Xn+~QyS0P3x5@a&w6k(ehzy8z5ht%U;I<L1@c{RaomA7
zz~y@IMY#NQ^gLMT;)i(8@OvcmHJJO8S6twExeUI!7An_%;#m_;LFT^#PA?Pw`rZHZ
z6*%(+bnTwdFTpoHf*-);aXbgW3T{K?M^NU!n7KCzr~zXRJK%Q#{tjKLNV#94U2p6E
zI1X?bI-UQH2LOrT^{9Cg{_w*#Y?;3f&~>B{{>Gg7+2s%00XlybE<Zo^Gn@l|w&k~$
z5a6x<jMKpL)2F*h`O^v7RwpRB0C#^m2OBY_M-M>#JALpO7$E9p1X+SsY~iG;!espr
z;M)Boa1VxVF<Z4@&TcP##5;GVM?70S;$`oq2RwRmtJM!~hRL;?YcJP+uK5AaP<BR2
z9woD@@9-dLzr3hT$V1}7UAO%DXmEOZ@ELrpT-V<}^~{---ye1DOAx1haA<V+6g{=x
z@;fK3)i5+TGkkgye~~mehQCSLThe=L@z8hsx&vMGV7hR3=})B8V{ooPSBnc3C0hIV
z37LOj<rDZ}^e<G3F*0##_{?ywAw8JGA7}acCx++xQ$#~Yv-=ioX{(=Uox-1PEiSAo
z`5D#tEIv!CA>gmG`lpA7CMR+;)3ZY}q+YS$;LpEk?DULnyu7#<(u?|G9Ur(cnwZ^(
zOQXK^)~L7P<NIqT@K;{YgwOBW7l*OG;F_SHk#)_!Lg8>Gr%w$}V^rs7)0FscOU4ZY
zCzOGIbWsP}bZH86iKFC*E=`h9eWfEJqOHWVtWpqIwnUm@P)n5n5QgqbLyIU3rAd9V
zttB-QsnEX_x(2~gaY$C<iabdYNs*O7&1XG9BMzx(IW3|)T0(0oDAq)Z7H4miHk(lf
zNkv1jq$02<8dU_amVek%ZYWK5T<H));UPyMk#*&e(k)d|bYd$VjjXg3P4zWrMFfe0
z#@c9`HAO+0rH-~r@ESl7ls@tllSOo;q1hW%6+x5{E|ek_ptrh_lC3U#+hU-1S-8!~
z9&{cTQaZb#!_(ot?akHh>;o$sMmDZ@fb8Dv{_H_q#MwjH&g{dq7_&!9mtrq1wCv;A
zCuosnPw4fP9k^{>DPM7jG-!3H;Yw4eDoPTkB9pXOu~ZxREU#cyFetDZOzIWG1VQ?$
zRDgm;mUNr6=%bODuPIuQ^pt>xQW7ktRmq+f;Qk7@B8Y~_5SuRT8@r*qN`wj?nv&9X
zSzMS{%SEz9McpD66A6n3KrV_+)n!tVT!KiFBwNI|rLA~^t|YafwxAW2j5e)Glpv#;
zsT*2^RYgmL$T@|_?vXHG<QB0M2WeW2dPM_TQVEEZ9)XCEr2ZpOL0HV5y2Gk5@Qv@S
z?|c94tH#15q*u)o3$OzZsGaJ7`j^*^9>LEoizEBE)NL%?HPjv2ZLFd$(@WW<Z6$Xb
zfa2exhau&@5-oxk8%1&{HYMU?*DWO7QKitO{+QzJgj)nlJJMb;FO*V<qQ|AgI|-KZ
z)qw7(iBekG)O8WbDBeJqkV-KnvO-&NTPTEGRQ`BOKgL=i6cLz2RSA9xl_W(EYiR=B
z7T;;@ly&(3J=ae5Qud9Xf97xm$S>`iI4Ntql&pqWy3f$)@Dd%}6H*DC!>sjJ_)1l8
zThuH`20}8Z_CxiBLZS+!q)-GT<DXJ3va}=-sa2w<HME@Ku?j1c$leNg3(75+VnGu%
z4ufi?Qj!u9g6b$4O~H~pW@X(JnH9uhO9T}`>**|M9mS>qPy^q|VO7E?q7Ze(=n9rn
zW9=eeVXok)uBK^mQfEayj-momOQp4Wq%e_v5M4#rDjMO=>&EJE{le=exN_#ux><l`
zJa=OF*@d4L{Orch{rJ7;c6?$ow{NEck0x{bPAZyz{sCn<7QTHqeP9KAPQbA|w&k;J
zd{yqZ9J~UK$8FdiC*ZdOj^lG2uVa~(<MY_2$8F-P3}3)!8h*p$alAgn^7wK8LqXGH
zST>NH+x97PAYdck!S}sxKbmqJ!|{83_~OR)8rU}6_@du10zR+T2VP~z?els94!*y|
zEjD1=ZpU<d`0(!cdGPJ7$M1MNF3Uz^j?ZV{7YJo&&1E@m3(Xile5XuC-1utU>-YH(
z$OxE*<@Q=8zU{RQ%e3t>v=45#gC61#9Y6YpU^pnZ-(z~+mL2dqhJo${@ZAB1!jCUJ
ze14R4ye=O`%x^mm4%o5+7W#+~9(2)gY(RsK&+GF#0h>mRLXOwtc1Xl<+a3eIpmbX<
z3?P|yc^seNHV_=O0)Qy!yzR9u!-XNS1BM^JGr+O9alk$tyq5&5fa%9^2fPSJ>GPTa
z{1(CXf#EYuOoYd0_&t~>+mC_vIc~#r+kT7^cKzss+w1XJ9*?IC1@UE#NeT2}hEWYa
zZjbH5So<*$m~sPOBzfGnVPJ3!jF0UHpX0IIKFogr=Y`MoTLH`-0tOJ<@t8Pg+%z%#
zsDcv%5XQ!D0Rkot5TOy#i(kVS^xcMyv2c8VPWgP67vmeiZ-Oup_)U#6M$TK9dJLl9
z^amVF$(<LPtHb_<7n<kRg_bR3-_QOa`$6`HKliAzEL;(8%dIc$$%Qoh<_0edrwCox
z)g{p_7nb<7mJ9h7<{t@;mN1Cyb|I0tN{0%}n&P7%QAtbnO?=B}V4IBKJ1;ke3MG`9
z%T^|+TXca$5Q2oUKxLOgK`fJ8RltO;WJt_qxp0vX-Eg}Q2GNyqQn!^Z7Nn>y)j)fw
zj^A6Mw97C^3(b-^36cm-5d4r+h(Z<BsvtAeMAv|XnU2F^BP5O9A{`eIT?j=KR>L-_
z&<76|iW!<AAp9VaTy?0Ti^$kfEgU21C?v{BY1WiUg@}BLrL7V}khVrD39uRsQV~SO
zkbsmBMZj&S&>$A(kl9#~DE-ktlp<4#K+}S{I!Bs;1VTg^!o!l>ATj(<#1x$>i{g$J
zby|s_oON4wUO{$;`xjn8^rqrp_fkP0`f_a3sj3gyl|xvcv~>I;jZRv+ZaH?Tr#?~^
zrL-*aR@A9Qm&Q;mCC)U$p??~Jj44@N71gLgJT0xHNUliIo|24$lR}{=DP1Top-@BX
z=ni+aJwnz($|7SUP5mRYY?q~op*~nJu&#@i$PfJy3}s!zkQ8ZBrSz1nbf{Kz(jX4k
zN`=%kSL^5wt0)OlQKm^i8(|Hq6oCzsB*7^Q#Nw`MQb!yhdXdQJq)0?n>q6Qjs{XJL
znvzgBD@5XvQAXh=_U^oD9DEt~i@sF&*RLAw7WuC(<+Zq8q)`BPs5}@oA{tQ1$h58s
zg&HF<@zR6mih$_SWW`n>B&T@9&|Iph!(<Xrx|*iQib)_!MFA;jDp>xO217E8l0qCZ
zS9DA3F(6AQwo)Q7YAAcuCTHkRQxq2Qi6SH=ru)h*_KXcFDHb7A)hx;>igHGA7*;_m
zS?#CtLfO<(Vw)97ix~K!iYy9A)yU8^&0GQ2vYJbpN}VA}#wh9vnV|@91(g+vq!|iI
z^^|YgCxj$aGN3;!i<N2-2d_J?8ed1*f8eS?H^@~P5Vwg+bRsJO79lEmuEwF5B%)}l
zL3GtG*4!gBt*UW}uPDlf(orf(PNRq`OjlNvF{P^PlvpNoCCSQUmuRXchBkzy&`6oP
zT9p_KqJ>yTR2fZINEColNCIG!MAp3`Y_XwKlo1kI=_wCTs))ii2vobLAT^&fQ9+4o
z6;e|?g{Z)a-e_U6MG{o#3u{GDFig}q#1SFgP;@O%3}s9;*r*~AU$Mx95@9)RvnEv~
z5XwDmk<{66u~~$oFNW^)5-!+LUb<>#7U|oYfB74pa=95>Fcb7rAVDt$67*6afo~AZ
z1inHr6Zj6nOyEldGl6dr%mlthuoCnh0ZY`u5|yw-Ei6$D%h;`N6!zcx{j_|yJUuvZ
zn!a2<Ms>*-%J`}re}g?UI6Z^k4llTJGm}$`__Cf_!&CU9Z1o-Qu==9c)sJ6j<9{AP
z-$2WMB+@@VJTs`il0JdI<sO`w!T%!U{NjSUKQlIXS~b@Uj*U&u<@!&}PT?=T@jLbY
zvBC84SZ;AXH=D+Y&A;~A{QNwA=-+Ge!$^AaL5%&Ad#HGbiZ&|ZR3xZqr(!P^`>04#
Vv7d?qR2-xtMa3a3hF@K5`2Vhn;nV;C

literal 0
HcmV?d00001

diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq b/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq
new file mode 100644
index 0000000000000000000000000000000000000000..fe69f0e09f0ae755e168d8f9ea690b4396f21fd7
GIT binary patch
literal 20170
zcmc(n31CxI+V}6tO+u13Bx##ArG=y^O#>xu(!DH_q)iuS+NLz!*t%~tZRy^!G-cnZ
zD2f{vkVOPVR0Kq=Ac!amu8cTNXU1W4MxFZV`1-x$`@P@)+!Ur@8OIObndtA{d(J(}
zbDnd~bDnc<HOG=`Qu394N>wIqd>@99Oe74$80p2NWR&j^<rOkT`Kbi2g-T&?JfkNj
z6*1y<?NAMi+ouvj!X5v8wsGd8Kg{p$NqYHrUw*kK_rv#mi29|Xuc`}<KJuS$m%ja*
zM?d*aP88ie&#ifR{;w~|iEqu}Wv+=&`JO1ykf;wDzRdpHQ$PIo?!=RS?P$vm|M+h`
z<vnk;$%*Wfmk-bRYZB32;sq<c1LOFNFi^#_n3O#KPbK~@@%+#8bklXBzl#+(g|<V<
zV<0=dG4wVVp*-N9sXQg%S~#R+7(ztm95WdsVwha9h(Az@3zH=kF{wZfjELxO#l^i4
zmm@8>3|8U7G}5?qTn1{yBBoE8$RAiNmh;_a`qGAr*@26u1Q)jjmw^T(GfNBkOtM(c
zFw!Q*Elp(lrE-2rxmd$D&!GwDqsBC(lS_~AjZG+Spbaqv^py+I{fYw$ku--HkT&rL
z=85%;Mr!6WPNZWpXqtRn7!6HmC_!Cret)G{#dps_45Jug7NFum={i2M2oI=&ksH#P
z5=g*I7i*XjgP!R(=@~n+Fcne}MAbA)<%o4Fb{IM#ayq}HPORs<i%^ZG4270RkMqri
zwE9|zIRh~RG$s$lmq-T~hDu?`;SW@!c(;8WW47l&<SWdeqK}G^C^a&(k+B{b2Ni=%
zB=QYNRgfI=N(=E^!!#Kf+;=cd(mu#>9#8#vYj$ZEvq^<CM#DO^txzgwG=>~Bst~mf
zqB0dT;85AkP;w#KjJ%FPLpq-+7GGgBiX0{dRWU9z-`IwnI@I5U`z5J}&-jT|OuuU#
zGiWH`GYkSodmo>PM|!h#9dm^Co2!t?LEmYh5GF}%<TG&)X&N-viHaErQV29i)5Hwg
z4?qPq%z(WH^9bFA7>4hjD=o1@ZWl_Pf+|X&-T`|eV^$nsW@0cflTlzWq%uQU%w`qi
z-uyihT!CO3zNSHJM6-^ABPe?Sa?o!5j6dMS3t9yOEtQ~R_u)e3613vBmmon2(`PWF
zUviis1L{V}Irag>>-mgc+{9;8sP7Ud>h}=82F34S+$NPGT_I;Cicu<hLPG_m?+zbe
zn8OU8(b6h0iUuM1RJ5dDF(~~WEk6SN(Q#VBFn8x{*6`igC}Gf4C`CS3Cu+qQ1)9Zr
zCR3cq7!^ioI>TH<4Hy${g@zHK;66hqpGkn``yoodOT-M}rADep15mDX5Gpx<p?(DQ
zT;jW};vBRZ0lS><UI1AJp_dW^l!tpC`U2gHqWctbsT++bM6Pw*Kso}I3_$7us7ZsG
zsQMTK!q9CAYN35Pq>w8TzrG~B#EoI*a38v+57p5wLcc+o{U~J+gSQ_IDY5r6=FOiW
zUWJl}Ff1=|%{6b<U<7>+i4yrt^rS*Wu48gcdI)Alzqz3<Ik-gChi1B6S0I2KMI1*q
zBhznhVg_AT_>4*1i58{Px}lVrJ|$v2B0Z{Q6nY3XfWGQ?4MJfk!L);^5o0Ezd2XqO
zOJznAk6^f(4^so6;+IsTd)&y>#0?*)9-SU5)<YL&h=TzktwBF~be5wyZ?9pD5R5_F
zso5~w2OxkP#Sby=asAXV&_<$<(e=z-;y#RqBaHhZ)#(uQJcN3h_)Lu0h$2*acY&Fq
zKXa2A*7~To@|5?8l}07vUJkaIho8`5e?~t_EYdw7{8q&duSVh?h9^>)``x{)$VXo|
z(BHdq3{R8NXY92KyaJ3BmyYx%kCCYp-J*~({(2u>R)6}gGQmdw$OQxIOqO%r{MfoH
z$9t1)W5-WvpEr2<%xf}@WtDeG?yI7;q3PT7MW-KIe@}SM?(_$&dpB&4C_Gy8^^w)H
z113eMw0`{`0^ts$=+#Xj&sdWmP*<rQw!9m<`N6pQj}jMuBg_g;YWeKFr(0h0+Bv=Z
zyP?m2zkk*GneHL}ZcR-NW37+`PK^~P)_Sk^YwMMlDE6donc#Hig=~8)<KFnZqpjjO
zI{QP3CdK8~-wn^%l~Swv!>ac-<XV1r+}JjYf6)4P-u%L|lTYhOV)2P?L4-fcUtIBr
zyN$K}hl`JG`}p+UN%ucF|5W$?Df>EN`&ijZcd0}-#eJak)p%R4PrdKzvhGder>zzi
zMOWtBXFBsv{f=j<XY{|aPUjl;T-}q0=0CH-z!QcEj$XX%Z_eH1^8Tvn&)2>TO7wRz
zt9%o-1V7*Gc=M6aoJ{4()<2~Ve*N^8U#|}%JD!aYTpYix<H_@k`;W8kmreY7==&X^
zjT5GK|MlE)M`hIfZA+~4ep~+h**SMrC_nh@n>7`uU-JGz@3(mSu7#$6|2y>u-<A87
z18&}!CpTJ-^uMyQG4uF}UipKHg<|EoaT7ORv96IS5`XyiT1ZNi;<b#2YrYtJc0<*w
z6N#0J)7ETMnaHPojy-EP2T$w%K@>f{x9_ZEOUC_gY)&uGc6O>nWfNcDrptcIU1;ZD
z$ZI>W?wh{chc9e5P5B@DyPxgfJ3Hdkg8LT#;kOskcbF8ffBK5+qcabD?sM5Y$G6g7
zcsFxplYhyl_H~If{_w>g*FNypBmSM=JvOjnTlRL+Q}$niSbb{Uk#X&%|GDXM>aKA-
zY23=b1DR{YyMEx$c`seMCrzrgmmleUYFxy*N74_re*WB|`Ff-4;M<?iah<C&E4Sr2
zo^Ex7x&HLp>Gdy<$z3}+a41!^p>NYZox}3nLFcl=2j*3-ihJUT-f=tXY>CllwjEZ?
z+cj966Y&<aKJtUh1;TqK_KAc3>h-JlmF)Dt?4$k<e!4fqgO!pS-cz*4dwMp>f6YW)
z-#FblHuz^uwoyH-<5J~mX{Pc?mh-Sa7JfUIJ%EarfLQQ2SOfe(2Uq|?z%oz;U~m{4
zXax(wM$iZ*gKZ!Z2*51x6tIGEpdUnlH^H;uWv~il0P4Yp0r<4cSnx2Q`bYq;f?QAx
zvOpzx8Uz9b*bams4Zv10AAo9b3CsiD09KQ6g8RXX-~^}xlRy!81w?~BFdh^DF?b0~
z0H?r%Kn)~-TCbhpF|ZC?1VKOwUI6pK9Pk!+2c(0!zy#p^Fl&JbJPJ00=Ybt`f+s)}
z@CWhW4A>8PfEFZy8n7D3feR#qQZNm80bfuKW&j<y58MSl1TCN!oCFS#3J!y*;0VYA
z_269)4q`wV$OI~20eitr&;(Y1X7B)51fByGU@K?<GN1&zK|64RS}+@20KwpGkPULc
z>tH=N1}1_%zzpsI=fIO70PF*2K|a_8G~hUR4Ge(u;3&8kECq?+GUx_}Kq#03P6Iv|
z1Kt2#pad)jo51@Z1uO<D!QEg7hy&BXBj7!-1#AEhfoDJ)I0&#@uJTfD@fxvQVq?aq
zU7_L=T=6?C`1TbZiwA7bL_G}Qv3!AWj7ZFrcuBmaV`W})fk46Y@%2;s2dDzav0g#J
z<0q&!z9HJsiM%jfxPFo!Z?dm2AYzKdpwmZ28Cg?w%s8*uxcCIUCQ+X>RU*<%n?6Hu
z*G#iys;?!?nw(-w6=>3Qvh;uqValx8nRZcDwj|j%C&@82H$@#BImbIWPhgEyPxW%<
zdxaMircBWIl2D<>m>gXcP@FqgZ48(<e}V5pRf()L-Z(ebOBGyJ9*|N|S*5OaY6La4
zvl9ZQ^6D0;HT5p9$cD&9|E9&wjV)<H(%RPEq3+D?Qv0@d8*2iVsQkkNmNrh0G@5l4
zvSs03Q>#MaRm*z<-A%noD=Gq3t`hnAWz{Dn2dwT{6UkGp&C-Odt6krtRjYkBtlcQt
zw0X<czREz=<hy;h*=B1c_vDt`o3*_$FQY=(?kl-3<$hkWNRqch?fbxkM#r{?9$r)!
zoFd%(NaT8py3yKF8J+AK-qNGqDR^{jf7%YUaLF#B5r*q_FUj4*dOfBI%l6v4Z;9{z
z1Dd3G_2Z&u$wA{2jfW1`Xnp;>B4<CT=8b(ym2#xwNmX!TWW&?CqgAtW!^kdM>sZbF
zEM25q!KTUfJ)^65wob?{-0J&mLPJ9KbCP4vYvwHG9cQOTo)DhMQst`Za>>aT0<ujh
z)>Bgh^s0&KKvrM7Io?t?RkS24QYI2;0}@_*DWIz7QU5WHmv#HdK-Vi?Sv|RRLD4A(
zd}S>G>Ky^6&uDnBdaKTgd>_qfCn3IqU}Z|$x!0DQIB`Cw>A>q3rUs-5-<bU7oCZxm
z@d<0ZYWCXuYrWKthuatSU6j0KDXr7;lJwfQJG^G<^gSmo`AZ^I$-b9GrPE)?4K6L#
zRC<SM-_fLu_3|Cl*!1puk-_h)TIOq>`#{kW82W}ej2p#_?p15Bl6Pb6it(&n`vUz>
z2I{_w*KL?Y_x>YC{;2EI1f@0;=NI&VLzYM<h>2dJ=_Lk}>06%i2j0z-iNvUg39nC*
zp%cTih^G`d^YUyCfzr?ENbxh<oKBm==|g0Bjy!==VRPm=`FYA5IT5EgEVdLsr9eXj
zc^1CI2RDJmMsE%$-j6{cPFtQo@yc`BY))mKpVBdo`1(1WN@X6ZNOAh5I0cTpJRMQw
zIh_in*`YwuI-4!eDJN`7o(~C6u_$vCPMg!AaM+wVPJuH;>9on6d5FnVC?Kd`ibKih
z2GT~Jo5Xsw_b1l>J65+YMRy^YJ2~N+I;!68BT6^PR0eaJn=SWfP5{LTm>3WX;y^q|
z0EtAvPvQ{+V-yG(lR#*U=22b8aMxHU`b~lI1HmndPVy-F8t>amGOTHSPfK#!Zqwlp
z6AJe@O=nv=K3pqY{o%Z2`#ZWqTQ-E%tk8ZmdBENIc*{!fkdDcsIg2+`?oBv<X?{yf
zZrR7(9Zj3M{kAtXm2|L|9(?@nj;`_tuLOkqTynNIf5aS03_ia$eNJ=7!JM=u8)B<t
z68gjUbW}%gJG5chC0=<_bBG|+xv{&neQ!bMYq`z)^<|s3#lPRhh(GjhTDDYip|d07
zplM#mdE>jGKC9P;k12TN;Qq(E=Fe|gyL5fKqqAgnzaf5=uAqC>{*r-!gCF*;?anP?
zTNB<NSiGmBecgo(T?Y~jOAkJhUl{)2JK8zp`ZZJ6N;b6=FCEh@EIXU?TF26o`%DK`
zu3K`zC)9gg@v;L=2iELuzMyUIF7c@mXgZqqpWU^#yl%y>=S`mO>X>?EX_J0!)$YUI
z&Fccn3aUTe-CfvSGHIPL+|j>m+-v)%b}5#%PH~2e>FDlHs7&aX@^JxOT6AAzj~o?o
z!>4Iu{Ev>&eUza)na+(0u3ttS6*<bxOr@L~6Z+A{#Pm2iP!i~cn`}ww`JH^_0siPc
zzm@mx*y}yd3a-Y!eazM9!}jb<xH{pO>ziX2A4=Hz$??mdlyv@Z%B4zZ8A@%udQAG=
zsf(d6cAYt8pKJ8K>fQN2=UqWBEDlPlzj`h3$ZM`^`!;_Q|K`ybk+<{X<D1LBxVUot
zm%l&t%Ewb1ANr%~YJ6i*($-%c^VP(&a}ydrsah0&Z1I_6i?1H<8aig*n9y=H@h{&T
zlU_Bp-1EHagLv;xj=6px-@ul<eeTq+f)d{T+cnoi2|ZV@oxOHU%6{2(<=DlbW4%|8
zY(6u!`<dsvubwNt8ZV2udTsNu*Djtp<r=7JnsDOsH^;jsxZeKenl$M7uAS1x-zPR+
zJ?=sUXRb-VyXN}jcdj$XT%pGTk3H|2`{G5+0^QO4kzEw;>7sK2|2GA?w;Z}_IX}@w
z{UYT-QKoVokL#sA{<kt7b-V<CI-Im?A^|nB)KRel3#b4N-~?JgokHs5&<X4ZsB=Oc
zL?KWD>XaHm6rfI57NCw>G*|)}K^>s;S_XW<SP%~iKnAdZSU{a6>g-VmDgvwo<A4Hq
z0qRs!hjJkZ0u3M&h(QOS&Tl!G2ogX5NCUl~6O@5uAOY0jodqfZb;hZ~N1a*ffX)Vb
zFb7a4k2+;7U=?tKTCf68=XxF(52(Y8wT0W%q~opwtzZ(MPG2<$1k_Q?1Ixf<;0@}5
z3Q#9e1Lgwi1eX9QFaYYr7J?oy15igW7o-5{<Q4<ETE&1;&<;XD6_^GV12aek{$K&P
zjT1?m`IAnh#EZI+Qn^qe^TEQU6jC3O9TyZV6OEq`tk&>+WFbTrpw!CLp%Vkbe098V
zy+A9NB-65!Sy_Z&N>~^%s3O@YUo9&zs#(2EXd(i=EP@fKqxEczK^@Bz8lvTKvUs^E
z!7G4OMkdB3vMParRkK1iX(~KV_AWg;Q=O2+nheCu`pPV7gT@+b(5sVW!T<x~6`SJ4
z+Ggqvsmkf$UP3)9OVhGyQ+hlbVPKOoCi4=BUaOfU2xn&tyb?3xw21+7t&O#3k?`ys
zRxV)$4px$@(GtU)yfD4YnI91@5NZnwi}ZT6zId)dpE#Wu0#pWdMudr-rwyAwWdTm0
z#)Qij3RtVKBrG6WBg~#J3drKI*2F@#G|NyHURbWxo75IzN?e6LkyQt<qDm4FB?}jX
zS9z(c^<F}`X}WqMuV%K;;G>nrvVk%+8x@^hqGzYp#?RJk>)1Gh4=Z5hY+!_5Xy93W
zgoUaZ0gJRO%g(4*Mz~m6gFw&IYisp}{0xCQg^j3IMhH!MZKE(i#)b(K4Q!K$cxi=U
zR&RkQNyuvJMe`R=W7W-xq$M^?&jz#_>|U%spiLl=S@ovu*{sZDGesEGL=+}h$-ScB
zIJUE127@V4*dY`~*yQ<gX(!ud?^d%BvL&)awN?<JFJxH*yEJZrsa~jEu*{@42#GAb
zy2ubFs%3>$%T0QTuR$gwHa0*>OsqOk)+6(ZVzUK$L6Q$ke3!EB!ZK}Yucjq#EYTWd
zhUp~Aq>qw^NrbaIt?Cu|f|U_bY;(0iCtyiOymqNpZDRF$p@L<qc|<GFYcXyjl7wmt
zwK@3d-dZ%$iA?ZV(=C4fyZv<EFVr0>;GD=2BY)Io@AhBGP7~$%+k7P}9V%fV0sTtW
zbNsF+y1y~3urFHEtNG0TZ$)W4kEH~E^~LJ(GY%Cl*}C=e?)Be(+UmFA`I*6AE|{}&
zP2durvc(TIEU&G3;*;0EiF(5n+C6_T^SArHyY%VKqdPySDOjIhefj*pV??*9apYo=
z=xN@QJpbc7-8z@<y?So3;6B^^%!a*6loxK(u(F}j5e@U|{wncw?a^~9HJb!~>k@7a
zPxUUW=>Pudy;p|DUb|Z^J6DpVj@c8(KQs38t$PBe&8atSOKH@sdhnvx=h6$4CN-;f
zXTEYe`h3co(+|H|Czs?;c1&OdAwu0i^T?JZd0KXa=YNu?JK3aL)ASQ9J5H1@!GDe9
zR>65ga|ybF+FUv=sPuHr;RCuZ()EJ}=z2oeH)?z7+C$e1y8h60kFF`y&eL^_+EKce
z(RGoog|Mez2$kOnZ+3F{G|LTpdW`+{x$CZupoej5gqnU;u81jjLz;eOm6d4vi71e9
zM++G<&KyJ{hP}jSNWZ^cBVf!WYf)GYfUNkGpc2w`;|#Ovt=ci}cl&TskD%=UpQ%aL
z&?HzX2=f?iC$rMc!@?+`ZI$OR?gQTAg!mw)VpR;va4#OPU~S8?lxUH7a5{6olf_w=
znOpBfQkp5mvzA>VBDY1(lppvlJ%BRG=|U+{iTH!4jmlDrEY=b|fkxdVCwZL)vTQ+>
z{Q~zM>TNi@na`NP0i1%6OHk|(bV|&qNCVZ+b_(b+Om&Z-s>B`7)QI?F%te*7Tlfs$
z&q3~mEaqO`z~o6R!<R6FjN6;X-)u!LLfiEiZ-92GdoqH|K@B6P{l+&-XrJ>+LKvn5
zCA*85XBAjsjnw7oC7aM>y1Wt;J)kwZd2WW!!+>y`35FobL4PABqx)d#-)#tMF&RB$
zag*5pVX<yUo9<)_XGBJH_s^Vb|D+L-E)V(Be@^~%dc9AdpyKE<AK}OMu4}mR=MQ_m
zTdo|jd^-N^yHs8E%U+0m|35nK4@xL8oz?65HOF7jW+W`nsNv_E%4eTVEpv#T?$}j)
zFjujyvZU(2LvkAkqr1;d{yx{xz~PVV9y79Sx$cu?KQY;UVv{|<Q=YudCcB@sq8F%<
z<fRif&#5C~?0c2YQY9YQJf%NTe&$1@`3}V8DV>VE?aqm0Ouo~B&GAIKi*`Ej0{>XC
z#YVsfTWn_vcE`bl&3BF?GtJQ`CSOmKd6n3L+wz@xWCo)o)e{{~6%pj1SQUxq+nhvh
zR^%y_NP@EQPo5KsIlt;=#SEXHA7lNWWp(fL65YMMgj3D!_t94*%HK&cl^=NA@k4@d
zpWtC6a;{|=?igj9S)rzhx_8voy%*5=ya>1eb%m)rNnK9rzS8MNU1#dD+Cd$l?q3C<
z?kZI%HBAoz9l%WG%vA*L)WxK3XeO8lI>07CO<W|P^Y9>`?r%Sc0)3zn>;x8|1=Qu-
z4(<a|KuzL0U<Py|P#3ou^nxW|7uW$F2HQXtNC63;9jpX<05z{cU^bWr_5y0MsoR?h
zGQb4zD3}96fdjw_ap$FUK6e5-{R@E`2*7?Y1yq6+fSS|CK`fvqTnt)44d4MafcfIK
z98ch$28`e_PztDPPu=zsun}wq*`NVLgH?bo9}j>mK;3S-q)Z0|ARX{QIamy~g4LiN
zB!X}-AB+KyfLd@DpxdDFU_EF8@t_(A!9H*ga00q52nHd*3c|o{kPPTnA{X2Z9t6w4
z0T2jgfi<8T+z)QERiG-lv9yhH-pt^{nZc1WD`CC7#nQ1vM&zu*$CvM?RQLz5bjQF3
zu|7C=hF7l<)01a*Y?wk99;D~7lNjM-mQ_TsQw$+U#mXajQ9Rb45gXxLn&9xV(Y%<k
zu|&+WJi^AY@jSVKNnpbh*@Sr3mmxu8qvT0sYPjDtG0%rZrl6?lQQ{d2_AW(`e5RST
zSXmyE%nM49+q6NcY?>sU%@8xhz+)^5BAXTEqlhA8b|y)-vsvOOThb&Sevl6vl+ByT
zvN>i4k7d0vT!gHKRj|2pSV)u?E@2a>tU*pDzd*vu8K1(;BA=PrjNCFITpnGlAVC_&
zW|7RD$L5fDMbv!OM^OOFv|wQnTjI+qO2;x#5K~qjR3T?7<&v3I6$EmVYLhQp6Rrqi
zP?fxvWi9sE{&hY?!j`aX(V}`boUjTPBWX}HvePCJxlv5`4Q$h5Y!le@oG7zAh~+IB
ztMDbwEk3LQs%$mDazwQ$H0>-eXl$}w(GetNIs?Xbb@P@iWydOz&!#YhN7<H1B$l@2
zJ*?5qj%6oIRTZ%;@#$5tD_EO2O2HC4%dRZ+3JMChtg=`XtHo<rQxIPgq*yyaVu6@M
z9%Q&HXq|k$UAZAh&LbPcH~IPmvzsHfuv;x+<Y)O~!}}!mpt}Rnzdqa0r}u=*eHoVA
zi<T-Z;oAf4JoY};7K}HF`=g3>Jn*36AwJv2DjxO;dL*2+`O4Ax@jLlJ3j!Z)BR)a>
z3~!O#XHJlO7umf>&K8Rq_Awq$%<lD>#O`DEM<__d0fqc=c^=eykoA?2Nvs?i$Hpy)
z+7JA6k39@)fBnE2%i!>d^ErS2kNkCS^%32L)tt5Gj~M-<uJmr1a)T^WIq3N&Ixyzj
zmsx862uK0cK2RG)ZQ4XY?JKn%)Rw*qsO_aT>jI#2klHM2yZ#KQT|WuvoTl@89-ua$
z3Oojmg2~_tpti#YoB}5RwQCaqT_f_rVL<H?wOtXw3|0baU#LCZ366u`06Wlv`GDF;
zYA@@67x)+q0%}v90?&fC0bN&~2CswjU@kZU?gIA$YCowRdly^+M(`$x1AhU31R6kX
zn*yYQ%U~_21rjh0%mzh(+Bmu{d;#d%@g(>X%mTh37AyplKrg5UAAnZ?AIL!|Xat{u
zz2JRt-A2#A^H$&w9s$&rrh~6QGgt?T!3*GdAOkE=f@MGtV!#q$1B-wO&^03yxWE8-
z3%mqEz!ney=$dd5c!N*CbKo`b5O^6J0NcT@z*z7gSPlxnZ^4J)8Ylw^zzwp%MsPQn
z4pxA6paW8H4(tOR;2p3AtOiejAg}>+g9`8m&;&jL`@!$P9uNvt;1D<nhQJvx2Yd>)
zfk<!~{2F9{RUjPv3FLv#K_Pep%ml|k4u}F*K`VF$^niNs9(WP-gL}a5ffMWi77z$_
z12u4fG++P^g9ktfm<rB<$3YVCtb^Z6ZrS3|=8RYehc>c^82;=^2v#Th<&}uUL_+Af
z86k8b#5(AwWN~grNZ>dU6imh=euA2?n(=fY3?sx_N5TowPnt|3$P^r*5Q49;Cfq`3
zS~fa{#Kw_$ghN9^5)yGISntR*;q)1I%_PK3EY{?d5Szdlo0>+_kxQI0Yc?U7#JA5L
zl9f$J4sj48$t830NT?IJz4HkvAcaIniz><@!lCKKb7SX``D8&zC|OuiN<zy>b~zyx
zmD6UDDm@isTJ`uEQp?sYLY8`0LnG0Ki1FC8m^Awl%h2=|1nAdULOKcQ>LyFb(q%?M
zmXjVrd{Cvkmw2xTB`cAPtRmTDbqmQSYiOust&VIM%3ViDybKY`L$VizAi9UIBkMyq
zY{cUxLN=4}TecE+>2lJC9E99Wwvl_ty*=CSTfUr-`$Kjh*+$x)2WC9@5P5h?NC<g^
zkRAM;j}oD~f7k9k<gvYE-}>=nKdGrX@Hi14e8NqZLqPK(5^|V?RXj<aI`VW4nQ}Cq
zJo7BcdG@(uA>?_YI!;c6kfH38FPwUj@C_k^y!7$_dF3=YLrmiJ*$6LxC<wK_3Kg9V
z2|b5L@)|TwSaP0_*C9~th3q$K5b@^4w+Imv<M_9iLz|>oL&&8N9l30NC*;tjcL~}1
z-uvW(kPjQgW<NrVSBUsmA2p!!uL?i@HL`#58}$E_jRyyjmHhU1A<*mEkcF9q{QlEW
z@&~B%+8@bhpZk+Pkw23KtHdbc*W?Q%6=I~Jo4zFEtH0!tm!LEZSp0N5kKerb4Sz8F
zYMlQU<8+@KBD$lGcox4ALw?l7@6ON44taI|$KDn8&^5buedhmPMPs&9&D)gzX0Ix4
zQ(@<pweRj(e{QhJPqJ@j@Y#iymFEJN2+J0qOsT6iKJkI&>&?G0vCT#OSp)semj)lC
zXJwXNq3%}$Yp`Ma^o5&ubHiW1JlovC6GV6Dgl9K*yYn($;1BXRoV~P^*HpSnl@l1H
zcNmZUQF<7Y84M*VC0sC?QS!Klm@xm$L%t#(`pT0s_IgTBFz)7H{PmzGd%};hCtgqB
z$)0pQNYm4UiCZ$zt^D;QGzY@b*TchEMv6Eb75n2-uFHu(y$=iy{6TgHf0X~f{1<7z
zz#k3V(U<c(`oMZepZ<P<lkPh@O}?W8+h5{{?GBHz?)b3h7urAkLYDZBt)2g?uMo_4
zoyh;b!^96>(-f7kskEV@v_<u(Vu`l4p`x;LVR>VHo2#KkJ5@7xiFOg<@T6@nZK$r)
zPSI!^N?nzRXlSdiN2sN>w7C^QBlq0cgva<KPoR<pOv~ouji<zgT=A8|1>amwRc&Q`
z#loB8Jc2E3?P{VGYTFuW%Nr{^GPJpxx^BE^EUT+5Z>6%iDqBk{N?S{je2KQQ0m*9{
zs%c(lUaBEcyKICoH%p<lm}vf+q!^C7MT+v$*2?O}=GyYodQKZRNmAR;n)o9*8rob7
zujk?#AB~zCn=2}tE2(B%o7*avEt>*?hg+|;M2{j17k7&+RrQUft?{uxmSe<=5h|fw
zc5_8Bqg2Gj-BMB8^@gD>BNTrl`X5%p$#;upxwu;@xvrx-&}__4X*O<@26J)0yey{J
zpAsg16k)iyUtXBRpOPhE6j`{qTV%P><^Mk;ftwq*j07(3mP&qsk?@mqENPSmb8-I$
zSxm-Jgz-e)BF!DC-!!U?<05aV=3mhPrqQgNC-Rq<<|hq<X*5&liM&OcJL`hcjG<{X
zYj|hXjArLd|4W8$w5i4|`puQ4t&PpxT3laS)?C`$MVI2HuF~e_#tu5-+bf$}Y8xA9
zY<#3K(xk=eq#aiGh<B#Q=*SrDGDhWX$<56!nCjW7sobGa^w(LPWwBTsfc|D%EU^}A
zwI$VJskS6nb8%MdP47Esr0vH0EJubXj4M|P_a1Q_7UaX77LweOj4Q$x%dAAiSral`
z^=-vz`LTu3S?y)%`K@IQ_S)=PYguL2tme{!xJ4Otjnx?qvues*74;dl9n~4GxSFy8
zCr#5<QPA0vnQW;okIAhmZ*U;a(lH!vFHg>h&qyhXvnL}@Lzc0)&_dsLR^=O8GLmQ2
zl|^@&$_nDDXdT6cHTC6PNT1pYd461ev8%qmEG@UH%$3?&JO}C0;!I^}9kiT|BIGYB
z%&MwLukYaACC@UIx$04V?sa)hWrfzb!kFB+@-!;9rM4`3*5cxVETgAAruvGsRFsih
zL)(gW*O$4n+Kbbm7b;Woto(diQEPVctfX6H@22SsVp?i%$dx?HYM+xq<#Lr4<hK;3
zCbbn6nCdwlF0yrU^)*<bOX$m#WUMT-dfvIRYAVw58_Hs4H5EmtwhZg}M{+|~#_VLS
z{MKT~Tb!0^ETZ~IhYSt*t?0uW<rEjvezbJa{y@3V@vQn{+9&C`T@?lSNmX+?Xj@0h
z*X60VxT(0Hb5UXR^?tC_I`T4VxqeMc#aK!09;wS>S7&=sp(8rWBNxVmsiQJ~WZQ~c
z%A!+^S#=JsjIx4!V^Kja<g`IP&lnm-mhRijV!y2{6$SNe6)Dx8dZ3T;^!l5|#clP+
zji=6}%w#$rMz-4{^N+?K`gyp$qm92hjHUYG=%lXFf~=ad+v;V6tT*d~%3vzXPa089
zJ~!6vU3C5>bz#1C6&KL?W;K>K<kwdfVouR9aD9GI`EEZBP5*{**qKG`z&|T{?QQ2!
z^uIy&*q<?nVs1Zf%m0Pr)-wl28@I#u<6k>&xpm{88@JiFAGdM;hQ9wf_OsLOnd_t4
z{^56{>D@HKF8$bkHsm*zH8^5D<Lc(M3O2u`Ed9rG-=1gV#@cB8fBjwgD7GAHkFmI*
zqP-mJXHh}N?d<4n<)P~d?0?mWay;vL3OAooZf<?KE1&ZzD$-|}iVIDn*j(E`Scit+
z{cG3I?w>LLJMCS!wO!ZW{foxH?bj}Qp6A_YW8jxxLv5b*#x<H>G5qdd+n=|e)5E?g
z#wh%gq^@Cqu%)zMR##EstT@h(Yp}-QxvMP3W6vvG`CaAF_3dS~)K|=i&aAVv+2_o{
zxQ1VRV~!Qqmp5cJl|{#L{-=FTY*%KTy|&6>Nu#d#^>ld5q2YFm-C{|vPQjxQ?z<(1
zy6+hl>biGOH{MFanbmOPtu55o$A8v`oYd)0Zp6h>MK9Pn;D;9$%c7j_s3^J}z#k^k
za5lYIxlIE1o(3(nPK(uQ&vROw?J=J878{M^DyDb5PmIcmisBv}bU$G=(r{YNaC%1u
zSE#3I?vYDM+hDQTI<qX+w5Y16DB2MGmxxJGQHk^+nG@o+^4Kk9oIJEXOC!yXyVaV@
zl^;tZZ&#naI%V;2dbFL}52e%e+|QafTtB`?a}nJLCw2U+@*S40sP^mn;L7LvlPfNj
zmPgw+Tz)F8FDmwW`IZ_k71tl!evAuRXrEggdA1Hqa#;lOqoc7CyrJKJ)PFRo)tU{p
zG`1())PEk;(vni_f7b`qXNrx6XXgy3|A*saxP0V_zqNc?wCCsiyFR%7veR(Ojq-0Z
zK4?;#Ee7&b()3(^%&Md9r!P{u{!QWZd#k><q>j8I%!f><*Q0OGpyr0|uvoZdTrY39
zKQdBKe$=92d56c7mD4wOxv>>TUXOexPhpL8(xUKci?xI9(}#b#+}$&juE$wYs&Cdu
zN)Zj`V9t!3k6czy&Jpv`qi=dBfZuF7UokikS$_RI0MVZLIYLw;N**ykEiE)`yiuP=
z|DH)OVt#ts*Ui;rDWdZoazLkCeOv%~M|-O+$jbGD!*hs0lh{%`^OuYNk^FXAwuP#U
z%a1$vOQ!h|KzoX)9_=)Ye!ec$Khqzbo}wB(ljhd`@XU9VkEsM}>ghx-$MAgTn!w4+
usMJlR&5PSATMgw+O$P2<M>BkIqv1|;@L3t((Es^=nXUNyogDmM?EW8!bfpmh

literal 0
HcmV?d00001

diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/description.xml b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml
new file mode 100644
index 00000000..70843ade
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/40981/description.xml
@@ -0,0 +1,49 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>40981</oml:id>
+  <oml:name>Australian</oml:name>
+  <oml:version>4</oml:version>
+  <oml:description>**Author**: Confidential. Donated by Ross Quinlan  
+**Source**: [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html), [UCI](https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)) - 1987    
+**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html) 
+
+**Important note:** This dataset is derived from [credit-approval](https://www.openml.org/d/29), even though both datasets exist individually on UCI. In this version, missing values were filled in (not clear how) and a duplicate feature was removed.  
+
+**Australian Credit Approval**. This is the famous Australian Credit Approval dataset, originating from the StatLog project. It concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data. 
+
+This dataset was retrieved 2014-11-14 from the UCI site and converted to the ARFF format.
+
+__Major changes w.r.t. version 3: dataset from UCI that matches description and data types__
+
+
+### Feature information
+
+There are 6 numerical and 8 categorical attributes, all normalized to [-1,1]. The original formatting was as follows:  
+
+A1: 0,1 CATEGORICAL (formerly: a,b)  
+A2: continuous.  
+A3: continuous.  
+A4: 1,2,3 CATEGORICAL (formerly: p,g,gg)  
+A5: 1, 2,3,4,5, 6,7,8,9,10,11,12,13,14 CATEGORICAL (formerly: ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x)  
+A6: 1, 2,3, 4,5,6,7,8,9 CATEGORICAL (formerly: ff,dd,j,bb,v,n,o,h,z)  
+A7: continuous.  
+A8: 1, 0 CATEGORICAL (formerly: t, f)  
+A9: 1, 0 CATEGORICAL (formerly: t, f)  
+A10: continuous.  
+A11: 1, 0 CATEGORICAL (formerly t, f)  
+A12: 1, 2, 3 CATEGORICAL (formerly: s, g, p)  
+A13: continuous.  
+A14: continuous.  
+A15: 1,2 class attribute (formerly: +,-)  
+
+### Relevant Papers
+
+Ross Quinlan. &quot;Simplifying decision trees&quot;, Int J Man-Machine Studies 27, Dec 1987, pp. 221-234. 
+
+Ross Quinlan. &quot;C4.5: Programs for Machine Learning&quot;, Morgan Kaufmann, Oct 1992</oml:description>
+  <oml:description_version>2</oml:description_version>
+  <oml:format>ARFF</oml:format>
+        <oml:upload_date>2017-12-04T22:15:38</oml:upload_date>
+    <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/18151910/Australian.arff</oml:url>
+  <oml:parquet_url>http://openml1.win.tue.nl/dataset40981/dataset_40981.pq</oml:parquet_url>  <oml:file_id>18151910</oml:file_id>  <oml:default_target_attribute>A15</oml:default_target_attribute>      <oml:version_label>4</oml:version_label>    <oml:tag>derived</oml:tag><oml:tag>OpenML100</oml:tag><oml:tag>study_135</oml:tag><oml:tag>study_144</oml:tag><oml:tag>study_218</oml:tag><oml:tag>study_98</oml:tag>  <oml:visibility>public</oml:visibility>  <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)</oml:original_data_url>    <oml:minio_url>http://openml1.win.tue.nl/dataset40981/dataset_40981.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2018-10-04 07:20:02</oml:processing_date>      <oml:md5_checksum>920e2419a28215109651fcc5cbd1662e</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml
new file mode 100644
index 00000000..ba431ff5
--- /dev/null
+++ b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml
@@ -0,0 +1,175 @@
+<oml:data_features xmlns:oml="http://openml.org/openml">
+    <oml:feature>
+    <oml:index>0</oml:index>
+    <oml:name>A1</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>1</oml:index>
+    <oml:name>A2</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>2</oml:index>
+    <oml:name>A3</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>3</oml:index>
+    <oml:name>A4</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>4</oml:index>
+    <oml:name>A5</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>10</oml:nominal_value>
+          <oml:nominal_value>11</oml:nominal_value>
+          <oml:nominal_value>12</oml:nominal_value>
+          <oml:nominal_value>13</oml:nominal_value>
+          <oml:nominal_value>14</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+          <oml:nominal_value>4</oml:nominal_value>
+          <oml:nominal_value>5</oml:nominal_value>
+          <oml:nominal_value>6</oml:nominal_value>
+          <oml:nominal_value>7</oml:nominal_value>
+          <oml:nominal_value>8</oml:nominal_value>
+          <oml:nominal_value>9</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>5</oml:index>
+    <oml:name>A6</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+          <oml:nominal_value>4</oml:nominal_value>
+          <oml:nominal_value>5</oml:nominal_value>
+          <oml:nominal_value>7</oml:nominal_value>
+          <oml:nominal_value>8</oml:nominal_value>
+          <oml:nominal_value>9</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>6</oml:index>
+    <oml:name>A7</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>7</oml:index>
+    <oml:name>A8</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>8</oml:index>
+    <oml:name>A9</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>9</oml:index>
+    <oml:name>A10</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>10</oml:index>
+    <oml:name>A11</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>11</oml:index>
+    <oml:name>A12</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>1</oml:nominal_value>
+          <oml:nominal_value>2</oml:nominal_value>
+          <oml:nominal_value>3</oml:nominal_value>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>12</oml:index>
+    <oml:name>A13</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>13</oml:index>
+    <oml:name>A14</oml:name>
+    <oml:data_type>numeric</oml:data_type>
+        <oml:is_target>false</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+    <oml:feature>
+    <oml:index>14</oml:index>
+    <oml:name>A15</oml:name>
+    <oml:data_type>nominal</oml:data_type>
+          <oml:nominal_value>0</oml:nominal_value>
+          <oml:nominal_value>1</oml:nominal_value>
+        <oml:is_target>true</oml:is_target>
+    <oml:is_ignore>false</oml:is_ignore>
+    <oml:is_row_identifier>false</oml:is_row_identifier>
+    <oml:number_of_missing_values>0</oml:number_of_missing_values>
+  </oml:feature>
+  </oml:data_features>
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl b/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..a865af56b27c6a868c9e6806a8fc73bb21606bef
GIT binary patch
literal 899
zcmaKqJx{|h5QZBkZJMSHVqgPiilD;ud#r?5%EvE2QA!Pt)VNiXpfVuA?hW&Qx!4I9
zkv7srmi@f<p3nDgeQnH&{NBXLw89UUE5f2hXC5R$Op;hG{3jA5>6!?>e-FpEH&<|a
zo$q~$_J;s5uZC<+HUg4ZEQkO`6D+usW%85P6>QjS8HFrZ2=3)8|0P(YBo+^#H!cJ>
zups<^rO}M6{V0s%kUba7w35!I1~ru(t-;y`k4r%v$ec!G9X<-`-qAftB8^0n>~0}V
zjWi|MQ)FH(b)h`!apMvS><W?_Send|vVN%yVYnE=&<lhi1H#?x2Np;4m>WIF*@qZF
z3?W7k;~$Tz=24X%gEEgS6>ZZ~dPdJ_rzo(diEC0k+|mJCBW+2JN^DxxD^k}rL0t;Q
zy9JIWa6n*ApeyAM*U-?0Lh<h8uBLF6BA<%CFPOEdxIt6XG?ixCe6%#QrO<pp{tca{
Op`Jp^hT^xAY3CbuX&*HJ

literal 0
HcmV?d00001

diff --git a/test/data_manager.py b/test/data_manager.py
deleted file mode 100644
index 1bb78bc6..00000000
--- a/test/data_manager.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from pathlib import Path
-
-from meta_automl.data_preparation.data_manager import DataManager
-
-
-class TestDataManager(DataManager):
-    @classmethod
-    def get_data_dir(cls) -> Path:
-        return cls.get_project_root().joinpath('test/data')
diff --git a/test/general_checks.py b/test/general_checks.py
deleted file mode 100644
index a1d8610d..00000000
--- a/test/general_checks.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from pathlib import Path
-from typing import Union
-
-from meta_automl.data_preparation.dataset import Dataset, DatasetCache
-from test.constants import CACHED_DATASETS
-from test.data_manager import TestDataManager
-
-
-def assert_file_unmodified_during_test(path: Path, test_start_timestamp: float):
-    assert path.stat().st_mtime < test_start_timestamp, f'The file should not be modified during the test: ' \
-                                                        f'"{path.relative_to(TestDataManager.get_project_root())}".'
-
-
-def assert_cache_file_exists(path: Path):
-    assert path.exists(), 'Cache not found at the path: ' \
-                          f'"{path.relative_to(TestDataManager.get_project_root())}".'
-
-
-def check_dataset_and_cache(dataset_or_cache: Union[Dataset, DatasetCache], desired_name: str, desired_path: Path,
-                            test_start_time: float):
-    assert dataset_or_cache.name == desired_name
-    assert dataset_or_cache.cache_path == desired_path
-    assert_cache_file_exists(desired_path)
-    if desired_name in CACHED_DATASETS:
-        assert_file_unmodified_during_test(desired_path, test_start_time)
diff --git a/test/unit/datasets/__init__.py b/test/unit/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/test/unit/datasets/conftest.py b/test/unit/datasets/conftest.py
new file mode 100644
index 00000000..bd43ec3e
--- /dev/null
+++ b/test/unit/datasets/conftest.py
@@ -0,0 +1,18 @@
+import shutil
+
+import pytest
+
+from meta_automl.data_preparation.dataset import OpenMLDataset
+from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id
+from test.constants import OPENML_CACHED_DATASETS, OPENML_DATASET_IDS_TO_LOAD
+
+
+@pytest.fixture
+def openml_dataset_ids():
+    ids = OPENML_DATASET_IDS_TO_LOAD
+    yield ids
+    for dataset_id in ids:
+        if dataset_id in OPENML_CACHED_DATASETS:
+            continue
+        cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id)
+        shutil.rmtree(cache_path, ignore_errors=True)
diff --git a/test/unit/datasets/general_checks.py b/test/unit/datasets/general_checks.py
new file mode 100644
index 00000000..5e2f446d
--- /dev/null
+++ b/test/unit/datasets/general_checks.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+
+import test.constants
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.file_system import get_project_root
+from meta_automl.data_preparation.file_system import get_dataset_cache_path
+
+
+def assert_file_unmodified_during_test(path: Path):
+    failure_message = ('The file should not be modified during the test: '
+                       f'"{path.relative_to(get_project_root())}".')
+    assert path.stat().st_mtime < test.constants.TEST_START_TIMESTAMP, failure_message
+
+
+def assert_cache_file_exists(path: Path):
+    assert path.exists(), 'Cache not found at the path: ' \
+                          f'"{path.relative_to(get_project_root())}".'
+
+
+def check_dataset_cache(dataset: DatasetBase):
+    cache_path = get_dataset_cache_path(dataset)
+    assert_cache_file_exists(cache_path)
+    if dataset.id_ in test.constants.OPENML_CACHED_DATASETS:
+        assert_file_unmodified_during_test(cache_path)
diff --git a/test/unit/datasets/test_custom_dataset.py b/test/unit/datasets/test_custom_dataset.py
new file mode 100644
index 00000000..5f34b194
--- /dev/null
+++ b/test/unit/datasets/test_custom_dataset.py
@@ -0,0 +1,48 @@
+import numpy as np
+import pytest
+
+from meta_automl.data_preparation.dataset import DataNotFoundError, CustomDataset, DatasetData
+from test.unit.datasets.general_checks import assert_cache_file_exists
+
+
+@pytest.fixture(scope='module')
+def new_dataset_data():
+    dataset_data = DatasetData(
+        x=np.array([['a', 'b'], ['b', 'a']]),
+        y=np.array([5, 10]),
+        categorical_indicator=[True, True],
+        attribute_names=['foo', 'bar']
+    )
+    return dataset_data
+
+
+@pytest.fixture(scope='module')
+def new_dataset(new_dataset_data):
+    dataset = CustomDataset(42)
+    dataset.dump_data(new_dataset_data)
+    yield dataset
+    dataset.cache_path.unlink()
+
+
+def test_error_on_missing_dataset_cache():
+    with pytest.raises(DataNotFoundError):
+        CustomDataset('random_missing_dataset').get_data()
+
+
+def test_custom_dataset_dumping(new_dataset):
+    # Act
+    cache_path = new_dataset.cache_path
+    # Assert
+    assert_cache_file_exists(cache_path)
+
+
+def test_custom_dataset_data_loading(new_dataset_data, new_dataset):
+    # Act
+    correct_data = new_dataset_data
+    dataset = new_dataset
+    data = dataset.get_data()
+    # Assert
+    assert np.all(np.equal(data.x, correct_data.x))
+    assert np.all(np.equal(data.y, correct_data.y))
+    assert data.categorical_indicator == correct_data.categorical_indicator
+    assert data.attribute_names == correct_data.attribute_names
diff --git a/test/unit/datasets/test_datasets_loaders.py b/test/unit/datasets/test_datasets_loaders.py
new file mode 100644
index 00000000..0fd1ce17
--- /dev/null
+++ b/test/unit/datasets/test_datasets_loaders.py
@@ -0,0 +1,24 @@
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from test.unit.datasets.general_checks import check_dataset_cache
+
+
+def test_group_load_new_datasets(openml_dataset_ids):
+    loader = OpenMLDatasetsLoader()
+    datasets = loader.load(openml_dataset_ids)
+    assert loader.dataset_ids == openml_dataset_ids
+    for dataset_id, dataset in zip(openml_dataset_ids, datasets):
+        check_dataset_cache(dataset)
+
+
+def test_load_single(openml_dataset_ids):
+    loader = OpenMLDatasetsLoader()
+    for dataset_id in openml_dataset_ids:
+        dataset = loader.load_single(dataset_id)
+        check_dataset_cache(dataset)
+
+
+def test_load_new_datasets_on_demand(openml_dataset_ids):
+    loader = OpenMLDatasetsLoader()
+    for dataset_id in openml_dataset_ids:
+        dataset = loader.load_single(dataset_id)
+        check_dataset_cache(dataset)
diff --git a/test/unit/datasets/test_file_dataset.py b/test/unit/datasets/test_file_dataset.py
new file mode 100644
index 00000000..125cb641
--- /dev/null
+++ b/test/unit/datasets/test_file_dataset.py
@@ -0,0 +1,48 @@
+import numpy as np
+import pytest
+
+from meta_automl.data_preparation.dataset import CacheNotFoundError, FileDataset, DatasetData
+from test.unit.datasets.general_checks import assert_cache_file_exists
+
+
+@pytest.fixture(scope='module')
+def new_dataset_data():
+    dataset_data = DatasetData(
+        x=np.array([['a', 'b'], ['b', 'a']]),
+        y=np.array([5, 10]),
+        categorical_indicator=[True, True],
+        attribute_names=['foo', 'bar']
+    )
+    return dataset_data
+
+
+@pytest.fixture(scope='module')
+def new_dataset(new_dataset_data):
+    dataset = FileDataset(42)
+    dataset.dump_data(new_dataset_data)
+    yield dataset
+    dataset.cache_path.unlink()
+
+
+def test_error_on_missing_dataset_cache():
+    with pytest.raises(CacheNotFoundError):
+        FileDataset('random_missing_dataset').get_data()
+
+
+def test_file_dataset_dumping(new_dataset):
+    # Act
+    cache_path = new_dataset.cache_path
+    # Assert
+    assert_cache_file_exists(cache_path)
+
+
+def test_file_dataset_data_loading(new_dataset_data, new_dataset):
+    # Act
+    correct_data = new_dataset_data
+    dataset = new_dataset
+    data = dataset.get_data()
+    # Assert
+    assert np.all(np.equal(data.x, correct_data.x))
+    assert np.all(np.equal(data.y, correct_data.y))
+    assert data.categorical_indicator == correct_data.categorical_indicator
+    assert data.attribute_names == correct_data.attribute_names
diff --git a/test/unit/datasets/test_openml_dataset.py b/test/unit/datasets/test_openml_dataset.py
new file mode 100644
index 00000000..81042648
--- /dev/null
+++ b/test/unit/datasets/test_openml_dataset.py
@@ -0,0 +1,27 @@
+from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData
+from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id
+from test.constants import OPENML_CACHED_DATASETS
+from test.unit.datasets.general_checks import check_dataset_cache
+
+
+def test_openml_dataset_creation(openml_dataset_ids):
+    for dataset_id in openml_dataset_ids:
+        dataset = OpenMLDataset(dataset_id)
+
+        assert dataset.id_ == dataset_id
+
+
+def test_openml_dataset_is_cached_cached(openml_dataset_ids):
+    for dataset_id in openml_dataset_ids:
+        cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id)
+
+        is_exist = dataset_id in OPENML_CACHED_DATASETS
+        assert is_exist == cache_path.exists()
+
+
+def test_openml_dataset_data_loading(openml_dataset_ids):
+    for dataset_id in openml_dataset_ids:
+        dataset = OpenMLDataset(dataset_id)
+        dataset_data = dataset.get_data()
+        assert isinstance(dataset_data, DatasetData)
+        check_dataset_cache(dataset)
diff --git a/test/unit/test_dataset.py b/test/unit/test_dataset.py
deleted file mode 100644
index 3ac46d6d..00000000
--- a/test/unit/test_dataset.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-import pytest
-
-from meta_automl.data_preparation.dataset import DatasetCache, NoCacheError
-from test.constants import CACHED_DATASETS
-from test.data_manager import TestDataManager
-
-
-@pytest.fixture
-def dumped_cache_path():
-    path = TestDataManager.get_dataset_cache_path('data_dumped')
-    yield path
-    path.unlink()
-
-
-def test_dataset_caching(dumped_cache_path):
-    dataset_name = CACHED_DATASETS[0]
-
-    cache_path = TestDataManager.get_dataset_cache_path(dataset_name)
-
-    dataset_cache = DatasetCache(dataset_name, cache_path)
-    dataset = dataset_cache.from_cache()
-    dumped_cache = dataset.dump_to_cache(dumped_cache_path)
-    reloaded_dataset = dumped_cache.from_cache()
-    # Check data integrity.
-    assert dataset.name == dataset_name
-    assert reloaded_dataset.name == dataset_name
-    assert dataset.id == reloaded_dataset.id
-    assert np.all(np.equal(dataset.x, reloaded_dataset.x))
-    assert np.all(np.equal(dataset.y, reloaded_dataset.y))
-    # Check caching integrity.
-    assert dataset_cache.cache_path == cache_path
-    assert dataset.cache_path == cache_path
-    assert dumped_cache.cache_path == dumped_cache_path
-    assert reloaded_dataset.cache_path == dumped_cache_path
-
-
-def test_error_on_missing_dataset_cache():
-    with pytest.raises(NoCacheError):
-        DatasetCache('random_missing_cache').from_cache()
diff --git a/test/unit/test_datasets_loaders.py b/test/unit/test_datasets_loaders.py
deleted file mode 100644
index 1596e312..00000000
--- a/test/unit/test_datasets_loaders.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import time
-
-import pytest
-
-from meta_automl.data_preparation.dataset import DatasetCache
-from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
-from test.general_checks import check_dataset_and_cache
-from test.constants import CACHED_DATASETS
-from test.data_manager import TestDataManager
-
-
-@pytest.fixture
-def dataset_names():
-    dataset_names = ['australian', 'blood-transfusion-service-center']
-    yield dataset_names
-    for dataset_name in dataset_names:
-        if dataset_name not in CACHED_DATASETS:
-            TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True)
-
-
-def test_group_load_new_datasets(dataset_names):
-    test_start_time = time.time()
-    loader = OpenMLDatasetsLoader()
-    loader.data_manager = TestDataManager
-
-    datasets = loader.load(dataset_names)
-
-    assert loader.dataset_sources == dataset_names
-
-    for dataset_name, dataset_cache in zip(dataset_names, datasets):
-        check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time)
-
-
-def test_load_single(dataset_names):
-    test_start_time = time.time()
-    loader = OpenMLDatasetsLoader()
-    loader.data_manager = TestDataManager
-    for dataset_name in dataset_names:
-        dataset_cache = loader.load_single(dataset_name)
-        check_dataset_and_cache(dataset_cache, dataset_name, dataset_cache.cache_path, test_start_time)
-
-
-def test_load_new_datasets_on_demand(dataset_names):
-    test_start_time = time.time()
-    loader = OpenMLDatasetsLoader()
-    loader.data_manager = TestDataManager
-    for dataset_name in dataset_names:
-        cache_path = TestDataManager.get_dataset_cache_path(dataset_name)
-        dataset = loader.cache_to_memory(DatasetCache(dataset_name, cache_path))
-        check_dataset_and_cache(dataset, dataset_name, cache_path, test_start_time)
diff --git a/test/unit/test_file_system.py b/test/unit/test_file_system.py
new file mode 100644
index 00000000..dba55923
--- /dev/null
+++ b/test/unit/test_file_system.py
@@ -0,0 +1,7 @@
+import pytest
+from pathlib import Path
+
+from meta_automl.data_preparation.file_system import get_data_dir, get_project_root
+
+# def test_root_dir():
+#     assert get_project_root() ==
diff --git a/test/unit/test_meta_features_extractors.py b/test/unit/test_meta_features_extractors.py
index c5625f53..bd9b925b 100644
--- a/test/unit/test_meta_features_extractors.py
+++ b/test/unit/test_meta_features_extractors.py
@@ -1,38 +1,37 @@
-import time
+import shutil
 
 import pytest
 
+from meta_automl.data_preparation.dataset import OpenMLDataset
+from meta_automl.data_preparation.file_system import get_dataset_cache_path_by_id, get_meta_features_cache_path
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from test.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists
-from test.data_manager import TestDataManager
-from test.constants import CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES
+from test.unit.datasets.general_checks import assert_file_unmodified_during_test, assert_cache_file_exists
+from test.constants import OPENML_DATASET_IDS_TO_LOAD, OPENML_CACHED_DATASETS, DATASETS_WITH_CACHED_META_FEATURES
 
 
 @pytest.fixture
-def dataset_names():
-    dataset_names = ['australian', 'monks-problems-1', 'monks-problems-2', 'blood-transfusion-service-center']
-    yield dataset_names
-    for dataset_name in dataset_names:
-        if dataset_name not in CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES:
-            TestDataManager.get_dataset_cache_path(dataset_name).unlink(missing_ok=True)
-        if dataset_name not in DATASETS_WITH_CACHED_META_FEATURES:
-            TestDataManager.get_meta_features_cache_path(dataset_name, PymfeExtractor.SOURCE).unlink(missing_ok=True)
+def dataset_ids():
+    dataset_ids = set(OPENML_CACHED_DATASETS + DATASETS_WITH_CACHED_META_FEATURES + OPENML_DATASET_IDS_TO_LOAD)
+    yield dataset_ids
+    for dataset_id in dataset_ids:
+        if dataset_id not in OPENML_CACHED_DATASETS:
+            dataset_cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id)
+            shutil.rmtree(dataset_cache_path)
+        if dataset_id not in DATASETS_WITH_CACHED_META_FEATURES:
+            mf_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id)
+            mf_cache_path.unlink(missing_ok=True)
 
 
-def test_meta_features_extraction(dataset_names):
-    test_start_time = time.time()
+def test_meta_features_extraction(dataset_ids):
     extractor = PymfeExtractor(extractor_params={'groups': 'general'})
-    extractor.data_manager = TestDataManager
-    extractor.datasets_loader.data_manager = TestDataManager
-    meta_features = extractor.extract(dataset_names)
-    assert list(meta_features.index) == dataset_names
-    for dataset_name in dataset_names:
-        meta_features_cache_path = TestDataManager.get_meta_features_cache_path(
-            dataset_name, extractor.SOURCE)
+    meta_features = extractor.extract(dataset_ids)
+    assert set(meta_features.index) == dataset_ids
+    for dataset_id in dataset_ids:
+        meta_features_cache_path = get_meta_features_cache_path(PymfeExtractor, dataset_id)
         assert_cache_file_exists(meta_features_cache_path)
 
-        if dataset_name in DATASETS_WITH_CACHED_META_FEATURES:
-            assert_file_unmodified_during_test(meta_features_cache_path, test_start_time)
+        if dataset_id in DATASETS_WITH_CACHED_META_FEATURES:
+            assert_file_unmodified_during_test(meta_features_cache_path)
         else:
-            cache_path = TestDataManager.get_dataset_cache_path(dataset_name)
+            cache_path = get_dataset_cache_path_by_id(OpenMLDataset, dataset_id)
             assert_cache_file_exists(cache_path)

From 0b9ed49fcc200a8ecc2bc658ff285a7fb367f57c Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Mon, 3 Jul 2023 20:44:53 +0300
Subject: [PATCH 46/60] Auto-sklearn baseline in progress

---
 .../__init__.py                                 |  0
 experiments/auto-sklearn/experimental_data.csv  | 17 +++++++++++++++++
 .../openml_suite.py                             |  0
 3 files changed, 17 insertions(+)
 rename experiments/{auto-sklearn_run => auto-sklearn}/__init__.py (100%)
 create mode 100644 experiments/auto-sklearn/experimental_data.csv
 rename experiments/{auto-sklearn_run => auto-sklearn}/openml_suite.py (100%)

diff --git a/experiments/auto-sklearn_run/__init__.py b/experiments/auto-sklearn/__init__.py
similarity index 100%
rename from experiments/auto-sklearn_run/__init__.py
rename to experiments/auto-sklearn/__init__.py
diff --git a/experiments/auto-sklearn/experimental_data.csv b/experiments/auto-sklearn/experimental_data.csv
new file mode 100644
index 00000000..18dbec46
--- /dev/null
+++ b/experiments/auto-sklearn/experimental_data.csv
@@ -0,0 +1,17 @@
+dataset_id,dataset_name,run_label,roc_auc,f1,accuracy,logloss,precision,fit_time,inference_time,model_str
+1590,adult,Auto-sklearn,-0.798,-0.923,-0.878,4.391,-0.897,596.2,0.1,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1.7108930238344161e-10, learning_rate=0.010827728124541558, loss='auto', max_iter=512, max_leaf_nodes=25, min_samples_leaf=4, n_iter_no_change=19, random_state=1, validation_fraction=0.1759114608225653, warm_start=True)"
+1461,bank-marketing,Auto-sklearn,-0.695,-0.507,-0.9,3.587,-0.63,595.0,0.6,"RandomForestClassifier(max_features=4, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
+1464,blood-transfusion-service-center,Auto-sklearn,-0.627,-0.415,-0.793,7.449,-0.688,593.6,0.0,"MLPClassifier(alpha=6.875656304664039e-05, beta_1=0.999, beta_2=0.9, hidden_layer_sizes=(224,), learning_rate_init=0.00011403871479850849, max_iter=256, n_iter_no_change=32, random_state=1, validation_fraction=0.0, verbose=0, warm_start=True)"
+1489,phoneme,Auto-sklearn,-0.881,-0.82,-0.891,3.934,-0.787,593.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=0.00030344870480744136, learning_rate=0.18318625129457267, loss='auto', max_iter=512, max_leaf_nodes=366, min_samples_leaf=2, n_iter_no_change=11, random_state=1, validation_fraction=None, warm_start=True)"
+40975,car,Auto-sklearn,-0.999,-0.949,-0.98,0.114,-0.93,598.1,0.0,"RandomForestClassifier(bootstrap=False, max_features=2, min_samples_split=3, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
+40996,fashion-mnist,Auto-sklearn,-0.968,-0.866,-0.866,1.945,-0.866,596.9,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
+41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.974,-0.82,-0.866,0.275,-0.826,596.7,0.2,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=9.674948183980905e-09, learning_rate=0.014247987845444413, loss='auto', max_iter=512, max_leaf_nodes=55, min_samples_leaf=164, n_iter_no_change=1, random_state=1, validation_fraction=0.11770489601182355, warm_start=True)"
+54,vehicle,Auto-sklearn,-0.965,-0.847,-0.859,0.896,-0.848,594.9,0.0,LinearDiscriminantAnalysis(tol=0.06932929810851429)roc_auc,f1,accuracy,logloss,precision,dataset_id,dataset_name,run_label,fit_time,inference_time,model_str
+1590,adult,Hist gradient boosting classifier,-0.806,-0.924,-0.88,4.313,-0.903,0.4,0.0,
+1461,bank-marketing,Hist gradient boosting classifier,-0.718,-0.544,-0.904,3.452,-0.639,0.7,0.0,
+1464,blood-transfusion-service-center,Hist gradient boosting classifier,-0.565,-0.328,-0.7,10.813,-0.367,0.1,0.0,
+1489,phoneme,Hist gradient boosting classifier,-0.876,-0.82,-0.894,3.834,-0.806,0.2,0.0,
+40975,car,Hist gradient boosting classifier,-1.0,-0.94,-0.986,0.025,-0.93,0.7,0.0,
+40996,fashion-mnist,Hist gradient boosting classifier,-0.993,-0.904,-0.904,0.267,-0.903,113.4,0.9,
+41027,jungle_chess_2pcs_raw_endgame_complete,Hist gradient boosting classifier,-0.976,-0.825,-0.87,0.272,-0.833,1.4,0.1,
+54,vehicle,Hist gradient boosting classifier,-0.928,-0.763,-0.782,0.778,-0.768,1.2,0.0,
diff --git a/experiments/auto-sklearn_run/openml_suite.py b/experiments/auto-sklearn/openml_suite.py
similarity index 100%
rename from experiments/auto-sklearn_run/openml_suite.py
rename to experiments/auto-sklearn/openml_suite.py

From 42e343ba956f0a73c46cbd9689fa552a468837f3 Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Mon, 3 Jul 2023 20:46:59 +0300
Subject: [PATCH 47/60] WIP: auto-sklearn baseline

---
 experiments/__init__.py                  |   0
 experiments/auto-sklearn/openml_suite.py | 147 ++++++++++++++++-------
 experiments/fedot_warm_start/run.py      |  30 +++--
 3 files changed, 120 insertions(+), 57 deletions(-)
 create mode 100644 experiments/__init__.py

diff --git a/experiments/__init__.py b/experiments/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/experiments/auto-sklearn/openml_suite.py b/experiments/auto-sklearn/openml_suite.py
index 588d3b93..4a655653 100644
--- a/experiments/auto-sklearn/openml_suite.py
+++ b/experiments/auto-sklearn/openml_suite.py
@@ -1,57 +1,73 @@
+import csv
+import os
 import pickle
 import re
+import time
 
 import numpy as np
 import json
 
 import autosklearn.classification
+import autosklearn.ensembles
 from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing
 from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice
 from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
 from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice
 
-from experiments.fedot_warm_start.run import prepare_data
+# from experiments.fedot_warm_start.run import fetch_openml_data, mock_data_fetching
 from sklearn import model_selection, metrics
+from sklearn import ensemble
 from sklearn.base import ClassifierMixin
 
 
-class AutoSklearnEncoder(json.JSONEncoder):
-    def default(self, o):
-        if isinstance(o, ClassifierChoice):
-            return repr(o.choice.estimator)
-        # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)):
-        #     return None
-        if isinstance(o, ClassifierMixin):
-            return re.sub(r'\s{2,}', ' ', repr(o))
-        elif isinstance(o, Balancing):
-            return repr(o)
-        elif isinstance(o, np.integer):
-            return int(o)
-        elif isinstance(o, np.floating):
-            return float(o)
+# class AutoSklearnEncoder(json.JSONEncoder):
+#     def default(self, o):
+#         # if isinstance(o, dict):
+#         #     return json.dumps(o)
+#         if isinstance(o, ClassifierChoice):
+#             return repr(o.choice.estimator)
+#         # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)):
+#         #     return None
+#         elif isinstance(o, ClassifierMixin):
+#             return re.sub(r'\s{2,}', ' ', repr(o))
+#         # elif isinstance(o, Balancing):
+#         #     return repr(o)
+#         elif isinstance(o, np.integer):
+#             return int(o)
+#         elif isinstance(o, np.floating):
+#             return float(o)
 
 
-class AutoSklearnValidator:
+class AutoSklearnBaseline:
 
     def __init__(self):
         pass
 
     @staticmethod
     def main():
-        ds_with_ids, ds_names = prepare_data()
-        train_ds_names, test_ds_names = ds_names
+        openml_data = None
+        # dataset_names = [dataset.name for dataset in openml_data]
 
-        ds_ids, datasets = ds_with_ids
+        # train_data_names, test_data_names = model_selection.train_test_split(
+        #     [dataset.name for dataset in openml_data],
+        #     test_size=0.2,
+        #     random_state=42
+        # )
+        # train_ds_names, test_ds_names = ds_names
 
-        for ds_name in train_ds_names:
-        # if train_ds_names[0] is not None:
-            print("Sanity check")
-            dataset = datasets[ds_name].from_cache()
+        # ds_ids, datasets = ds_with_ids
 
-            # cannot wait longer because of the slow data fetching, issue#9
-            estimator = autosklearn.classification.AutoSklearnClassifier(
-                time_left_for_this_task=60
-            )
+        # for ds_name in train_ds_names:
+
+        for iteration, dataset in enumerate(openml_data):
+            print(f"Fetched data name: {dataset.name}")
+            dataset = dataset.from_cache()
+
+            # estimator = autosklearn.classification.AutoSklearnClassifier(
+            #     ensemble_class=autosklearn.ensembles.SingleBest,
+            #     time_left_for_this_task=600
+            # )
+            estimator = ensemble.HistGradientBoostingClassifier()
 
             X_train, X_test, y_train, y_test = model_selection.train_test_split(
                 dataset.x,
@@ -60,32 +76,75 @@ def main():
                 random_state=42
             )
 
+            fitting_start_time = time.time()
             pipeline = estimator.fit(X_train, y_train)
+            fitting_end_time = time.time() - fitting_start_time
+            # print(f"Fitting time is {fitting_end_time}sec")
 
+            inference_start_time = time.time()
             predictions = estimator.predict(X_test)
-
-            quality_estimation = metrics.roc_auc_score(y_test, predictions)
-
-            results = {
-                'ensemble': pipeline.show_models(),
-                'score': quality_estimation
+            inference_end_time = time.time() - inference_start_time
+
+            prediction_probabilities = estimator.predict_proba(X_test)
+
+            is_multi_classification_problem = True if len(set(predictions)) > 2 else False
+            # print(f"Inference time is {inference_end_time}sec")
+            # roc_auc_score = metrics.roc_auc_score(y_test, predictions)
+
+            # autosklearn_ensemble = pipeline.show_models()
+            # formatted_ensemble = {
+            #     model_id: {
+            #         'rank': autosklearn_ensemble[model_id].get('rank'),
+            #         'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"),
+            #         'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'),
+            #         'model': autosklearn_ensemble[model_id].get('sklearn_classifier')
+            #     } for model_id in autosklearn_ensemble.keys()
+            # }
+
+            # best_single_model = list(pipeline.show_models().values())[0].get('sklearn_classifier')
+            best_single_model = repr(pipeline)
+            # encoded_ensemble = str(formatted_ensemble).encode('base64')
+
+            # print(f"y_test is {predictions}")
+
+            general_run_info = {
+                # 'id': iteration + 1,
+                'dataset_id': dataset.id,
+                'dataset_name': dataset.name,
+                'run_label': 'Hist gradient boosting classifier'
+            }
+            average = 'macro' if is_multi_classification_problem else 'binary'
+            model_dependent_run_info = {
+                'roc_auc': -1 * float(f"{metrics.roc_auc_score(y_test, prediction_probabilities if is_multi_classification_problem else predictions, multi_class='ovr'):.3f}"),
+                'f1': -1 * float(f"{metrics.f1_score(y_test, predictions, average=average):.3f}"),
+                'accuracy': -1 * float(f"{metrics.accuracy_score(y_test, predictions):.3f}"),
+                'logloss': float(f"{metrics.log_loss(y_test, prediction_probabilities if is_multi_classification_problem else predictions):.3f}"),
+                'precision': -1 * float(f"{metrics.precision_score(y_test, predictions, average=average):.3f}"),
+                'fit_time': float(f'{fitting_end_time:.1f}'),
+                'inference_time': float(f'{inference_end_time:.1f}'),
+                # 'model_str': re.sub(r'\s{2,}', ' ', repr(best_single_model))
+                'model_str': None
             }
+            results = {**general_run_info, **model_dependent_run_info}
+
+            # for key in autosklearn_ensemble.keys():
+            #     ensemble_model = autosklearn_ensemble[key]
+            #     formatted_ensemble = results['ensemble']
+            #     for model_id in formatted_ensemble.keys():
+            #         formatted_ensemble[model_id] = ensemble_model.get("rank", None)
 
             # pickle.dump(pipeline.show_models(), open("results.pickle", "wb"))
 
             # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier")))
 
-            with open("results.json", "w") as file:
-                json.dump(
-                    results,
-                    file,
-                    cls=AutoSklearnEncoder,
-                    indent=2
-                )
-
-if __name__ == '__main__':
-    AutoSklearnValidator.main()
-
+            # knowledge_base_path = os.path.dirname('knowledge_base_0')
 
+            with open('experimental_data.csv', 'a', newline='') as file:
+                writer = csv.writer(file, delimiter=',')
+                # if iteration == 0:
+                #     writer.writerow(results.keys())
+                writer.writerow(results.values())
 
+if __name__ == '__main__':
+    AutoSklearnBaseline.main()
 
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 1df6a0b6..60cbdcb3 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,6 +1,7 @@
 import functools
 import json
 import logging
+import os
 import timeit
 from datetime import datetime
 from itertools import chain
@@ -57,7 +58,7 @@
 TIME_NOW = None
 TIME_NOW_FOR_PATH = None
 
-DEBUG = False
+DEBUG = True
 
 
 def setup_logging():
@@ -66,7 +67,9 @@ def setup_logging():
     global TIME_NOW_FOR_PATH
     TIME_NOW_FOR_PATH = time_now_for_path = time_now.replace(":", ".")
     global SAVE_DIR
-    SAVE_DIR = save_dir = Path(f'run_{time_now_for_path}')
+    SAVE_DIR = save_dir = Path(__file__).parent\
+        .resolve()\
+        .joinpath(f'run_{time_now_for_path}')
     save_dir.mkdir()
     log_file = save_dir.joinpath('log.txt')
     Log(log_file=log_file)
@@ -78,7 +81,7 @@ def setup_logging():
                         )
 
 
-def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
+def fetch_openml_data() -> List[DatasetCache]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
     dataset_ids = openml.study.get_suite(99).data
     if N_DATASETS is not None:
@@ -86,8 +89,11 @@ def fetch_openml_data() -> Tuple[List[int], Dict[str, DatasetCache]]:
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
         dataset_ids = list(dataset_ids)
 
-    datasets = {cache.name: cache for cache in OpenMLDatasetsLoader().load(dataset_ids)}
-    return dataset_ids, datasets
+    data = [cache for cache in OpenMLDatasetsLoader().load(dataset_ids)]
+    return data
+
+def mock_data_fetching() -> List[DatasetCache]:
+    return [cache for cache in OpenMLDatasetsLoader().load([1590, 1461, 1464, 1489, 40975, 40996, 41027, 54])]
 
 
 def transform_data_for_fedot(data: Dataset) -> (np.array, np.array):
@@ -176,28 +182,26 @@ def extract_best_history_models(dataset_cache, history):
     return best_models
 
 
-def prepare_data() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]:
-    dataset_ids, datasets = fetch_openml_data()
+def ds_train_test_split() -> Tuple[Tuple[List[int], Dict[str, DatasetCache]], Tuple[List[str], List[str]]]:
+    openml_data = fetch_openml_data()
 
     train_data_names, test_data_names = train_test_split(
-        list(datasets.keys()),
+        [dataset.name for dataset in openml_data],
         test_size=TEST_SIZE,
         random_state=SEED
     )
-    return (dataset_ids, datasets), (train_data_names, test_data_names)
+    return train_data_names, test_data_names
 
 
 def main():
     baseline_pipeline = PipelineBuilder().add_node('rf').build()
 
-    ds_with_ids, dataset_names = prepare_data()
+    ds_with_ids, dataset_names = ds_train_test_split()
 
     train_ds_names, test_ds_names = dataset_names
 
     ds_ids, datasets = ds_with_ids
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names)
-
     results = []
     best_models_per_dataset = {}
     progress_file = open(SAVE_DIR.joinpath('progress.txt'), 'a')
@@ -221,7 +225,7 @@ def main():
         except Exception:
             logging.exception(f'Train dataset "{name}"')
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(datasets_train)
+    data_similarity_assessor, extractor = prepare_extractor_and_assessor(train_ds_names)
     model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
                                                 minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
     model_advisor.fit(best_models_per_dataset)

From 26d57b848468276a5ff1ef625aa6f75244d99cf8 Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Wed, 5 Jul 2023 19:44:58 +0300
Subject: [PATCH 48/60] Implemented Auto-sklearn baseline.

---
 ...dvise_models_from_similar_datasets.py.orig |  47 ------
 experiments/auto-sklearn/baseline.py          | 159 ++++++++++++++++++
 .../auto-sklearn/experimental_data.csv        |  74 ++++++--
 experiments/auto-sklearn/openml_suite.py      | 150 -----------------
 4 files changed, 216 insertions(+), 214 deletions(-)
 delete mode 100644 examples/4_advising_models/advise_models_from_similar_datasets.py.orig
 create mode 100644 experiments/auto-sklearn/baseline.py
 delete mode 100644 experiments/auto-sklearn/openml_suite.py

diff --git a/examples/4_advising_models/advise_models_from_similar_datasets.py.orig b/examples/4_advising_models/advise_models_from_similar_datasets.py.orig
deleted file mode 100644
index c8f50581..00000000
--- a/examples/4_advising_models/advise_models_from_similar_datasets.py.orig
+++ /dev/null
@@ -1,47 +0,0 @@
-from fedot.core.pipelines.pipeline_builder import PipelineBuilder
-from golem.core.optimisers.fitness import SingleObjFitness
-from sklearn.model_selection import train_test_split
-
-from meta_automl.data_preparation.dataset import DatasetCache
-from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
-from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
-from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
-
-
-def main():
-    # Define datasets.
-    dataset_names = ['monks-problems-1', 'apsfailure', 'australian', 'bank-marketing']
-    # Extract meta-features and load on demand.
-    extractor = PymfeExtractor(extractor_params={'groups': 'general'}, datasets_loader=OpenMLDatasetsLoader())
-    meta_features = extractor.extract(dataset_names)
-    # Preprocess meta-features, as KNN does not support NaNs.
-    meta_features = meta_features.dropna(axis=1, how='any')
-    # Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
-    x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
-    y_train = x_train.index
-<<<<<<< HEAD
-    assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2)
-||||||| 9541bf7
-    assessor = KNNSimilarityAssessor({'n_neighbors': 2}, n_best=2)
-=======
-    assessor = KNNSimilarityAssessor({'n_neighbors': 3}, n_best=2)
->>>>>>> e140a34de32bf20396693e888560bcc51fb5539e
-    assessor.fit(x_train, y_train)
-    # Define best models for datasets.
-    best_pipelines = [
-        PipelineBuilder().add_node('scaling').add_node('rf').build(),
-        PipelineBuilder().add_node('normalization').add_node('logit').build(),
-        PipelineBuilder().add_node('rf').add_node('logit').build()
-    ]
-    best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', DatasetCache(dataset_name))]
-                   for dataset_name, pipeline in zip(y_train, best_pipelines)]
-
-    dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
-    advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
-    return advisor.predict(x_test)
-
-
-if __name__ == '__main__':
-    result = main()
diff --git a/experiments/auto-sklearn/baseline.py b/experiments/auto-sklearn/baseline.py
new file mode 100644
index 00000000..da1c21a4
--- /dev/null
+++ b/experiments/auto-sklearn/baseline.py
@@ -0,0 +1,159 @@
+import csv
+import time
+
+from typing import Any, Tuple, Dict
+
+import numpy as np
+import logging
+
+import autosklearn.classification
+import autosklearn.ensembles
+
+from sklearn import model_selection, metrics
+
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.models_loaders import KnowledgeBaseModelsLoader
+from autosklearn.classification import AutoSklearnClassifier
+
+
+
+class AutoSklearnBaseline:
+    def __init__(self, estimator_ensemble_type, time_limit):
+        self.estimator = AutoSklearnClassifier(
+            ensemble_class=estimator_ensemble_type,
+            time_left_for_this_task=time_limit,
+        )
+        self.knowledge_base_loader = KnowledgeBaseModelsLoader()
+
+    def make_quality_metric_estimates(self, y, predictions, prediction_proba, is_multi_label):
+        """ Compute roc_auc, f1, accuracy, log_loss and precision scores. """
+        results = {
+            'roc_auc': -1 * float(
+                "{:.3f}".format(
+                    metrics.roc_auc_score(
+                        y,
+                        prediction_proba if is_multi_label else predictions,
+                        multi_class='ovr'
+                    )
+                )
+            ),
+            'f1': -1 * float(
+                "{:.3f}".format(
+                    metrics.f1_score(
+                        y,
+                        predictions,
+                        average='macro' if is_multi_label else 'binary'
+                    )
+                )
+            ),
+            'accuracy': -1 * float(
+                "{:.3f}".format(
+                    metrics.accuracy_score(
+                        y,
+                        predictions
+                    )
+                )
+            ),
+            'logloss': float(
+                "{:.3f}".format(
+                    metrics.log_loss(
+                        y,
+                        prediction_proba if is_multi_label else predictions
+                    )
+                )
+            ),
+            'precision': -1 * float(
+                "{:.3f}".format(
+                    metrics.precision_score(
+                        y,
+                        predictions,
+                        average='macro' if is_multi_label else 'binary',
+                        labels=np.unique(predictions)
+                    )
+                )
+            )
+        }
+        return results
+
+    def run(self):
+        """ Fit auto-sklearn meta-optimizer to knowledge base datasets and output a single best model. """
+        dataset_ids_to_load = [
+            dataset_id for dataset_id in self.knowledge_base_loader
+                                             .parse_datasets('test')
+                                             .loc[:, 'dataset_id']
+        ]
+        dataset_ids_to_load = [dataset_ids_to_load[dataset_ids_to_load.index(41166)]]
+
+        loaded_datasets = OpenMLDatasetsLoader().load(dataset_ids_to_load)
+
+        for iteration, dataset in enumerate(loaded_datasets):
+            logging.log(logging.INFO, f"Loaded dataset name: {dataset.name}")
+            dataset = dataset.from_cache()
+
+            X_train, X_test, y_train, y_test = model_selection.train_test_split(
+                dataset.x,
+                dataset.y,
+                test_size=0.2,
+                random_state=42,
+                stratify=dataset.y
+            )
+
+            fitting_start_time = time.time()
+            ensemble = self.estimator.fit(X_train, y_train)
+            fitting_time = time.time() - fitting_start_time
+            logging.log(logging.INFO, f"Fitting time is {fitting_time}sec")
+
+            inference_start_time = time.time()
+            predicted_results = self.estimator.predict(X_test)
+            inference_time = time.time() - inference_start_time
+            logging.log(logging.INFO, f"Inference time is {inference_time}sec")
+
+            predicted_probabilities = self.estimator.predict_proba(X_test)
+
+            best_single_model = list(ensemble.show_models().values())[0].get('sklearn_classifier')
+
+            # autosklearn_ensemble = pipeline.show_models()
+            # formatted_ensemble = {
+            #     model_id: {
+            #         'rank': autosklearn_ensemble[model_id].get('rank'),
+            #         'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"),
+            #         'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'),
+            #         'model': autosklearn_ensemble[model_id].get('sklearn_classifier')
+            #     } for model_id in autosklearn_ensemble.keys()
+            # }
+
+            general_run_info = {
+                'dataset_id': dataset.id,
+                'dataset_name': dataset.name,
+                'run_label': 'Auto-sklearn',
+            }
+
+            is_multilabel_classification = True if len(set(predicted_results)) > 2 else False
+            quality_metric_estimates = self.make_quality_metric_estimates(
+                y_test,
+                predicted_results,
+                predicted_probabilities,
+                is_multilabel_classification
+            )
+
+            model_dependent_run_info = {
+                'fit_time': float(f'{fitting_time:.1f}'),
+                'inference_time': float(f'{inference_time:.1f}'),
+                'model_str': repr(best_single_model)
+            }
+
+            results = {**general_run_info, **quality_metric_estimates, **model_dependent_run_info}
+
+            # for key in autosklearn_ensemble.keys():
+            #     ensemble_model = autosklearn_ensemble[key]
+            #     formatted_ensemble = results['ensemble']
+            #     for model_id in formatted_ensemble.keys():
+            #         formatted_ensemble[model_id] = ensemble_model.get("rank", None)
+
+            with open('experimental_data.csv', 'a', newline='') as file:
+                writer = csv.writer(file, delimiter=',')
+                writer.writerow(results.values())
+
+
+if __name__ == '__main__':
+    AutoSklearnBaseline(autosklearn.ensembles.SingleBest, 600).run()
diff --git a/experiments/auto-sklearn/experimental_data.csv b/experiments/auto-sklearn/experimental_data.csv
index 18dbec46..7a3f3cfa 100644
--- a/experiments/auto-sklearn/experimental_data.csv
+++ b/experiments/auto-sklearn/experimental_data.csv
@@ -1,17 +1,57 @@
-dataset_id,dataset_name,run_label,roc_auc,f1,accuracy,logloss,precision,fit_time,inference_time,model_str
-1590,adult,Auto-sklearn,-0.798,-0.923,-0.878,4.391,-0.897,596.2,0.1,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1.7108930238344161e-10, learning_rate=0.010827728124541558, loss='auto', max_iter=512, max_leaf_nodes=25, min_samples_leaf=4, n_iter_no_change=19, random_state=1, validation_fraction=0.1759114608225653, warm_start=True)"
-1461,bank-marketing,Auto-sklearn,-0.695,-0.507,-0.9,3.587,-0.63,595.0,0.6,"RandomForestClassifier(max_features=4, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
-1464,blood-transfusion-service-center,Auto-sklearn,-0.627,-0.415,-0.793,7.449,-0.688,593.6,0.0,"MLPClassifier(alpha=6.875656304664039e-05, beta_1=0.999, beta_2=0.9, hidden_layer_sizes=(224,), learning_rate_init=0.00011403871479850849, max_iter=256, n_iter_no_change=32, random_state=1, validation_fraction=0.0, verbose=0, warm_start=True)"
-1489,phoneme,Auto-sklearn,-0.881,-0.82,-0.891,3.934,-0.787,593.8,0.0,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=0.00030344870480744136, learning_rate=0.18318625129457267, loss='auto', max_iter=512, max_leaf_nodes=366, min_samples_leaf=2, n_iter_no_change=11, random_state=1, validation_fraction=None, warm_start=True)"
-40975,car,Auto-sklearn,-0.999,-0.949,-0.98,0.114,-0.93,598.1,0.0,"RandomForestClassifier(bootstrap=False, max_features=2, min_samples_split=3, n_estimators=512, n_jobs=1, random_state=1, warm_start=True)"
-40996,fashion-mnist,Auto-sklearn,-0.968,-0.866,-0.866,1.945,-0.866,596.9,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
-41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.974,-0.82,-0.866,0.275,-0.826,596.7,0.2,"HistGradientBoostingClassifier(early_stopping=True, l2_regularization=9.674948183980905e-09, learning_rate=0.014247987845444413, loss='auto', max_iter=512, max_leaf_nodes=55, min_samples_leaf=164, n_iter_no_change=1, random_state=1, validation_fraction=0.11770489601182355, warm_start=True)"
-54,vehicle,Auto-sklearn,-0.965,-0.847,-0.859,0.896,-0.848,594.9,0.0,LinearDiscriminantAnalysis(tol=0.06932929810851429)roc_auc,f1,accuracy,logloss,precision,dataset_id,dataset_name,run_label,fit_time,inference_time,model_str
-1590,adult,Hist gradient boosting classifier,-0.806,-0.924,-0.88,4.313,-0.903,0.4,0.0,
-1461,bank-marketing,Hist gradient boosting classifier,-0.718,-0.544,-0.904,3.452,-0.639,0.7,0.0,
-1464,blood-transfusion-service-center,Hist gradient boosting classifier,-0.565,-0.328,-0.7,10.813,-0.367,0.1,0.0,
-1489,phoneme,Hist gradient boosting classifier,-0.876,-0.82,-0.894,3.834,-0.806,0.2,0.0,
-40975,car,Hist gradient boosting classifier,-1.0,-0.94,-0.986,0.025,-0.93,0.7,0.0,
-40996,fashion-mnist,Hist gradient boosting classifier,-0.993,-0.904,-0.904,0.267,-0.903,113.4,0.9,
-41027,jungle_chess_2pcs_raw_endgame_complete,Hist gradient boosting classifier,-0.976,-0.825,-0.87,0.272,-0.833,1.4,0.1,
-54,vehicle,Hist gradient boosting classifier,-0.928,-0.763,-0.782,0.778,-0.768,1.2,0.0,
+1461,bank-marketing,Auto-sklearn,-0.711,-0.535,-0.907,3.34,-0.648,598.0,0.1,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.7108930238344161e-10,
+                               learning_rate=0.010827728124541558, loss='auto',
+                               max_iter=512, max_leaf_nodes=25,
+                               min_samples_leaf=4, n_iter_no_change=19,
+                               random_state=1,
+                               validation_fraction=0.1759114608225653,
+                               warm_start=True)"
+179,adult,Auto-sklearn,-0.774,-0.91,-0.859,5.077,-0.885,595.3,0.1,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.7108930238344161e-10,
+                               learning_rate=0.010827728124541558, loss='auto',
+                               max_iter=512, max_leaf_nodes=25,
+                               min_samples_leaf=4, n_iter_no_change=19,
+                               random_state=1,
+                               validation_fraction=0.1759114608225653,
+                               warm_start=True)"
+1464,blood-transfusion-service-center,Auto-sklearn,-0.669,-0.5,-0.8,7.209,-0.625,597.6,0.0,"PassiveAggressiveClassifier(C=0.253246830865058, average=True, max_iter=16,
+                            random_state=1, tol=0.01676578241454229,
+                            warm_start=True)"
+991,car,Auto-sklearn,-1.0,-1.0,-1.0,0.0,-1.0,596.8,0.0,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.9280388598217333e-10,
+                               learning_rate=0.24233932723531437, loss='auto',
+                               max_iter=128, max_leaf_nodes=35,
+                               min_samples_leaf=17, n_iter_no_change=1,
+                               random_state=1, validation_fraction=None,
+                               warm_start=True)"
+1489,phoneme,Auto-sklearn,-0.848,-0.797,-0.887,4.068,-0.845,600.4,0.1,"AdaBoostClassifier(algorithm='SAMME',
+                   base_estimator=DecisionTreeClassifier(max_depth=10),
+                   learning_rate=1.1377640450285444, n_estimators=352,
+                   random_state=1)"
+41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.975,-0.816,-0.865,0.271,-0.824,595.1,0.2,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=9.674948183980905e-09,
+                               learning_rate=0.014247987845444413, loss='auto',
+                               max_iter=512, max_leaf_nodes=55,
+                               min_samples_leaf=164, n_iter_no_change=1,
+                               random_state=1,
+                               validation_fraction=0.11770489601182355,
+                               warm_start=True)"
+41166,volkert,Auto-sklearn,-0.874,-0.586,-0.644,1.829,-0.587,595.8,0.3,"LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr',
+                           tol=0.018821286956948503)"
+54,vehicle,Auto-sklearn,-0.964,-0.86,-0.859,0.408,-0.861,595.5,0.0,"MLPClassifier(activation='tanh', alpha=0.0002060405669905105, beta_1=0.999,
+              beta_2=0.9, hidden_layer_sizes=(87, 87, 87),
+              learning_rate_init=0.00040205833939989724, max_iter=256,
+              n_iter_no_change=32, random_state=1, validation_fraction=0.0,
+              verbose=0, warm_start=True)"
+40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,296.1,1.2,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
+40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,595.5,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
+42344,sf-police-incidents,Auto-sklearn,-0.574,-0.589,-0.574,15.367,-0.569,594.8,0.5,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=3.609412172481434e-10,
+                               learning_rate=0.05972079854295879, loss='auto',
+                               max_iter=512, max_leaf_nodes=4,
+                               min_samples_leaf=2, n_iter_no_change=14,
+                               random_state=1, validation_fraction=None,
+                               warm_start=True)"
+1240,airlinescodrnaadult,Auto-sklearn,-0.62,-0.683,-0.631,13.306,-0.658,594.3,0.1,"SGDClassifier(alpha=1.6992296128865824e-07, average=True, eta0=0.01, loss='log',
+              max_iter=512, penalty='l1', random_state=1,
+              tol=1.535384699341134e-05, warm_start=True)"
\ No newline at end of file
diff --git a/experiments/auto-sklearn/openml_suite.py b/experiments/auto-sklearn/openml_suite.py
deleted file mode 100644
index 4a655653..00000000
--- a/experiments/auto-sklearn/openml_suite.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import csv
-import os
-import pickle
-import re
-import time
-
-import numpy as np
-import json
-
-import autosklearn.classification
-import autosklearn.ensembles
-from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import Balancing
-from autosklearn.pipeline.components.data_preprocessing import DataPreprocessorChoice
-from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
-from autosklearn.pipeline.components.classification import AutoSklearnClassificationAlgorithm, ClassifierChoice
-
-# from experiments.fedot_warm_start.run import fetch_openml_data, mock_data_fetching
-from sklearn import model_selection, metrics
-from sklearn import ensemble
-from sklearn.base import ClassifierMixin
-
-
-# class AutoSklearnEncoder(json.JSONEncoder):
-#     def default(self, o):
-#         # if isinstance(o, dict):
-#         #     return json.dumps(o)
-#         if isinstance(o, ClassifierChoice):
-#             return repr(o.choice.estimator)
-#         # if isinstance(o, (DataPreprocessorChoice, FeaturePreprocessorChoice)):
-#         #     return None
-#         elif isinstance(o, ClassifierMixin):
-#             return re.sub(r'\s{2,}', ' ', repr(o))
-#         # elif isinstance(o, Balancing):
-#         #     return repr(o)
-#         elif isinstance(o, np.integer):
-#             return int(o)
-#         elif isinstance(o, np.floating):
-#             return float(o)
-
-
-class AutoSklearnBaseline:
-
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def main():
-        openml_data = None
-        # dataset_names = [dataset.name for dataset in openml_data]
-
-        # train_data_names, test_data_names = model_selection.train_test_split(
-        #     [dataset.name for dataset in openml_data],
-        #     test_size=0.2,
-        #     random_state=42
-        # )
-        # train_ds_names, test_ds_names = ds_names
-
-        # ds_ids, datasets = ds_with_ids
-
-        # for ds_name in train_ds_names:
-
-        for iteration, dataset in enumerate(openml_data):
-            print(f"Fetched data name: {dataset.name}")
-            dataset = dataset.from_cache()
-
-            # estimator = autosklearn.classification.AutoSklearnClassifier(
-            #     ensemble_class=autosklearn.ensembles.SingleBest,
-            #     time_left_for_this_task=600
-            # )
-            estimator = ensemble.HistGradientBoostingClassifier()
-
-            X_train, X_test, y_train, y_test = model_selection.train_test_split(
-                dataset.x,
-                dataset.y,
-                test_size=0.2,
-                random_state=42
-            )
-
-            fitting_start_time = time.time()
-            pipeline = estimator.fit(X_train, y_train)
-            fitting_end_time = time.time() - fitting_start_time
-            # print(f"Fitting time is {fitting_end_time}sec")
-
-            inference_start_time = time.time()
-            predictions = estimator.predict(X_test)
-            inference_end_time = time.time() - inference_start_time
-
-            prediction_probabilities = estimator.predict_proba(X_test)
-
-            is_multi_classification_problem = True if len(set(predictions)) > 2 else False
-            # print(f"Inference time is {inference_end_time}sec")
-            # roc_auc_score = metrics.roc_auc_score(y_test, predictions)
-
-            # autosklearn_ensemble = pipeline.show_models()
-            # formatted_ensemble = {
-            #     model_id: {
-            #         'rank': autosklearn_ensemble[model_id].get('rank'),
-            #         'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"),
-            #         'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'),
-            #         'model': autosklearn_ensemble[model_id].get('sklearn_classifier')
-            #     } for model_id in autosklearn_ensemble.keys()
-            # }
-
-            # best_single_model = list(pipeline.show_models().values())[0].get('sklearn_classifier')
-            best_single_model = repr(pipeline)
-            # encoded_ensemble = str(formatted_ensemble).encode('base64')
-
-            # print(f"y_test is {predictions}")
-
-            general_run_info = {
-                # 'id': iteration + 1,
-                'dataset_id': dataset.id,
-                'dataset_name': dataset.name,
-                'run_label': 'Hist gradient boosting classifier'
-            }
-            average = 'macro' if is_multi_classification_problem else 'binary'
-            model_dependent_run_info = {
-                'roc_auc': -1 * float(f"{metrics.roc_auc_score(y_test, prediction_probabilities if is_multi_classification_problem else predictions, multi_class='ovr'):.3f}"),
-                'f1': -1 * float(f"{metrics.f1_score(y_test, predictions, average=average):.3f}"),
-                'accuracy': -1 * float(f"{metrics.accuracy_score(y_test, predictions):.3f}"),
-                'logloss': float(f"{metrics.log_loss(y_test, prediction_probabilities if is_multi_classification_problem else predictions):.3f}"),
-                'precision': -1 * float(f"{metrics.precision_score(y_test, predictions, average=average):.3f}"),
-                'fit_time': float(f'{fitting_end_time:.1f}'),
-                'inference_time': float(f'{inference_end_time:.1f}'),
-                # 'model_str': re.sub(r'\s{2,}', ' ', repr(best_single_model))
-                'model_str': None
-            }
-            results = {**general_run_info, **model_dependent_run_info}
-
-            # for key in autosklearn_ensemble.keys():
-            #     ensemble_model = autosklearn_ensemble[key]
-            #     formatted_ensemble = results['ensemble']
-            #     for model_id in formatted_ensemble.keys():
-            #         formatted_ensemble[model_id] = ensemble_model.get("rank", None)
-
-            # pickle.dump(pipeline.show_models(), open("results.pickle", "wb"))
-
-            # print(type(pipeline.show_models().get(list(pipeline.show_models().keys())[0]).get("classifier")))
-
-            # knowledge_base_path = os.path.dirname('knowledge_base_0')
-
-            with open('experimental_data.csv', 'a', newline='') as file:
-                writer = csv.writer(file, delimiter=',')
-                # if iteration == 0:
-                #     writer.writerow(results.keys())
-                writer.writerow(results.values())
-
-if __name__ == '__main__':
-    AutoSklearnBaseline.main()
-

From 5c106587dc56555272654e9561c2906b3bc9de88 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 6 Jul 2023 18:07:27 +0300
Subject: [PATCH 49/60] fix inner components

---
 .../datasets_loaders/openml_datasets_loader.py       | 12 ++++++------
 .../model_advisors/diverse_fedot_pipeline_advisor.py |  4 ++--
 .../meta_algorithm/model_advisors/model_advisor.py   | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
index 11294c45..f7fbfb80 100644
--- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import List, Union, Optional
+from typing import List, Union, Optional, Sequence
 
 from golem.core.log import default_log
 
@@ -10,18 +10,17 @@
 
 class OpenMLDatasetsLoader(DatasetsLoader):
     def __init__(self, allow_names: bool = False):
-        self.dataset_ids = []
+        self.dataset_ids = set()
         self._allow_names = allow_names
 
-    def load(self, dataset_ids: List[Union[OpenMLDatasetIDType, str]],
+    def load(self, dataset_ids: Sequence[Union[OpenMLDatasetIDType, str]],
              allow_names: Optional[bool] = None) -> List[OpenMLDataset]:
-        self.dataset_ids += dataset_ids
         allow_names = self._allow_names if allow_names is None else allow_names
 
         datasets = []
         # TODO: Optimize like this
         #  https://github.com/openml/automlbenchmark/commit/a09dc8aee96178dd14837d9e1cd519d1ec63f804
-        for dataset_id in self.dataset_ids:
+        for dataset_id in dataset_ids:
             dataset = self.load_single(dataset_id, allow_name=allow_names)
             datasets.append(dataset)
         return datasets
@@ -35,7 +34,8 @@ def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str],
         else:
             dataset = OpenMLDataset(dataset_id)
 
-        self.dataset_ids.append(dataset.id_)
+        self.dataset_ids.add(dataset.id_)
+
         return dataset
 
     @property
diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
index 6f7e4a66..21879365 100644
--- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
@@ -12,8 +12,8 @@ class DiverseFEDOTPipelineAdvisor(SimpleSimilarityModelAdvisor):
     def __init__(self,
                  fitted_similarity_assessor: DatasetsSimilarityAssessor,
                  n_best_to_advise: Optional[int] = None,
-                 minimal_distance: int = 1,
-                 distance_func: Callable[[Pipeline, Pipeline], int] = get_distance_between):
+                 minimal_distance: float = 1,
+                 distance_func: Callable[[Pipeline, Pipeline], float] = get_distance_between):
         super().__init__(fitted_similarity_assessor)
         self.minimal_distance = minimal_distance
         self.n_best_to_advise = n_best_to_advise
diff --git a/meta_automl/meta_algorithm/model_advisors/model_advisor.py b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
index c653a173..163dbe2f 100644
--- a/meta_automl/meta_algorithm/model_advisors/model_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/model_advisor.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Dict, Iterable
+from typing import List, Dict, Iterable, Sequence
 
 import pandas as pd
 
@@ -18,13 +18,13 @@ def predict(self, *args, **kwargs) -> List[List[Model]]:
 class SimpleSimilarityModelAdvisor(ModelAdvisor):
     def __init__(self, fitted_similarity_assessor: DatasetsSimilarityAssessor):
         self.similarity_assessor = fitted_similarity_assessor
-        self.best_models: Dict[DatasetIDType, List[Model]] = {}
+        self.best_models: Dict[DatasetIDType, Sequence[Model]] = {}
 
     @property
     def datasets(self):
         return self.similarity_assessor.datasets
 
-    def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, List[Model]]):
+    def fit(self, dataset_names_to_best_pipelines: Dict[DatasetIDType, Sequence[Model]]):
         self.best_models.update(dataset_names_to_best_pipelines)
         return self
 
@@ -35,8 +35,8 @@ def predict(self, meta_features: pd.DataFrame) -> List[List[Model]]:
             advised_pipelines.append(self._predict_single(similar_datasets))
         return advised_pipelines
 
-    def _predict_single(self, similar_dataset_names: Iterable[str]) -> List[Model]:
+    def _predict_single(self, similar_dataset_ids: Iterable[DatasetIDType]) -> List[Model]:
         dataset_pipelines = []
-        for dataset_name in similar_dataset_names:
-            dataset_pipelines += self.best_models.get(dataset_name)
+        for dataset_id in similar_dataset_ids:
+            dataset_pipelines += list(self.best_models.get(dataset_id))
         return dataset_pipelines

From e2c1b890c3de9c6425500f5ec48c53399c9a86ef Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 6 Jul 2023 18:10:17 +0300
Subject: [PATCH 50/60] separate framework cache from other data

---
 .dockerignore                                 |   2 +-
 .gitignore                                    |   2 +-
 .../data_preparation/file_system/__init__.py  |   2 +-
 .../data_preparation/file_system/cache.py     |  12 +++--
 .../knowledge_base_models_loader.py           |   2 +-
 .../{ => cache}/metafeatures/pymfe/334.pkl    | Bin
 .../{ => cache}/metafeatures/pymfe/40981.pkl  | Bin
 .../org/openml/www/datasets/333/dataset.arff  |   0
 .../www/datasets/333/dataset_333.pkl.py3      | Bin
 .../openml/www/datasets/333/dataset_333.pq    | Bin
 .../openml/www/datasets/333/description.xml   |   0
 .../org/openml/www/datasets/333/features.xml  |   0
 .../openml/www/datasets/333/features.xml.pkl  | Bin
 .../openml/www/datasets/40981/dataset.arff    |   0
 .../www/datasets/40981/dataset_40981.pkl.py3  | Bin
 .../www/datasets/40981/dataset_40981.pq       | Bin
 .../openml/www/datasets/40981/description.xml |   0
 .../openml/www/datasets/40981/features.xml    |   0
 .../www/datasets/40981/features.xml.pkl       | Bin
 test/unit/datasets/test_datasets_loaders.py   |   2 +-
 test/unit/datasets/test_file_dataset.py       |  48 ------------------
 21 files changed, 13 insertions(+), 57 deletions(-)
 rename test/data/{ => cache}/metafeatures/pymfe/334.pkl (100%)
 rename test/data/{ => cache}/metafeatures/pymfe/40981.pkl (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset.arff (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/dataset_333.pq (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/description.xml (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/features.xml (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/333/features.xml.pkl (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/dataset.arff (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/description.xml (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/features.xml (100%)
 rename test/data/{ => cache}/openml_cache/org/openml/www/datasets/40981/features.xml.pkl (100%)
 delete mode 100644 test/unit/datasets/test_file_dataset.py

diff --git a/.dockerignore b/.dockerignore
index 2bfa6863..66731471 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -10,4 +10,4 @@ notebooks
 test
 
 # User data
-data
+data/cache
diff --git a/.gitignore b/.gitignore
index a5f9134a..bf5dbd4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,4 @@ dmypy.json
 .pyre/
 
 # User data
-/data
+/data/cache
diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py
index a228da6e..c9f8393a 100644
--- a/meta_automl/data_preparation/file_system/__init__.py
+++ b/meta_automl/data_preparation/file_system/__init__.py
@@ -1,5 +1,5 @@
 from meta_automl.data_preparation.file_system.file_system import PathType, get_project_root, get_data_dir
-from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_dataset_cache_path,
+from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_cache_dir, get_dataset_cache_path,
                                                             get_dataset_cache_path_by_id, get_meta_features_cache_path,
                                                             get_local_meta_features, update_local_meta_features,
                                                             get_openml_cache_dir, update_openml_cache_dir)
diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py
index 99daf965..04a904b7 100644
--- a/meta_automl/data_preparation/file_system/cache.py
+++ b/meta_automl/data_preparation/file_system/cache.py
@@ -19,12 +19,16 @@ class CacheOperator:
     pass
 
 
+def get_cache_dir() -> Path:
+    return ensure_dir_exists(get_data_dir().joinpath('cache'))
+
+
 def get_openml_cache_dir() -> Path:
-    return get_data_dir().joinpath('openml_cache')
+    return get_cache_dir().joinpath('openml_cache')
 
 
 def get_full_openml_cache_dir() -> Path:
-    return get_data_dir().joinpath('openml_cache/org/openml/www')
+    return get_cache_dir().joinpath('openml_cache/org/openml/www')
 
 
 def update_openml_cache_dir():
@@ -82,11 +86,11 @@ def get_cache_properties(class_name: str) -> CacheProperties:
             template='{id_}'),
         'CustomDataset': CacheProperties(
             type_=CacheType.file,
-            dir_=get_data_dir().joinpath('datasets/custom_dataset'),
+            dir_=get_cache_dir().joinpath('datasets/custom_dataset'),
             template='{id_}.pkl'),
         'PymfeExtractor': CacheProperties(
             type_=CacheType.file,
-            dir_=get_data_dir().joinpath('metafeatures/pymfe'),
+            dir_=get_cache_dir().joinpath('metafeatures/pymfe'),
             template='{id_}.pkl'),
     }
     try:
diff --git a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
index 7c38b9d8..df8a0f70 100644
--- a/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
+++ b/meta_automl/data_preparation/models_loaders/knowledge_base_models_loader.py
@@ -13,7 +13,7 @@
 from meta_automl.data_preparation.model import Model
 from meta_automl.data_preparation.models_loaders import ModelsLoader
 
-DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir().joinpath('knowledge_base_0')
+DEFAULT_KNOWLEDGE_BASE_PATH = get_data_dir() / 'knowledge_base_0'
 
 
 class KnowledgeBaseModelsLoader(ModelsLoader):
diff --git a/test/data/metafeatures/pymfe/334.pkl b/test/data/cache/metafeatures/pymfe/334.pkl
similarity index 100%
rename from test/data/metafeatures/pymfe/334.pkl
rename to test/data/cache/metafeatures/pymfe/334.pkl
diff --git a/test/data/metafeatures/pymfe/40981.pkl b/test/data/cache/metafeatures/pymfe/40981.pkl
similarity index 100%
rename from test/data/metafeatures/pymfe/40981.pkl
rename to test/data/cache/metafeatures/pymfe/40981.pkl
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset.arff b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset.arff
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/dataset.arff
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset.arff
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3 b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pkl.py3
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq b/test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pq
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/dataset_333.pq
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/dataset_333.pq
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/description.xml b/test/data/cache/openml_cache/org/openml/www/datasets/333/description.xml
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/description.xml
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/description.xml
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml b/test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/features.xml
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml
diff --git a/test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl b/test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml.pkl
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/333/features.xml.pkl
rename to test/data/cache/openml_cache/org/openml/www/datasets/333/features.xml.pkl
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset.arff
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset.arff
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset.arff
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3 b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pkl.py3
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq b/test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/dataset_40981.pq
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/description.xml b/test/data/cache/openml_cache/org/openml/www/datasets/40981/description.xml
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/description.xml
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/description.xml
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml b/test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/features.xml
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml
diff --git a/test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl b/test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml.pkl
similarity index 100%
rename from test/data/openml_cache/org/openml/www/datasets/40981/features.xml.pkl
rename to test/data/cache/openml_cache/org/openml/www/datasets/40981/features.xml.pkl
diff --git a/test/unit/datasets/test_datasets_loaders.py b/test/unit/datasets/test_datasets_loaders.py
index 0fd1ce17..f49e1989 100644
--- a/test/unit/datasets/test_datasets_loaders.py
+++ b/test/unit/datasets/test_datasets_loaders.py
@@ -5,7 +5,7 @@
 def test_group_load_new_datasets(openml_dataset_ids):
     loader = OpenMLDatasetsLoader()
     datasets = loader.load(openml_dataset_ids)
-    assert loader.dataset_ids == openml_dataset_ids
+    assert loader.dataset_ids == set(openml_dataset_ids)
     for dataset_id, dataset in zip(openml_dataset_ids, datasets):
         check_dataset_cache(dataset)
 
diff --git a/test/unit/datasets/test_file_dataset.py b/test/unit/datasets/test_file_dataset.py
deleted file mode 100644
index 125cb641..00000000
--- a/test/unit/datasets/test_file_dataset.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import numpy as np
-import pytest
-
-from meta_automl.data_preparation.dataset import CacheNotFoundError, FileDataset, DatasetData
-from test.unit.datasets.general_checks import assert_cache_file_exists
-
-
-@pytest.fixture(scope='module')
-def new_dataset_data():
-    dataset_data = DatasetData(
-        x=np.array([['a', 'b'], ['b', 'a']]),
-        y=np.array([5, 10]),
-        categorical_indicator=[True, True],
-        attribute_names=['foo', 'bar']
-    )
-    return dataset_data
-
-
-@pytest.fixture(scope='module')
-def new_dataset(new_dataset_data):
-    dataset = FileDataset(42)
-    dataset.dump_data(new_dataset_data)
-    yield dataset
-    dataset.cache_path.unlink()
-
-
-def test_error_on_missing_dataset_cache():
-    with pytest.raises(CacheNotFoundError):
-        FileDataset('random_missing_dataset').get_data()
-
-
-def test_file_dataset_dumping(new_dataset):
-    # Act
-    cache_path = new_dataset.cache_path
-    # Assert
-    assert_cache_file_exists(cache_path)
-
-
-def test_file_dataset_data_loading(new_dataset_data, new_dataset):
-    # Act
-    correct_data = new_dataset_data
-    dataset = new_dataset
-    data = dataset.get_data()
-    # Assert
-    assert np.all(np.equal(data.x, correct_data.x))
-    assert np.all(np.equal(data.y, correct_data.y))
-    assert data.categorical_indicator == correct_data.categorical_indicator
-    assert data.attribute_names == correct_data.attribute_names

From 20fb4391438e7bd1366a4456d645c654696c2eb7 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 6 Jul 2023 18:11:42 +0300
Subject: [PATCH 51/60] use yaml config for the experiment

---
 experiments/fedot_warm_start/config.yaml | 26 ++++++++++++++
 experiments/fedot_warm_start/run.py      | 44 +++++++++++++++---------
 2 files changed, 54 insertions(+), 16 deletions(-)
 create mode 100644 experiments/fedot_warm_start/config.yaml

diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml
new file mode 100644
index 00000000..bcab1083
--- /dev/null
+++ b/experiments/fedot_warm_start/config.yaml
@@ -0,0 +1,26 @@
+---
+seed: 42
+#data_settings:
+n_datasets: null # null for all available datasets
+test_size: 0.25
+train_timeout: 15
+test_timeout: 15
+#meta_learning_params:
+n_best_dataset_models_to_memorize: 10
+n_closest_datasets_to_propose: 5
+minimal_distance_between_advised_models: 1
+n_best_models_to_advise: 5
+mf_extractor_params:
+  groups: general
+#evaluation_params:
+collect_metrics:
+  - f1
+  - roc_auc
+  - accuracy
+  - neg_log_loss
+  - precision
+common_fedot_params:
+  problem: classification
+  n_jobs: -1
+  show_progress: false
+baseline_model: 'xgboost'
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index c0461f30..9920d4d5 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -2,6 +2,10 @@
 import json
 import logging
 import timeit
+from pathlib import Path
+
+import yaml
+
 from datetime import datetime
 from itertools import chain
 from typing import Dict, List, Tuple, Sequence
@@ -31,24 +35,32 @@
 from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
 from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
-# Meta-alg hyperparameters
-SEED = 42
-# Datasets sampling
-N_DATASETS = 3
-TEST_SIZE = 0.33
-# Evaluation timeouts
-TRAIN_TIMEOUT = 0.01
-TEST_TIMEOUT = 0.01
-# Models & datasets
-N_BEST_DATASET_MODELS_TO_MEMORIZE = 10
-N_CLOSEST_DATASETS_TO_PROPOSE = 5
-MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = 1
-N_BEST_MODELS_TO_ADVISE = 5
-# Meta-features
-MF_EXTRACTOR_PARAMS = {'groups': 'general'}
-COLLECT_METRICS = ['f1', 'roc_auc', 'accuracy', 'neg_log_loss', 'precision']
+
+CONFIG_PATH = 'config.yaml'
+
+
+with open(CONFIG_PATH, 'r') as config_file:
+    config = yaml.load(config_file, yaml.Loader)
+
+# Load constants
+SEED = config['seed']
+N_DATASETS = config['n_datasets']
+TEST_SIZE = config['test_size']
+TRAIN_TIMEOUT = config['train_timeout']
+TEST_TIMEOUT = config['test_timeout']
+N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize']
+N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose']
+MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models']
+N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise']
+MF_EXTRACTOR_PARAMS = config['mf_extractor_params']
+COLLECT_METRICS = config['collect_metrics']
+COMMON_FEDOT_PARAMS = config['common_fedot_params']
+BASELINE_MODEL = config['baseline_model']
+
+# Postprocess constants
 COLLECT_METRICS_ENUM = tuple(map(MetricsRepository.metric_by_id, COLLECT_METRICS))
 COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss'
+COMMON_FEDOT_PARAMS['seed'] = SEED
 
 COMMON_FEDOT_PARAMS = dict(
     problem='classification',

From d4d50ce8e4a12eaff673120cb30f31674c3a0b13 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Thu, 6 Jul 2023 18:11:57 +0300
Subject: [PATCH 52/60] refactor run.py

---
 experiments/fedot_warm_start/run.py | 187 +++++++++++++++-------------
 1 file changed, 97 insertions(+), 90 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 9920d4d5..20b3eee4 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -8,7 +8,7 @@
 
 from datetime import datetime
 from itertools import chain
-from typing import Dict, List, Tuple, Sequence
+from typing import Dict, List, Tuple, Sequence, Any
 
 import numpy as np
 import openml
@@ -23,6 +23,7 @@
 from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository
 from fedot.core.validation.split import tabular_cv_generator
 from golem.core.log import Log
+from golem.core.optimisers.fitness import SingleObjFitness
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
 
@@ -30,6 +31,7 @@
 from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
+from meta_automl.data_preparation.file_system import get_cache_dir
 from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
 from meta_automl.data_preparation.model import Model
 from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
@@ -62,32 +64,34 @@
 COLLECT_METRICS[COLLECT_METRICS.index('neg_log_loss')] = 'logloss'
 COMMON_FEDOT_PARAMS['seed'] = SEED
 
-COMMON_FEDOT_PARAMS = dict(
-    problem='classification',
-    n_jobs=-1,
-    seed=SEED,
-    show_progress=False,
-)
-
-# Setup logging
-time_now = datetime.now()
-time_now_iso = time_now.isoformat(timespec="minutes")
-time_now_for_path = time_now_iso.replace(":", ".")
-save_dir = get_data_dir(). \
-    joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
-save_dir.mkdir(parents=True)
-log_file = save_dir.joinpath('log.txt')
-Log(log_file=log_file)
-logging.basicConfig(
-    filename=log_file,
-    filemode='a',
-    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
-    datefmt='%H:%M:%S',
-    force=True,
-)
-
-
-def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
+
+def setup_logging(save_dir):
+    log_file = save_dir.joinpath('log.txt')
+    Log(log_file=log_file)
+    logging.basicConfig(
+        filename=log_file,
+        filemode='a',
+        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+        datefmt='%H:%M:%S',
+        force=True,
+    )
+
+
+def get_formatted_time() -> (datetime, str, str):
+    time_now = datetime.now()
+    time_now_iso = time_now.isoformat(timespec="minutes")
+    time_now_for_path = time_now_iso.replace(":", ".")
+    return time_now, time_now_iso, time_now_for_path
+
+
+def get_save_dir(time_now_for_path) -> Path:
+    save_dir = get_cache_dir(). \
+        joinpath('experiments').joinpath('fedot_warm_start').joinpath(f'run_{time_now_for_path}')
+    save_dir.mkdir(parents=True)
+    return save_dir
+
+
+def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
     """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
 
     dataset_ids = openml.study.get_suite(99).data
@@ -103,18 +107,10 @@ def prepare_data() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]
     return df_datasets_train, df_datasets_test, datasets
 
 
-def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
-    x = data.x
-    y = data.y
-    if len(y.shape) == 1:
-        y = y.reshape(-1, 1)
-    return x, y
-
-
-def get_pipeline_metrics(pipeline: Pipeline,
-                         input_data: InputData,
-                         metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM,
-                         metric_names: Sequence[str] = COLLECT_METRICS) -> dict:
+def evaluate_pipeline(pipeline: Pipeline,
+                      input_data: InputData,
+                      metrics: Sequence[QualityMetricsEnum] = COLLECT_METRICS_ENUM,
+                      metric_names: Sequence[str] = COLLECT_METRICS) -> Dict[str, float]:
     """Gets quality metrics for the fitted pipeline.
     The function is based on `Fedot.get_metrics()`
 
@@ -134,14 +130,30 @@ def get_pipeline_metrics(pipeline: Pipeline,
     return metric_values
 
 
-def prepare_extractor_and_assessor(datasets_train: List[str]):
+def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \
+        -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor):
+    dataset_ids = list(best_models_per_dataset_id.keys())
+    # Meta Features
     extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS)
-    meta_features_train = extractor.extract(datasets_train, fill_input_nans=True)
+    meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True)
     meta_features_train = meta_features_train.fillna(0)
+    # Datasets similarity
     data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
-        n_neighbors=min(len(datasets_train), N_CLOSEST_DATASETS_TO_PROPOSE))
-    data_similarity_assessor.fit(meta_features_train, datasets_train)
-    return data_similarity_assessor, extractor
+        n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE))
+    data_similarity_assessor.fit(meta_features_train, dataset_ids)
+    # Model advisor
+    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
+                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
+    model_advisor.fit(best_models_per_dataset_id)
+    return extractor, model_advisor
+
+
+def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
+    x = data.x
+    y = data.y
+    if len(y.shape) == 1:
+        y = y.reshape(-1, 1)
+    return x, y
 
 
 def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None):
@@ -152,7 +164,7 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as
     fedot.fit(x, y)
     automl_time = timeit.default_timer() - time_start
 
-    metrics = get_pipeline_metrics(fedot.current_pipeline, fedot.train_data)
+    metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data)
     pipeline = fedot.current_pipeline
     run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
                                       automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
@@ -160,7 +172,7 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as
 
 
 def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, history_obj=None, automl_time_sec=0.,
-                        automl_timeout_min=0., **metrics):
+                        automl_timeout_min=0., **metrics) -> Dict[str, Any]:
     run_results = dict(dataset_id=dataset.id_,
                        dataset_name=dataset.name,
                        run_label=run_label,
@@ -174,23 +186,32 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor
     return run_results
 
 
-def extract_best_history_models(dataset, history):
-    best_individuals = sorted(chain(*history.individuals),
-                              key=lambda ind: ind.fitness,
-                              reverse=True)
-    best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-    best_models = []
-    for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
-        pipeline = PipelineAdapter().restore(individual.graph)
-        model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset)
-        best_models.append(model)
+def extract_best_models_from_history(dataset, history) -> List[Model]:
+    if history.individuals:
+        best_individuals = sorted(chain(*history.individuals),
+                                  key=lambda ind: ind.fitness,
+                                  reverse=True)
+        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
+        best_models = []
+        for individual in best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]:
+            pipeline = PipelineAdapter().restore(individual.graph)
+            model = Model(pipeline, individual.fitness, history.objective.metric_names[0], dataset)
+            best_models.append(model)
+    else:
+        pipeline = PipelineAdapter().restore(history.tuning_result)
+        best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)]
+
     return best_models
 
 
 def main():
-    baseline_pipeline = PipelineBuilder().add_node('rf').build()
+    time_now, time_now_iso, time_now_for_path = get_formatted_time()
+    save_dir = get_save_dir(time_now_for_path)
+    setup_logging(save_dir)
+
+    baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build()
 
-    df_datasets_train, df_datasets_test, datasets = prepare_data()
+    df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets()
 
     dataset_ids_train = df_datasets_train.index.to_list()
     dataset_ids_test = df_datasets_test.index.to_list()
@@ -198,9 +219,8 @@ def main():
     evaluation_results = []
     best_models_per_dataset = {}
     progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for dataset_id in tqdm(datasets.keys(), 'FEDOT, all datasets', file=progress_file):
+    for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file):
         try:
-            dataset = datasets[dataset_id]
             timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT
             fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
             evaluation_results.append(run_results)
@@ -211,24 +231,20 @@ def main():
 
             # Filter out unique individuals with the best fitness
             history = fedot.history
-            best_models = extract_best_history_models(dataset, history)
+            best_models = extract_best_models_from_history(dataset, history)
             best_models_per_dataset[dataset_id] = best_models
         except Exception:
             logging.exception(f'Train dataset "{dataset_id}"')
 
-    data_similarity_assessor, extractor = prepare_extractor_and_assessor(dataset_ids_train)
-    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
-                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
-    model_advisor.fit(best_models_per_dataset)
+    mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset)
 
-    for dataset_id in tqdm(dataset_ids_test, 'MetaFEDOT, Test datasets', file=progress_file):
+    datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items()))
+    for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
         try:
-            dataset = datasets[dataset_id]
-
             # Run meta AutoML
             # 1
             time_start = timeit.default_timer()
-            meta_features = extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True)
+            meta_features = mf_extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True)
             meta_features = meta_features.fillna(0)
             meta_learning_time_sec = timeit.default_timer() - time_start
             initial_assumptions = model_advisor.predict(meta_features)[0]
@@ -240,7 +256,7 @@ def main():
             evaluation_results.append(fedot_meta_results)
 
             # Fit & evaluate simple baseline
-            baseline_metrics = get_pipeline_metrics(baseline_pipeline, fedot_meta.train_data)
+            baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data)
             baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline,
                                                **baseline_metrics)
             evaluation_results.append(baseline_res)
@@ -248,7 +264,7 @@ def main():
             # Fit & evaluate initial assumptions
             for i, assumption in enumerate(initial_assumptions):
                 pipeline = assumption.predictor
-                assumption_metrics = get_pipeline_metrics(pipeline, fedot_meta.train_data)
+                assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data)
                 assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}',
                                                      pipeline=pipeline, **assumption_metrics)
                 evaluation_results.append(assumption_res)
@@ -281,25 +297,16 @@ def main():
     pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
 
     # save experiment hyperparameters
-    params = {
-        'run_date': time_now_iso,
-        'seed': SEED,
-        'n_datasets': N_DATASETS or len(datasets),
-        'test_size': TEST_SIZE,
-        'dataset_ids': list(datasets.keys()),
-        'dataset_ids_train': dataset_ids_train,
-        'dataset_ids_test': dataset_ids_test,
-        'dataset_names_train': df_datasets_train['dataset_name'].to_list(),
-        'dataset_names_test': df_datasets_test['dataset_name'].to_list(),
-        'train_timeout': TRAIN_TIMEOUT,
-        'test_timeout': TEST_TIMEOUT,
-        'n_best_dataset_models_to_memorize': N_BEST_DATASET_MODELS_TO_MEMORIZE,
-        'n_closest_datasets_to_propose': N_CLOSEST_DATASETS_TO_PROPOSE,
-        'minimal_distance_between_advised_models': MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS,
-        'n_best_models_to_advise': N_BEST_MODELS_TO_ADVISE,
-        'common_fedot_params': COMMON_FEDOT_PARAMS,
-        'baseline_pipeline': baseline_pipeline.descriptive_id,
-    }
+    params = dict(
+        run_date=time_now_iso,
+        input_config=config,
+        dataset_ids=list(datasets_dict.keys()),
+        dataset_ids_train=dataset_ids_train,
+        dataset_names_train=df_datasets_train['dataset_name'].to_list(),
+        dataset_ids_test=dataset_ids_test,
+        dataset_names_test=df_datasets_test['dataset_name'].to_list(),
+        baseline_pipeline=baseline_pipeline.descriptive_id,
+    )
     with open(save_dir.joinpath('parameters.json'), 'w') as params_file:
         json.dump(params, params_file, indent=2)
 

From e581c9e564a729972efd6b22ffa0d06155db15e0 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Fri, 7 Jul 2023 18:16:36 +0300
Subject: [PATCH 53/60] update requirements

---
 .../data_preparation/dataset/openml_dataset.py  |   2 ++
 .../data_preparation/file_system/__init__.py    |   2 +-
 .../data_preparation/file_system/cache.py       |  12 ++++--------
 requirements.txt                                | Bin 460 -> 310 bytes
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/meta_automl/data_preparation/dataset/openml_dataset.py b/meta_automl/data_preparation/dataset/openml_dataset.py
index 08fc5c1d..72fbb1f8 100644
--- a/meta_automl/data_preparation/dataset/openml_dataset.py
+++ b/meta_automl/data_preparation/dataset/openml_dataset.py
@@ -20,6 +20,7 @@ def __init__(self, id_: OpenMLDatasetIDType):
             raise ValueError('Creating OpenMLDataset by dataset name is ambiguous. Please, use dataset id.'
                              f'Otherwise, you can perform search by f{self.__class__.__name__}.from_search().')
         self._openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                           download_features_meta_data=False,
                                                            error_if_multiple=True)
         id_ = self._openml_dataset.id
         name = self._openml_dataset.name
@@ -28,6 +29,7 @@ def __init__(self, id_: OpenMLDatasetIDType):
     @classmethod
     def from_search(cls, id_: Union[OpenMLDatasetIDType, str], **get_dataset_kwargs) -> OpenMLDataset:
         openml_dataset = openml.datasets.get_dataset(id_, download_data=False, download_qualities=False,
+                                                     download_features_meta_data=False,
                                                      **get_dataset_kwargs)
         return cls(openml_dataset.id)
 
diff --git a/meta_automl/data_preparation/file_system/__init__.py b/meta_automl/data_preparation/file_system/__init__.py
index c9f8393a..1d52c516 100644
--- a/meta_automl/data_preparation/file_system/__init__.py
+++ b/meta_automl/data_preparation/file_system/__init__.py
@@ -2,4 +2,4 @@
 from meta_automl.data_preparation.file_system.cache import (CacheOperator, get_cache_dir, get_dataset_cache_path,
                                                             get_dataset_cache_path_by_id, get_meta_features_cache_path,
                                                             get_local_meta_features, update_local_meta_features,
-                                                            get_openml_cache_dir, update_openml_cache_dir)
+                                                            update_openml_cache_dir)
diff --git a/meta_automl/data_preparation/file_system/cache.py b/meta_automl/data_preparation/file_system/cache.py
index 04a904b7..0b021abe 100644
--- a/meta_automl/data_preparation/file_system/cache.py
+++ b/meta_automl/data_preparation/file_system/cache.py
@@ -24,16 +24,12 @@ def get_cache_dir() -> Path:
 
 
 def get_openml_cache_dir() -> Path:
-    return get_cache_dir().joinpath('openml_cache')
-
-
-def get_full_openml_cache_dir() -> Path:
-    return get_cache_dir().joinpath('openml_cache/org/openml/www')
+    return Path(openml.config.get_cache_directory())
 
 
 def update_openml_cache_dir():
-    openml_cache_path = str(get_openml_cache_dir())
-    openml.config.set_cache_directory(openml_cache_path)
+    openml_cache_path = get_cache_dir().joinpath('openml_cache')
+    openml.config.set_root_cache_directory(str(openml_cache_path))
 
 
 def _get_cache_path(object_class: Type[CacheOperator], object_id: str, _create_parent_dir: bool = True) -> Path:
@@ -82,7 +78,7 @@ def get_cache_properties(class_name: str) -> CacheProperties:
     cache_properties_by_class_name = {
         'OpenMLDataset': CacheProperties(
             type_=CacheType.directory,
-            dir_=get_full_openml_cache_dir().joinpath('datasets'),
+            dir_=get_openml_cache_dir().joinpath('datasets'),
             template='{id_}'),
         'CustomDataset': CacheProperties(
             type_=CacheType.file,
diff --git a/requirements.txt b/requirements.txt
index ad0a22332f176f2c866188116575624428ac1536..2337a1746f2da881a344c83f36cd8b55cba6ffff 100644
GIT binary patch
delta 76
zcmX@Zyp2in|GzYbRE89We1;MRTOc%G&|@$MV#A5+Wf@H-o|Og?FBKV$Cv!8JvYG%T
V3?{oWs<Ik`S^13NY#^2)0|4~!5gq^l

literal 460
zcmY+B%}&BV6ot>)gr{(6EVL8~BqT1-ts7qeWcUdTMHv=-dG))uHfScDcJ6o2oHNt!
zudVh<deDwJ>8y)p>NDE<PFJmz>0PxV)g|4L)zRyHz35pBz3NR*nrH|gYN`?MoO!LW
z3YFTRC?J)Sg6afbGAa#t)~qUYfoDw56tdDi>n&rBKG0K>uctem@Qx&HAaj(Nd)REW
zh6H*d8I2F%Io=sx#?NFsvD=c98ZVsOXoJ%HI&JCux5|bD0bO!hIq7^aoN=#%-{vFU
r0f$eVZW;e~{>`8NrZktN57I@~nYZ0XmoD*_>`rRmzwNQ}wP*MNE&M|H


From 2f8b409881fe70d705727a53b86c94cd7d6d285a Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Sat, 8 Jul 2023 22:39:11 +0300
Subject: [PATCH 54/60] Removing IDE configuration files.

---
 .gitignore                                   |  2 ++
 .idea/.gitignore                             |  8 --------
 .idea/inspectionProfiles/Project_Default.xml | 13 -------------
 .idea/libraries/py4j0_10_9_7.xml             |  9 ---------
 .idea/libraries/ziptestdata.xml              |  9 ---------
 .idea/libraries/ziptestdata1.xml             |  9 ---------
 .idea/libraries/ziptestdata2.xml             |  9 ---------
 .idea/libraries/ziptestdata3.xml             |  9 ---------
 .idea/meta-automl-research.iml               |  9 ---------
 .idea/misc.xml                               |  6 ------
 .idea/modules.xml                            |  8 --------
 .idea/runConfigurations.xml                  | 10 ----------
 .idea/vcs.xml                                |  6 ------
 13 files changed, 2 insertions(+), 105 deletions(-)
 delete mode 100644 .idea/.gitignore
 delete mode 100644 .idea/inspectionProfiles/Project_Default.xml
 delete mode 100644 .idea/libraries/py4j0_10_9_7.xml
 delete mode 100644 .idea/libraries/ziptestdata.xml
 delete mode 100644 .idea/libraries/ziptestdata1.xml
 delete mode 100644 .idea/libraries/ziptestdata2.xml
 delete mode 100644 .idea/libraries/ziptestdata3.xml
 delete mode 100644 .idea/meta-automl-research.iml
 delete mode 100644 .idea/misc.xml
 delete mode 100644 .idea/modules.xml
 delete mode 100644 .idea/runConfigurations.xml
 delete mode 100644 .idea/vcs.xml

diff --git a/.gitignore b/.gitignore
index 9e584fd4..44149102 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.idea
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b81..00000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
deleted file mode 100644
index 0616d54f..00000000
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<component name="InspectionProjectProfileManager">
-  <profile version="1.0">
-    <option name="myName" value="Project Default" />
-    <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
-    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
-      <option name="ignoredIdentifiers">
-        <list>
-          <option value="experiments.fedot_warm_start.run.b4915f8620c1b6753fb1a058d6839f1fe374ab" />
-        </list>
-      </option>
-    </inspection_tool>
-  </profile>
-</component>
\ No newline at end of file
diff --git a/.idea/libraries/py4j0_10_9_7.xml b/.idea/libraries/py4j0_10_9_7.xml
deleted file mode 100644
index f6a7627a..00000000
--- a/.idea/libraries/py4j0_10_9_7.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<component name="libraryTable">
-  <library name="py4j0.10.9.7">
-    <CLASSES>
-      <root url="jar://$PROJECT_DIR$/venv/share/py4j/py4j0.10.9.7.jar!/" />
-    </CLASSES>
-    <JAVADOC />
-    <SOURCES />
-  </library>
-</component>
\ No newline at end of file
diff --git a/.idea/libraries/ziptestdata.xml b/.idea/libraries/ziptestdata.xml
deleted file mode 100644
index 7f8b1b21..00000000
--- a/.idea/libraries/ziptestdata.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<component name="libraryTable">
-  <library name="ziptestdata">
-    <CLASSES>
-      <root url="jar://$PROJECT_DIR$/venv/lib64/python3.8/site-packages/importlib_resources/tests/zipdata02/ziptestdata.zip!/" />
-    </CLASSES>
-    <JAVADOC />
-    <SOURCES />
-  </library>
-</component>
\ No newline at end of file
diff --git a/.idea/libraries/ziptestdata1.xml b/.idea/libraries/ziptestdata1.xml
deleted file mode 100644
index 054994be..00000000
--- a/.idea/libraries/ziptestdata1.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<component name="libraryTable">
-  <library name="ziptestdata1">
-    <CLASSES>
-      <root url="jar://$PROJECT_DIR$/venv/lib/python3.8/site-packages/importlib_resources/tests/zipdata02/ziptestdata.zip!/" />
-    </CLASSES>
-    <JAVADOC />
-    <SOURCES />
-  </library>
-</component>
\ No newline at end of file
diff --git a/.idea/libraries/ziptestdata2.xml b/.idea/libraries/ziptestdata2.xml
deleted file mode 100644
index 7b383000..00000000
--- a/.idea/libraries/ziptestdata2.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<component name="libraryTable">
-  <library name="ziptestdata2">
-    <CLASSES>
-      <root url="jar://$PROJECT_DIR$/venv/lib/python3.8/site-packages/importlib_resources/tests/zipdata01/ziptestdata.zip!/" />
-    </CLASSES>
-    <JAVADOC />
-    <SOURCES />
-  </library>
-</component>
\ No newline at end of file
diff --git a/.idea/libraries/ziptestdata3.xml b/.idea/libraries/ziptestdata3.xml
deleted file mode 100644
index a0322347..00000000
--- a/.idea/libraries/ziptestdata3.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<component name="libraryTable">
-  <library name="ziptestdata3">
-    <CLASSES>
-      <root url="jar://$PROJECT_DIR$/venv/lib64/python3.8/site-packages/importlib_resources/tests/zipdata01/ziptestdata.zip!/" />
-    </CLASSES>
-    <JAVADOC />
-    <SOURCES />
-  </library>
-</component>
\ No newline at end of file
diff --git a/.idea/meta-automl-research.iml b/.idea/meta-automl-research.iml
deleted file mode 100644
index d6ebd480..00000000
--- a/.idea/meta-automl-research.iml
+++ /dev/null
@@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="JAVA_MODULE" version="4">
-  <component name="NewModuleRootManager" inherit-compiler-output="true">
-    <exclude-output />
-    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 33b8d9d1..00000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_16" project-jdk-name="Python 3.8 (meta-automl-research)" project-jdk-type="Python SDK">
-    <output url="file://$PROJECT_DIR$/out" />
-  </component>
-</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index c8283092..00000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectModuleManager">
-    <modules>
-      <module fileurl="file://$PROJECT_DIR$/.idea/meta-automl-research.iml" filepath="$PROJECT_DIR$/.idea/meta-automl-research.iml" />
-    </modules>
-  </component>
-</project>
\ No newline at end of file
diff --git a/.idea/runConfigurations.xml b/.idea/runConfigurations.xml
deleted file mode 100644
index 797acea5..00000000
--- a/.idea/runConfigurations.xml
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="RunConfigurationProducerService">
-    <option name="ignoredProducers">
-      <set>
-        <option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
-      </set>
-    </option>
-  </component>
-</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 35eb1ddf..00000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="VcsDirectoryMappings">
-    <mapping directory="" vcs="Git" />
-  </component>
-</project>
\ No newline at end of file

From 67812b757f2ddd4c1a7969f55a6b6abfeaca0bfd Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 16 Jul 2023 17:02:12 +0300
Subject: [PATCH 55/60] make absolute path to config.yaml

---
 experiments/fedot_warm_start/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 20b3eee4..8f3c6d85 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -38,7 +38,7 @@
 from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
 
-CONFIG_PATH = 'config.yaml'
+CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml')
 
 
 with open(CONFIG_PATH, 'r') as config_file:

From 4a0b144dd447edeed0affc12466edd90a4ddb68e Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 16 Jul 2023 17:40:55 +0300
Subject: [PATCH 56/60] fix train test split

---
 experiments/fedot_warm_start/run.py                 |  2 +-
 .../data_preparation/datasets_train_test_split.py   | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 8f3c6d85..53a130cd 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -99,7 +99,7 @@ def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDatase
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
 
-    df_split_datasets = openml_datasets_train_test_split(dataset_ids, seed=SEED)
+    df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED)
     df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1]
     df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0]
 
diff --git a/meta_automl/data_preparation/datasets_train_test_split.py b/meta_automl/data_preparation/datasets_train_test_split.py
index 101b7ce8..b262a44c 100644
--- a/meta_automl/data_preparation/datasets_train_test_split.py
+++ b/meta_automl/data_preparation/datasets_train_test_split.py
@@ -2,12 +2,15 @@
 import pandas as pd
 
 from sklearn.model_selection import train_test_split
+from typing import List
 
+from meta_automl.data_preparation.dataset import OpenMLDatasetIDType
 
-def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed: int = 42):
+
+def openml_datasets_train_test_split(dataset_ids: List[OpenMLDatasetIDType], test_size: float, seed=None):
     df_openml_datasets = openml.datasets.list_datasets(dataset_ids, output_format='dataframe')
     df_openml_datasets_split_features = df_openml_datasets[
-        ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']]
+        ['name', 'NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses']].copy(deep=False)
     for column in df_openml_datasets_split_features.columns[1:]:
         if column != 'NumberOfClasses':
             median = df_openml_datasets_split_features[column].median()
@@ -31,7 +34,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed:
     if not df_datasets_to_split.empty:
         df_train_datasets, df_test_datasets = train_test_split(
             df_datasets_to_split,
-            train_size=train_size,
+            test_size=test_size,
             shuffle=True,
             stratify=df_datasets_to_split['category'],
             random_state=seed
@@ -40,7 +43,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed:
     else:
         df_train_datasets, df_test_datasets = train_test_split(
             df_split_categories,
-            train_size=train_size,
+            test_size=test_size,
             shuffle=True,
             random_state=seed
         )
@@ -56,7 +59,7 @@ def openml_datasets_train_test_split(dataset_ids, train_size: float = 0.7, seed:
 
 def main():
     dataset_ids = openml.study.get_suite(99).data
-    df_split_datasets = openml_datasets_train_test_split(dataset_ids)
+    df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=0.3)
     df_split_datasets.to_csv('train_test_datasets_opencc18.csv')
 
 
From 44857b0f99756b72d8a5d4f73ab7827d00c9c847 Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 16 Jul 2023 17:41:35 +0300
Subject: [PATCH 57/60] refactor for frequent results saving

---
 experiments/fedot_warm_start/run.py           | 241 ++++++++++--------
 .../dataset/custom_dataset.py                 |   1 -
 2 files changed, 141 insertions(+), 101 deletions(-)

diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index 53a130cd..956ecf30 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -24,11 +24,12 @@
 from fedot.core.validation.split import tabular_cv_generator
 from golem.core.log import Log
 from golem.core.optimisers.fitness import SingleObjFitness
+from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
 
 
-from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData
+from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase
 from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
 from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
 from meta_automl.data_preparation.file_system import get_cache_dir
@@ -65,7 +66,8 @@
 COMMON_FEDOT_PARAMS['seed'] = SEED
 
 
-def setup_logging(save_dir):
+def setup_logging(save_dir: Path):
+    """ Creates "log.txt" at the "save_dir" and redirects all logging output to it. """
     log_file = save_dir.joinpath('log.txt')
     Log(log_file=log_file)
     logging.basicConfig(
@@ -77,7 +79,12 @@ def setup_logging(save_dir):
     )
 
 
-def get_formatted_time() -> (datetime, str, str):
+def get_current_formatted_date() -> (datetime, str, str):
+    """ Returns current date in the following formats:
+
+        1. datetime
+        2. str: ISO
+        3. str: ISO compatible with Windows file system path (with "." instead of ":") """
     time_now = datetime.now()
     time_now_iso = time_now.isoformat(timespec="minutes")
     time_now_for_path = time_now_iso.replace(":", ".")
@@ -156,7 +163,10 @@ def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
     return x, y
 
 
-def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None):
+def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_assumption=None) \
+        -> (Fedot, Dict[str, Any]):
+    """ Runs Fedot evaluation on the dataset, then evaluates the final pipeline on the dataset.
+     Returns Fedot instance & properties of the run along with the evaluated metrics. """
     x, y = transform_data_for_fedot(dataset.get_data(dataset_format='array'))
 
     time_start = timeit.default_timer()
@@ -166,8 +176,9 @@ def fit_fedot(dataset: OpenMLDataset, timeout: float, run_label: str, initial_as
 
     metrics = evaluate_pipeline(fedot.current_pipeline, fedot.train_data)
     pipeline = fedot.current_pipeline
-    run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline, automl_time_sec=automl_time,
-                                      automl_timeout_min=fedot.params.timeout, history_obj=fedot.history, **metrics)
+    run_results = get_result_data_row(dataset=dataset, run_label=run_label, pipeline=pipeline,
+                                      automl_time_sec=automl_time, automl_timeout_min=fedot.params.timeout,
+                                      history_obj=fedot.history, **metrics)
     return fedot, run_results
 
 
@@ -186,7 +197,7 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor
     return run_results
 
 
-def extract_best_models_from_history(dataset, history) -> List[Model]:
+def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]:
     if history.individuals:
         best_individuals = sorted(chain(*history.individuals),
                                   key=lambda ind: ind.fitness,
@@ -204,111 +215,141 @@ def extract_best_models_from_history(dataset, history) -> List[Model]:
     return best_models
 
 
+def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path):
+    """ Save the hyperparameters of the experiment """
+    params_file_path = save_dir.joinpath('parameters.json')
+    with open(params_file_path, 'w') as params_file:
+        json.dump(params_dict, params_file, indent=2)
+
+
+def save_evaluation(evaluation_properties: Dict[str, Any], run_date: datetime, experiment_date: datetime,
+                    save_dir: Path):
+    histories_dir = save_dir.joinpath('histories')
+    models_dir = save_dir.joinpath('models')
+    eval_results_path = save_dir.joinpath('evaluation_results.csv')
+
+    histories_dir.mkdir(exist_ok=True)
+    models_dir.mkdir(exist_ok=True)
+
+    try:
+        evaluation_properties['experiment_date'] = experiment_date
+        evaluation_properties['run_date'] = run_date
+        dataset_id = evaluation_properties['dataset_id']
+        run_label = evaluation_properties['run_label']
+        # define saving paths
+        model_path = models_dir.joinpath(f'{dataset_id}_{run_label}')
+        history_path = histories_dir.joinpath(f'{dataset_id}_{run_label}_history.json')
+        # replace objects with export paths for csv
+        evaluation_properties['model_path'] = str(model_path)
+        evaluation_properties.pop('model_obj').save(model_path)
+        evaluation_properties['history_path'] = str(history_path)
+        history_obj = evaluation_properties.pop('history_obj')
+        if history_obj is not None:
+            history_obj.save(evaluation_properties['history_path'])
+
+        df_evaluation_properties = pd.DataFrame([evaluation_properties])
+
+        if eval_results_path.exists():
+            df_results = pd.read_csv(eval_results_path)
+            df_results = pd.concat([df_results, df_evaluation_properties])
+        else:
+            df_results = df_evaluation_properties
+        df_results.to_csv(eval_results_path, index=False)
+
+    except Exception:
+        logging.exception(f'Saving results "{evaluation_properties}"')
+
+
 def main():
-    time_now, time_now_iso, time_now_for_path = get_formatted_time()
-    save_dir = get_save_dir(time_now_for_path)
+    experiment_date, experiment_date_iso, experiment_date_for_path = get_current_formatted_date()
+    save_dir = get_save_dir(experiment_date_for_path)
     setup_logging(save_dir)
-
-    baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build()
+    progress_file_path = save_dir.joinpath('progress.txt')
 
     df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets()
 
+    dataset_ids = list(datasets_dict.keys())
     dataset_ids_train = df_datasets_train.index.to_list()
     dataset_ids_test = df_datasets_test.index.to_list()
 
-    evaluation_results = []
+    dataset_names_train = df_datasets_train['dataset_name'].to_list()
+    dataset_names_test = df_datasets_test['dataset_name'].to_list()
+
+    datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items()))
+
+    experiment_params_dict = dict(
+            experiment_start_date_iso=experiment_date_iso,
+            input_config=config,
+            dataset_ids=dataset_ids,
+            dataset_ids_train=dataset_ids_train,
+            dataset_names_train=dataset_names_train,
+            dataset_ids_test=dataset_ids_test,
+            dataset_names_test=dataset_names_test,
+            baseline_pipeline=BASELINE_MODEL,
+        )
+    save_experiment_params(experiment_params_dict, save_dir)
+
     best_models_per_dataset = {}
-    progress_file = open(save_dir.joinpath('progress.txt'), 'a')
-    for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file):
-        try:
-            timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT
-            fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
-            evaluation_results.append(run_results)
-            # TODO:
-            #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
-            #   x Evaluate historical pipelines on the data instead of using fitness
-            #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
-
-            # Filter out unique individuals with the best fitness
-            history = fedot.history
-            best_models = extract_best_models_from_history(dataset, history)
-            best_models_per_dataset[dataset_id] = best_models
-        except Exception:
-            logging.exception(f'Train dataset "{dataset_id}"')
+    with open(progress_file_path, 'a') as progress_file:
+        for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file):
+            try:
+                timeout = TRAIN_TIMEOUT if dataset_id in dataset_ids_train else TEST_TIMEOUT
+                run_date = datetime.now()
+                fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
+                save_evaluation(run_results, run_date, experiment_date, save_dir)
+                # TODO:
+                #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
+                #   x Evaluate historical pipelines on the data instead of using fitness
+                #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
+
+                # Filter out unique individuals with the best fitness
+                history = fedot.history
+                best_models = extract_best_models_from_history(dataset, history)
+                best_models_per_dataset[dataset_id] = best_models
+            except Exception:
+                logging.exception(f'Train dataset "{dataset_id}"')
 
     mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset)
 
-    datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items()))
-    for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
-        try:
-            # Run meta AutoML
-            # 1
-            time_start = timeit.default_timer()
-            meta_features = mf_extractor.extract([dataset], fill_input_nans=True, use_cached=False, update_cached=True)
-            meta_features = meta_features.fillna(0)
-            meta_learning_time_sec = timeit.default_timer() - time_start
-            initial_assumptions = model_advisor.predict(meta_features)[0]
-            assumption_pipelines = [model.predictor for model in initial_assumptions]
-            # 2
-            fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
-                                                       initial_assumption=assumption_pipelines)
-            fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec
-            evaluation_results.append(fedot_meta_results)
-
-            # Fit & evaluate simple baseline
-            baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data)
-            baseline_res = get_result_data_row(dataset=dataset, run_label='simple baseline', pipeline=baseline_pipeline,
-                                               **baseline_metrics)
-            evaluation_results.append(baseline_res)
-
-            # Fit & evaluate initial assumptions
-            for i, assumption in enumerate(initial_assumptions):
-                pipeline = assumption.predictor
-                assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data)
-                assumption_res = get_result_data_row(dataset=dataset, run_label=f'MetaFEDOT - initial assumption {i}',
-                                                     pipeline=pipeline, **assumption_metrics)
-                evaluation_results.append(assumption_res)
-        except Exception:
-            logging.exception(f'Test dataset "{dataset_id}"')
-    progress_file.close()
-
-    # Save the accumulated results
-    history_dir = save_dir.joinpath('histories')
-    history_dir.mkdir()
-    models_dir = save_dir.joinpath('models')
-    for res in evaluation_results:
-        try:
-            res['run_date'] = time_now
-            dataset_id = res['dataset_id']
-            run_label = res['run_label']
-            # define saving paths
-            model_path = models_dir.joinpath(f'{dataset_id}_{run_label}')
-            history_path = history_dir.joinpath(f'{dataset_id}_{run_label}_history.json')
-            # replace objects with export paths for csv
-            res['model_path'] = str(model_path)
-            res.pop('model_obj').save(res['model_path'])
-            res['history_path'] = str(history_path)
-            history_obj = res.pop('history_obj')
-            if history_obj is not None:
-                history_obj.save(res['history_path'])
-        except Exception:
-            logging.exception(f'Saving results "{res}"')
-
-    pd.DataFrame(evaluation_results).to_csv(save_dir.joinpath(f'results_{time_now_for_path}.csv'))
-
-    # save experiment hyperparameters
-    params = dict(
-        run_date=time_now_iso,
-        input_config=config,
-        dataset_ids=list(datasets_dict.keys()),
-        dataset_ids_train=dataset_ids_train,
-        dataset_names_train=df_datasets_train['dataset_name'].to_list(),
-        dataset_ids_test=dataset_ids_test,
-        dataset_names_test=df_datasets_test['dataset_name'].to_list(),
-        baseline_pipeline=baseline_pipeline.descriptive_id,
-    )
-    with open(save_dir.joinpath('parameters.json'), 'w') as params_file:
-        json.dump(params, params_file, indent=2)
+    with open(progress_file_path, 'a') as progress_file:
+        for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
+            try:
+                # Run meta AutoML
+                # 1
+                time_start = timeit.default_timer()
+                meta_features = mf_extractor.extract([dataset],
+                                                     fill_input_nans=True, use_cached=False, update_cached=True)
+                meta_features = meta_features.fillna(0)
+                meta_learning_time_sec = timeit.default_timer() - time_start
+                initial_assumptions = model_advisor.predict(meta_features)[0]
+                assumption_pipelines = [model.predictor for model in initial_assumptions]
+                # 2
+                run_date = datetime.now()
+                fedot_meta, fedot_meta_results = fit_fedot(dataset=dataset, timeout=TEST_TIMEOUT, run_label='MetaFEDOT',
+                                                           initial_assumption=assumption_pipelines)
+                fedot_meta_results['meta_learning_time_sec'] = meta_learning_time_sec
+                save_evaluation(fedot_meta_results, run_date, experiment_date, save_dir)
+
+                # Fit & evaluate simple baseline
+                baseline_pipeline = PipelineBuilder().add_node(BASELINE_MODEL).build()
+                run_date = datetime.now()
+                baseline_metrics = evaluate_pipeline(baseline_pipeline, fedot_meta.train_data)
+                baseline_res = get_result_data_row(dataset=dataset, run_label=f'simple baseline {BASELINE_MODEL}',
+                                                   pipeline=baseline_pipeline,
+                                                   **baseline_metrics)
+                save_evaluation(baseline_res, run_date, experiment_date, save_dir)
+
+                # Fit & evaluate initial assumptions
+                for i, assumption in enumerate(initial_assumptions):
+                    pipeline = assumption.predictor
+                    run_date = datetime.now()
+                    assumption_metrics = evaluate_pipeline(pipeline, fedot_meta.train_data)
+                    assumption_res = get_result_data_row(dataset=dataset,
+                                                         run_label=f'MetaFEDOT - initial assumption {i}',
+                                                         pipeline=pipeline, **assumption_metrics)
+                    save_evaluation(assumption_res, run_date, experiment_date, save_dir)
+            except Exception:
+                logging.exception(f'Test dataset "{dataset_id}"')
 
 
 if __name__ == "__main__":
diff --git a/meta_automl/data_preparation/dataset/custom_dataset.py b/meta_automl/data_preparation/dataset/custom_dataset.py
index 505868f6..1001b5be 100644
--- a/meta_automl/data_preparation/dataset/custom_dataset.py
+++ b/meta_automl/data_preparation/dataset/custom_dataset.py
@@ -8,7 +8,6 @@
 from meta_automl.data_preparation.dataset.dataset_base import DatasetData
 
 
-
 class DataNotFoundError(FileNotFoundError):
     pass
 

From 68a24433ca953ebadbea3a7e73f0f5d492eaa28d Mon Sep 17 00:00:00 2001
From: morrisnein <petroochcho@gmail.com>
Date: Sun, 16 Jul 2023 18:13:21 +0300
Subject: [PATCH 58/60] fix logging

---
 .../datasets_loaders/openml_datasets_loader.py              | 6 ------
 .../meta_features_extractors/pymfe_extractor.py             | 5 ++---
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
index f7fbfb80..89cd2445 100644
--- a/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
+++ b/meta_automl/data_preparation/datasets_loaders/openml_datasets_loader.py
@@ -2,8 +2,6 @@
 
 from typing import List, Union, Optional, Sequence
 
-from golem.core.log import default_log
-
 from meta_automl.data_preparation.dataset import OpenMLDataset, OpenMLDatasetIDType
 from meta_automl.data_preparation.datasets_loaders import DatasetsLoader
 
@@ -37,7 +35,3 @@ def load_single(self, dataset_id: Union[OpenMLDatasetIDType, str],
         self.dataset_ids.add(dataset.id_)
 
         return dataset
-
-    @property
-    def _log(self):
-        return default_log(self)
diff --git a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
index edfa6925..1542e823 100644
--- a/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
+++ b/meta_automl/data_preparation/meta_features_extractors/pymfe_extractor.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
+import logging
 from typing import List, Union, Dict, Any
 
 import pandas as pd
-from golem.core.log import default_log
 from pymfe.mfe import MFE
 
 from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType
@@ -18,7 +18,6 @@ def __init__(self, extractor_params: Dict[str, Any] = None, datasets_loader: Dat
         self.extractor_params = extractor_params if extractor_params is not None else self.default_params
         self._datasets_loader = datasets_loader or OpenMLDatasetsLoader()
         self._extractor = MFE(**self.extractor_params)
-        self._logger = default_log(self)
 
     @property
     def datasets_loader(self) -> DatasetsLoader:
@@ -35,7 +34,7 @@ def extract(self, datasets_or_ids: List[Union[DatasetBase, DatasetIDType]],
             if not isinstance(dataset, DatasetBase):
                 dataset = self._datasets_loader.load_single(dataset)
 
-            self._logger.info(f'Extracting meta features of the dataset {dataset}...')
+            logging.critical(f'Extracting meta features of the dataset {dataset}...')
             if (use_cached and
                     (mfs := self._get_meta_features_cache(dataset.id_, meta_feature_names))):
                 meta_features[dataset.id_] = mfs

From b4c714f3a245ed33537a63d804d3dad286c0e81c Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Wed, 19 Jul 2023 18:10:47 +0300
Subject: [PATCH 59/60] Adding an AutoML baseline class

---
 baselines/__init__.py        |  0
 baselines/automl_baseline.py | 11 +++++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 baselines/__init__.py
 create mode 100644 baselines/automl_baseline.py

diff --git a/baselines/__init__.py b/baselines/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/baselines/automl_baseline.py b/baselines/automl_baseline.py
new file mode 100644
index 00000000..36a82b28
--- /dev/null
+++ b/baselines/automl_baseline.py
@@ -0,0 +1,11 @@
+from abc import ABC
+
+
+class AutoMLBaseline(ABC):
+    def run(self):
+        raise NotImplementedError
+
+    @staticmethod
+    def save_on_disk(data):
+        raise NotImplementedError
+

From 645a98f8f3806bdb1c89f538691cb6decaac2754 Mon Sep 17 00:00:00 2001
From: max <imaxaliev@gmail.com>
Date: Wed, 19 Jul 2023 18:15:04 +0300
Subject: [PATCH 60/60] Reflecting API changes in an asklearn baseline

---
 baselines/auto-sklearn/__init__.py            |   0
 .../auto-sklearn/auto-sklearn_baseline.py     | 166 ++++++++++++++++++
 .../auto-sklearn/data/experimental_data.csv   |  57 ++++++
 3 files changed, 223 insertions(+)
 create mode 100644 baselines/auto-sklearn/__init__.py
 create mode 100644 baselines/auto-sklearn/auto-sklearn_baseline.py
 create mode 100644 baselines/auto-sklearn/data/experimental_data.csv

diff --git a/baselines/auto-sklearn/__init__.py b/baselines/auto-sklearn/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/baselines/auto-sklearn/auto-sklearn_baseline.py b/baselines/auto-sklearn/auto-sklearn_baseline.py
new file mode 100644
index 00000000..e467e8f1
--- /dev/null
+++ b/baselines/auto-sklearn/auto-sklearn_baseline.py
@@ -0,0 +1,166 @@
+import csv
+import time
+
+from typing import Any, Tuple, Dict
+
+import numpy as np
+import logging
+
+import autosklearn.classification
+import autosklearn.ensembles
+
+from sklearn import model_selection, metrics
+
+from baselines.automl_baseline import AutoMLBaseline
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.models_loaders import KnowledgeBaseModelsLoader
+from autosklearn.classification import AutoSklearnClassifier
+
+
+class AutoSklearnBaseline(AutoMLBaseline):
+    def __init__(self, ensemble_type, time_limit):
+        self.estimator = AutoSklearnClassifier(
+            ensemble_class=ensemble_type,
+            time_left_for_this_task=time_limit,
+        )
+        self.knowledge_base_loader = KnowledgeBaseModelsLoader()
+
+    @staticmethod
+    def make_quality_metric_estimates(y, predictions, prediction_proba, is_multi_label):
+        """ Compute roc_auc, f1, accuracy, log_loss and precision scores. """
+        results = {
+            'roc_auc': -1 * float(
+                "{:.3f}".format(
+                    metrics.roc_auc_score(
+                        y,
+                        prediction_proba if is_multi_label else predictions,
+                        multi_class='ovr'
+                    )
+                )
+            ),
+            'f1': -1 * float(
+                "{:.3f}".format(
+                    metrics.f1_score(
+                        y,
+                        predictions,
+                        average='macro' if is_multi_label else 'binary'
+                    )
+                )
+            ),
+            'accuracy': -1 * float(
+                "{:.3f}".format(
+                    metrics.accuracy_score(
+                        y,
+                        predictions
+                    )
+                )
+            ),
+            'logloss': float(
+                "{:.3f}".format(
+                    metrics.log_loss(
+                        y,
+                        prediction_proba if is_multi_label else predictions
+                    )
+                )
+            ),
+            'precision': -1 * float(
+                "{:.3f}".format(
+                    metrics.precision_score(
+                        y,
+                        predictions,
+                        average='macro' if is_multi_label else 'binary',
+                        labels=np.unique(predictions)
+                    )
+                )
+            )
+        }
+        return results
+
+    def run(self):
+        """ Fit auto-sklearn on knowledge base 'test' datasets, append the metrics row to CSV and return it. """
+        dataset_ids_to_load = [
+            dataset_id for dataset_id in self.knowledge_base_loader
+                                             .parse_datasets('test')
+                                             .loc[:, 'dataset_id']
+        ]
+        # dataset_ids_to_load = [dataset_ids_to_load[dataset_ids_to_load.index(41166)]]
+
+        loaded_datasets = OpenMLDatasetsLoader().load(dataset_ids_to_load)
+
+        for dataset in loaded_datasets:
+            logging.log(logging.INFO, f"Loaded dataset name: {dataset.name}")
+            dataset_data = dataset.get_data()
+
+            X_train, X_test, y_train, y_test = model_selection.train_test_split(
+                dataset_data.x,
+                dataset_data.y,
+                test_size=0.2,
+                random_state=42,
+                stratify=dataset_data.y
+            )
+
+            fitting_start_time = time.time()
+            ensemble = self.estimator.fit(X_train, y_train)
+            fitting_time = time.time() - fitting_start_time
+            logging.log(logging.INFO, f"Fitting time is {fitting_time}sec")
+
+            inference_start_time = time.time()
+            predicted_results = self.estimator.predict(X_test)
+            inference_time = time.time() - inference_start_time
+            logging.log(logging.INFO, f"Inference time is {inference_time}sec")
+
+            predicted_probabilities = self.estimator.predict_proba(X_test)
+
+            best_single_model = list(ensemble.show_models().values())[0].get('sklearn_classifier')
+
+            # autosklearn_ensemble = pipeline.show_models()
+            # formatted_ensemble = {
+            #     model_id: {
+            #         'rank': autosklearn_ensemble[model_id].get('rank'),
+            #         'cost': float(f"{autosklearn_ensemble[model_id].get('cost'):.3f}"),
+            #         'ensemble_weight': autosklearn_ensemble[model_id].get('ensemble_weight'),
+            #         'model': autosklearn_ensemble[model_id].get('sklearn_classifier')
+            #     } for model_id in autosklearn_ensemble.keys()
+            # }
+
+            general_run_info = {
+                'dataset_id': dataset.id_,
+                'dataset_name': dataset.name,
+                'run_label': 'Auto-sklearn',
+            }
+
+            is_multilabel_classification = len(set(predicted_results)) > 2
+            quality_metric_estimates = AutoSklearnBaseline.make_quality_metric_estimates(
+                y_test,
+                predicted_results,
+                predicted_probabilities,
+                is_multilabel_classification
+            )
+
+            model_dependent_run_info = {
+                'fit_time': float(f'{fitting_time:.1f}'),
+                'inference_time': float(f'{inference_time:.1f}'),
+                'model_str': repr(best_single_model)
+            }
+
+            results = {**general_run_info, **quality_metric_estimates, **model_dependent_run_info}
+
+            # for key in autosklearn_ensemble.keys():
+            #     ensemble_model = autosklearn_ensemble[key]
+            #     formatted_ensemble = results['ensemble']
+            #     for model_id in formatted_ensemble.keys():
+            #         formatted_ensemble[model_id] = ensemble_model.get("rank", None)
+
+            AutoSklearnBaseline.save_on_disk(results.values())
+            # NOTE(review): returning inside the loop stops after the first dataset -- confirm this is intended.
+            return results
+
+    @staticmethod
+    def save_on_disk(data):
+        with open('data/experimental_data.csv', 'a', newline='') as file:
+            writer = csv.writer(file, delimiter=',')
+            writer.writerow(data)
+
+
+if __name__ == '__main__':
+    AutoSklearnBaseline(autosklearn.ensembles.SingleBest, 600).run()
diff --git a/baselines/auto-sklearn/data/experimental_data.csv b/baselines/auto-sklearn/data/experimental_data.csv
new file mode 100644
index 00000000..7a3f3cfa
--- /dev/null
+++ b/baselines/auto-sklearn/data/experimental_data.csv
@@ -0,0 +1,57 @@
+1461,bank-marketing,Auto-sklearn,-0.711,-0.535,-0.907,3.34,-0.648,598.0,0.1,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.7108930238344161e-10,
+                               learning_rate=0.010827728124541558, loss='auto',
+                               max_iter=512, max_leaf_nodes=25,
+                               min_samples_leaf=4, n_iter_no_change=19,
+                               random_state=1,
+                               validation_fraction=0.1759114608225653,
+                               warm_start=True)"
+179,adult,Auto-sklearn,-0.774,-0.91,-0.859,5.077,-0.885,595.3,0.1,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.7108930238344161e-10,
+                               learning_rate=0.010827728124541558, loss='auto',
+                               max_iter=512, max_leaf_nodes=25,
+                               min_samples_leaf=4, n_iter_no_change=19,
+                               random_state=1,
+                               validation_fraction=0.1759114608225653,
+                               warm_start=True)"
+1464,blood-transfusion-service-center,Auto-sklearn,-0.669,-0.5,-0.8,7.209,-0.625,597.6,0.0,"PassiveAggressiveClassifier(C=0.253246830865058, average=True, max_iter=16,
+                            random_state=1, tol=0.01676578241454229,
+                            warm_start=True)"
+991,car,Auto-sklearn,-1.0,-1.0,-1.0,0.0,-1.0,596.8,0.0,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=1.9280388598217333e-10,
+                               learning_rate=0.24233932723531437, loss='auto',
+                               max_iter=128, max_leaf_nodes=35,
+                               min_samples_leaf=17, n_iter_no_change=1,
+                               random_state=1, validation_fraction=None,
+                               warm_start=True)"
+1489,phoneme,Auto-sklearn,-0.848,-0.797,-0.887,4.068,-0.845,600.4,0.1,"AdaBoostClassifier(algorithm='SAMME',
+                   base_estimator=DecisionTreeClassifier(max_depth=10),
+                   learning_rate=1.1377640450285444, n_estimators=352,
+                   random_state=1)"
+41027,jungle_chess_2pcs_raw_endgame_complete,Auto-sklearn,-0.975,-0.816,-0.865,0.271,-0.824,595.1,0.2,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=9.674948183980905e-09,
+                               learning_rate=0.014247987845444413, loss='auto',
+                               max_iter=512, max_leaf_nodes=55,
+                               min_samples_leaf=164, n_iter_no_change=1,
+                               random_state=1,
+                               validation_fraction=0.11770489601182355,
+                               warm_start=True)"
+41166,volkert,Auto-sklearn,-0.874,-0.586,-0.644,1.829,-0.587,595.8,0.3,"LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr',
+                           tol=0.018821286956948503)"
+54,vehicle,Auto-sklearn,-0.964,-0.86,-0.859,0.408,-0.861,595.5,0.0,"MLPClassifier(activation='tanh', alpha=0.0002060405669905105, beta_1=0.999,
+              beta_2=0.9, hidden_layer_sizes=(87, 87, 87),
+              learning_rate_init=0.00040205833939989724, max_iter=256,
+              n_iter_no_change=32, random_state=1, validation_fraction=0.0,
+              verbose=0, warm_start=True)"
+40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,296.1,1.2,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
+40996,fashion-mnist,Auto-sklearn,-0.968,-0.864,-0.865,1.913,-0.866,595.5,0.8,"KNeighborsClassifier(n_neighbors=4, weights='distance')"
+42344,sf-police-incidents,Auto-sklearn,-0.574,-0.589,-0.574,15.367,-0.569,594.8,0.5,"HistGradientBoostingClassifier(early_stopping=True,
+                               l2_regularization=3.609412172481434e-10,
+                               learning_rate=0.05972079854295879, loss='auto',
+                               max_iter=512, max_leaf_nodes=4,
+                               min_samples_leaf=2, n_iter_no_change=14,
+                               random_state=1, validation_fraction=None,
+                               warm_start=True)"
+1240,airlinescodrnaadult,Auto-sklearn,-0.62,-0.683,-0.631,13.306,-0.658,594.3,0.1,"SGDClassifier(alpha=1.6992296128865824e-07, average=True, eta0=0.01, loss='log',
+              max_iter=512, penalty='l1', random_state=1,
+              tol=1.535384699341134e-05, warm_start=True)"
\ No newline at end of file