diff --git a/experiments/fedot_warm_start/config.yaml b/experiments/fedot_warm_start/config.yaml
index bcab1083..5023e74b 100644
--- a/experiments/fedot_warm_start/config.yaml
+++ b/experiments/fedot_warm_start/config.yaml
@@ -7,11 +7,13 @@ train_timeout: 15
 test_timeout: 15
 #meta_learning_params:
 n_best_dataset_models_to_memorize: 10
-n_closest_datasets_to_propose: 5
-minimal_distance_between_advised_models: 1
-n_best_models_to_advise: 5
 mf_extractor_params:
   groups: general
+assessor_params:
+  n_neighbors: 5
+advisor_params:
+  minimal_distance: 1
+  n_best_to_advise: 5
 #evaluation_params:
 collect_metrics:
   - f1
diff --git a/experiments/fedot_warm_start/config_debug.yaml b/experiments/fedot_warm_start/config_debug.yaml
index 226cbe38..11d0d26a 100644
--- a/experiments/fedot_warm_start/config_debug.yaml
+++ b/experiments/fedot_warm_start/config_debug.yaml
@@ -7,11 +7,13 @@ train_timeout: 0.01
 test_timeout: 0.01
 #meta_learning_params:
 n_best_dataset_models_to_memorize: 10
-n_closest_datasets_to_propose: 5
-minimal_distance_between_advised_models: 1
-n_best_models_to_advise: 5
 mf_extractor_params:
   groups: general
+assessor_params:
+  n_neighbors: 2
+advisor_params:
+  minimal_distance: 1
+  n_best_to_advise: 5
 #evaluation_params:
 collect_metrics:
   - f1
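Note on the config changes above: the three flat keys (n_closest_datasets_to_propose, minimal_distance_between_advised_models, n_best_models_to_advise) are replaced by nested assessor_params / advisor_params sections, which are forwarded as keyword arguments to the similarity assessor and the pipeline advisor. A rough illustration, not part of the diff (the loading call mirrors run.py):

    import yaml

    from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
    from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor

    with open('experiments/fedot_warm_start/config.yaml', 'r') as config_file:
        config = yaml.load(config_file, yaml.Loader)

    # The nested config sections map directly onto the component constructors:
    assessor = KNeighborsBasedSimilarityAssessor(**config['assessor_params'])    # n_neighbors=5
    advisor = DiverseFEDOTPipelineAdvisor(assessor, **config['advisor_params'])  # minimal_distance=1, n_best_to_advise=5
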
diff --git a/experiments/fedot_warm_start/run.py b/experiments/fedot_warm_start/run.py
index bc181902..9a3de148 100644
--- a/experiments/fedot_warm_start/run.py
+++ b/experiments/fedot_warm_start/run.py
@@ -1,47 +1,34 @@
 import functools
 import json
 import logging
+import pickle
 import timeit
-from pathlib import Path
-
-import yaml
-
 from datetime import datetime
-from itertools import chain
-from typing import Dict, List, Tuple, Sequence, Any
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple

 import numpy as np
 import openml
 import pandas as pd
-
+import yaml
 from fedot.api.main import Fedot
 from fedot.core.data.data import InputData
 from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate
-from fedot.core.pipelines.adapters import PipelineAdapter
 from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
-from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository
+from fedot.core.repository.quality_metrics_repository import MetricsRepository, QualityMetricsEnum
 from fedot.core.validation.split import tabular_cv_generator
 from golem.core.log import Log
-from golem.core.optimisers.fitness import SingleObjFitness
-from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
-
-from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase
-from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.approaches import AdviseFedotPipelinesByNearestOpenmlDatasets
+from meta_automl.data_preparation.dataset import DatasetData, DatasetIDType, OpenMLDataset
 from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
 from meta_automl.data_preparation.file_system import get_cache_dir
-from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
-from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
-

 CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml')
-
 with open(CONFIG_PATH, 'r') as config_file:
     config = yaml.load(config_file, yaml.Loader)
@@ -52,9 +39,8 @@
 TRAIN_TIMEOUT = config['train_timeout']
 TEST_TIMEOUT = config['test_timeout']
 N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize']
-N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose']
-MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models']
-N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise']
+ASSESSOR_PARAMS = config['assessor_params']
+ADVISOR_PARAMS = config['advisor_params']
 MF_EXTRACTOR_PARAMS = config['mf_extractor_params']
 COLLECT_METRICS = config['collect_metrics']
 COMMON_FEDOT_PARAMS = config['common_fedot_params']
@@ -99,27 +85,33 @@ def get_save_dir(time_now_for_path) -> Path:
     return save_dir


-def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
-    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
-
+def get_dataset_ids() -> List[DatasetIDType]:
     dataset_ids = openml.study.get_suite(99).data
     if N_DATASETS is not None:
         dataset_ids = pd.Series(dataset_ids)
         dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
+    return list(dataset_ids)
+
+
+def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False) \
+        -> Tuple[pd.DataFrame, pd.DataFrame]:
     split_path = Path(__file__).parent / 'train_test_datasets_split.csv'
+    if n_datasets is not None:
+        dataset_ids = pd.Series(dataset_ids)
+        dataset_ids = dataset_ids.sample(n=n_datasets, random_state=SEED)

-    if UPDATE_TRAIN_TEST_DATASETS_SPLIT:
+    if n_datasets is not None or update_train_test_split:
         df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED)
-        df_split_datasets.to_csv(split_path)
     else:
         df_split_datasets = pd.read_csv(split_path, index_col=0)

     df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1]
     df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0]

-    datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)}
-    return df_datasets_train, df_datasets_test, datasets
+    if update_train_test_split:
+        df_split_datasets.to_csv(split_path)
+
+    return df_datasets_train, df_datasets_test


 def evaluate_pipeline(pipeline: Pipeline,
@@ -145,24 +137,6 @@ def evaluate_pipeline(pipeline: Pipeline,
     return metric_values


-def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \
-        -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor):
-    dataset_ids = list(best_models_per_dataset_id.keys())
-    # Meta Features
-    extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS)
-    meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True)
-    meta_features_train = meta_features_train.fillna(0)
-    # Datasets similarity
-    data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
-        n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE))
-    data_similarity_assessor.fit(meta_features_train, dataset_ids)
-    # Model advisor
-    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
-                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
-    model_advisor.fit(best_models_per_dataset_id)
-    return extractor, model_advisor
-
-
 def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
     x = data.x.to_numpy()
     y = data.y.to_numpy()
@@ -203,31 +177,6 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor
     return run_results


-def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]:
-    if history.individuals:
-        best_individuals = sorted(chain(*history.individuals),
-                                  key=lambda ind: ind.fitness,
-                                  reverse=True)
-        for individual in history.final_choices:
-            if individual not in best_individuals:
-                best_individuals.insert(0, individual)
-
-        best_individuals = best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]
-
-        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        best_models = []
-        for individual in best_individuals:
-            pipeline = PipelineAdapter().restore(individual.graph)
-            fitness = individual.fitness or SingleObjFitness()
-            model = Model(pipeline, fitness, history.objective.metric_names[0], dataset)
-            best_models.append(model)
-    else:
-        pipeline = PipelineAdapter().restore(history.tuning_result)
-        best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)]
-
-    return best_models
-
-
 def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path):
     """ Save the hyperparameters of the experiment """
     params_file_path = save_dir.joinpath('parameters.json')
@@ -278,31 +227,39 @@ def main():
     save_dir = get_save_dir(experiment_date_for_path)
     setup_logging(save_dir)
     progress_file_path = save_dir.joinpath('progress.txt')
+    meta_learner_path = save_dir.joinpath('meta_learner.pkl')

-    df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets()
-
-    dataset_ids = list(datasets_dict.keys())
+    dataset_ids = get_dataset_ids()
+    df_datasets_train, df_datasets_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT)
     dataset_ids_train = df_datasets_train.index.to_list()
     dataset_ids_test = df_datasets_test.index.to_list()
-
     dataset_names_train = df_datasets_train['dataset_name'].to_list()
     dataset_names_test = df_datasets_test['dataset_name'].to_list()

+    algorithm = AdviseFedotPipelinesByNearestOpenmlDatasets(
+        N_BEST_DATASET_MODELS_TO_MEMORIZE,
+        MF_EXTRACTOR_PARAMS,
+        ASSESSOR_PARAMS,
+        ADVISOR_PARAMS
+    )
+    datasets = algorithm.components.datasets_loader.load(dataset_ids)
+    datasets_dict = {d.id_: d for d in datasets}
     datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items()))
+    del datasets

     experiment_params_dict = dict(
-            experiment_start_date_iso=experiment_date_iso,
-            input_config=config,
-            dataset_ids=dataset_ids,
-            dataset_ids_train=dataset_ids_train,
-            dataset_names_train=dataset_names_train,
-            dataset_ids_test=dataset_ids_test,
-            dataset_names_test=dataset_names_test,
-            baseline_pipeline=BASELINE_MODEL,
-        )
+        experiment_start_date_iso=experiment_date_iso,
+        input_config=config,
+        dataset_ids=dataset_ids,
+        dataset_ids_train=dataset_ids_train,
+        dataset_names_train=dataset_names_train,
+        dataset_ids_test=dataset_ids_test,
+        dataset_names_test=dataset_names_test,
+        baseline_pipeline=BASELINE_MODEL,
+    )
     save_experiment_params(experiment_params_dict, save_dir)
-
-    best_models_per_dataset = {}
+    # Gathering knowledge base
+    train_histories_per_dataset = {}
     with open(progress_file_path, 'a') as progress_file:
         for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file):
             try:
@@ -311,31 +268,28 @@
                 fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
                 save_evaluation(run_results, run_date, experiment_date, save_dir)
                 # TODO:
-                #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
-                #   x Evaluate historical pipelines on the data instead of using fitness
                 #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
-
-                # Filter out unique individuals with the best fitness
-                history = fedot.history
-                best_models = extract_best_models_from_history(dataset, history)
-                best_models_per_dataset[dataset_id] = best_models
+                if dataset_id not in dataset_ids_test:
+                    history = fedot.history
+                    train_histories_per_dataset[dataset] = history
             except Exception:
                 logging.exception(f'Train dataset "{dataset_id}"')

-    best_models_per_dataset_test = {dataset_id: best_models_per_dataset[dataset_id] for dataset_id in dataset_ids_test}
-    mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset_test)
+    # Learning
+    algorithm.fit(train_histories_per_dataset)
+    with open(meta_learner_path, 'wb') as meta_learner_file:
+        pickle.dump(algorithm, meta_learner_file)

     with open(progress_file_path, 'a') as progress_file:
-        for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
+        for dataset_id, dataset in tqdm(
+                datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
             try:
                 # Run meta AutoML
                 # 1
                 time_start = timeit.default_timer()
-                meta_features = mf_extractor.extract([dataset],
-                                                     fill_input_nans=True, use_cached=False, update_cached=True)
-                meta_features = meta_features.fillna(0)
+                initial_assumptions = algorithm.predict([dataset_id])[0]
                 meta_learning_time_sec = timeit.default_timer() - time_start
-                initial_assumptions = model_advisor.predict(meta_features)[0]
+                assumption_pipelines = [model.predictor for model in initial_assumptions]
                 # 2
                 run_date = datetime.now()
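Since main() now pickles the fitted meta-learner to meta_learner.pkl in the save directory, it can be reloaded later without repeating the knowledge-base phase. A minimal sketch, not part of the diff (advise_for_dataset is a hypothetical helper; save_dir and dataset_id stand for the experiment output directory and an OpenML dataset id):

    import pickle
    from pathlib import Path
    from typing import List


    def advise_for_dataset(save_dir: Path, dataset_id) -> List:
        """Reload the meta-learner persisted by main() and advise pipelines for one dataset id."""
        with open(save_dir / 'meta_learner.pkl', 'rb') as meta_learner_file:
            algorithm = pickle.load(meta_learner_file)
        # predict() returns a list of advised models per requested dataset;
        # each Model wraps a FEDOT Pipeline in its `predictor` attribute.
        initial_assumptions = algorithm.predict([dataset_id])[0]
        return [model.predictor for model in initial_assumptions]
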
diff --git a/meta_automl/approaches/__init__.py b/meta_automl/approaches/__init__.py
new file mode 100644
index 00000000..56e526a3
--- /dev/null
+++ b/meta_automl/approaches/__init__.py
@@ -0,0 +1,3 @@
+from .meta_learning_approach import MetaLearningApproach
+from .advise_models_by_nearest_datasets import AdviseModelsByNearestDatasets
+from .advise_fedot_pipelines_by_nearest_openml_datasets import AdviseFedotPipelinesByNearestOpenmlDatasets
diff --git a/meta_automl/approaches/advise_fedot_pipelines_by_nearest_openml_datasets.py b/meta_automl/approaches/advise_fedot_pipelines_by_nearest_openml_datasets.py
new file mode 100644
index 00000000..601da6d8
--- /dev/null
+++ b/meta_automl/approaches/advise_fedot_pipelines_by_nearest_openml_datasets.py
@@ -0,0 +1,66 @@
+from dataclasses import dataclass, field
+from typing import Dict
+
+from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
+
+from meta_automl.approaches import AdviseModelsByNearestDatasets
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+from meta_automl.data_preparation.models_loaders import FedotHistoryLoader
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
+
+
+class AdviseFedotPipelinesByNearestOpenmlDatasets(AdviseModelsByNearestDatasets):
+    @dataclass
+    class Parameters:
+        n_best_dataset_models_to_memorize: int
+        mf_extractor_params: dict = field(default_factory=dict)
+        assessor_params: dict = field(default_factory=dict)
+        advisor_params: dict = field(default_factory=dict)
+        advisor_class = DiverseFEDOTPipelineAdvisor
+
+    @dataclass
+    class Components:
+        datasets_loader: OpenMLDatasetsLoader = None
+        models_loader: FedotHistoryLoader = None
+        meta_features_extractor: PymfeExtractor = None
+        datasets_similarity_assessor: KNeighborsBasedSimilarityAssessor = None
+        model_advisor: DiverseFEDOTPipelineAdvisor = None
+
+    def __init__(self, n_best_dataset_models_to_memorize: int,
+                 mf_extractor_params: dict, assessor_params: dict, advisor_params: dict):
+        super().__init__(
+            n_best_dataset_models_to_memorize=n_best_dataset_models_to_memorize,
+            mf_extractor_params=mf_extractor_params,
+            assessor_params=assessor_params,
+            advisor_params=advisor_params,
+        )
+        self.components.datasets_loader = OpenMLDatasetsLoader()
+
+    def fit(self, histories_per_dataset: Dict[DatasetBase, OptHistory]):
+        params = self.parameters
+        self.load_datasets_and_models(histories_per_dataset)
+        self.extract_train_meta_features(**params.mf_extractor_params)
+        self.fit_datasets_similarity_assessor(**params.assessor_params)
+        self.fit_model_advisor(**params.advisor_params)
+
+    def load_datasets_and_models(self, histories_per_dataset):
+        self.data.datasets = list(histories_per_dataset.keys())
+        self.components.models_loader = FedotHistoryLoader()
+        best_models = self.components.models_loader.load(
+            *zip(*histories_per_dataset.items()),
+            self.parameters.n_best_dataset_models_to_memorize
+        )
+        best_models_per_dataset_id = {dataset.id_: models for dataset, models in
+                                      zip(histories_per_dataset.keys(), best_models)}
+        self.data.best_models_per_dataset_id = best_models_per_dataset_id
+
+    def extract_train_meta_features(self, **mf_extractor_params):
+        self.components.meta_features_extractor = PymfeExtractor(extractor_params=mf_extractor_params,
+                                                                 datasets_loader=self.components.datasets_loader)
+        meta_features_train = self.components.meta_features_extractor.extract(
+            self.data.datasets, fill_input_nans=True)
+        meta_features_train = meta_features_train.fillna(0)
+        self.data.meta_features = meta_features_train
diff --git a/meta_automl/approaches/advise_models_by_nearest_datasets.py b/meta_automl/approaches/advise_models_by_nearest_datasets.py
new file mode 100644
index 00000000..a2027f37
--- /dev/null
+++ b/meta_automl/approaches/advise_models_by_nearest_datasets.py
@@ -0,0 +1,62 @@
+from abc import abstractmethod
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+import pandas as pd
+
+from meta_automl.approaches import MetaLearningApproach
+from meta_automl.data_preparation.dataset import DatasetBase, DatasetIDType
+from meta_automl.data_preparation.meta_features_extractors import MetaFeaturesExtractor
+from meta_automl.data_preparation.model import Model
+from meta_automl.meta_algorithm.datasets_similarity_assessors import DatasetsSimilarityAssessor, \
+    KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import ModelAdvisor, SimpleSimilarityModelAdvisor
+
+
+class AdviseModelsByNearestDatasets(MetaLearningApproach):
+    @dataclass
+    class Parameters:
+        assessor_params: dict = field(default_factory=dict)
+        advisor_params: dict = field(default_factory=dict)
+        advisor_class = SimpleSimilarityModelAdvisor
+
+    @dataclass
+    class Data:
+        meta_features: pd.DataFrame = None
+        datasets: List[DatasetBase] = None
+        best_models_per_dataset_id: Dict[DatasetIDType, List[Model]] = None
+
+    @dataclass
+    class Components:
+        meta_features_extractor: MetaFeaturesExtractor = None
+        datasets_similarity_assessor: DatasetsSimilarityAssessor = None
+        model_advisor: ModelAdvisor = None
+
+    @abstractmethod
+    def fit(self, *args, **kwargs):
+        # ... # load data
+        # self.fit_datasets_similarity_assessor(**self.parameters.assessor_params)
+        # self.fit_model_advisor(**self.parameters.advisor_params)
+        raise NotImplementedError()
+
+    def fit_datasets_similarity_assessor(self, **assessor_params):
+        data = self.data
+        components = self.components
+        components.datasets_similarity_assessor = KNeighborsBasedSimilarityAssessor(**assessor_params)
+        components.datasets_similarity_assessor.fit(data.meta_features, data.datasets)
+
+    def fit_model_advisor(self, **advisor_params):
+        params = self.parameters
+        data = self.data
+        components = self.components
+        components.model_advisor = params.advisor_class(components.datasets_similarity_assessor, **advisor_params)
+        components.model_advisor.fit(data.best_models_per_dataset_id)
+
+    def predict(self, datasets_ids) -> List[List[Model]]:
+        extraction_params = dict(
+            fill_input_nans=True, use_cached=False, update_cached=True
+        )
+        mf_extractor = self.components.meta_features_extractor
+        advisor = self.components.model_advisor
+        meta_features = mf_extractor.extract(datasets_ids, **extraction_params).fillna(0)
+        return advisor.predict(meta_features)
diff --git a/meta_automl/approaches/meta_learning_approach.py b/meta_automl/approaches/meta_learning_approach.py
new file mode 100644
index 00000000..75782ce1
--- /dev/null
+++ b/meta_automl/approaches/meta_learning_approach.py
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+class MetaLearningApproach(ABC):
+    @dataclass
+    class Parameters:
+        pass
+
+    @dataclass
+    class Data:
+        pass
+
+    @dataclass
+    class Components:
+        pass
+
+    def __init__(self, *args, **kwargs):
+        self.parameters = self.Parameters(*args, **kwargs)
+        self.data = self.Data()
+        self.components = self.Components()
+
+    @abstractmethod
+    def fit(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError()
diff --git a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
index 116e9e17..6ff0a30c 100644
--- a/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
+++ b/meta_automl/meta_algorithm/model_advisors/diverse_fedot_pipeline_advisor.py
@@ -25,7 +25,7 @@ def _predict_single(self, similar_dataset_ids: Iterable[DatasetIDType]) -> List[
             first_model = dataset_advice[0]
             diverse_dataset_advice = [first_model]
             for model in dataset_advice[1:]:
-                if self.distance_func(first_model.predictor, model.predictor) > self.minimal_distance:
+                if self.distance_func(first_model.predictor, model.predictor) >= self.minimal_distance:
                     diverse_dataset_advice.append(model)

             if self.n_best_to_advise is not None:
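For reference, a minimal end-to-end sketch of the new approach API added above, mirroring the flow in experiments/fedot_warm_start/run.py. The parameter values are the ones from config.yaml; train_histories_per_dataset stands for the Dict[DatasetBase, OptHistory] gathered from fedot.history on the training datasets, and test_dataset_ids for the held-out OpenML dataset ids:

    from meta_automl.approaches import AdviseFedotPipelinesByNearestOpenmlDatasets

    algorithm = AdviseFedotPipelinesByNearestOpenmlDatasets(
        n_best_dataset_models_to_memorize=10,
        mf_extractor_params={'groups': 'general'},
        assessor_params={'n_neighbors': 5},
        advisor_params={'minimal_distance': 1, 'n_best_to_advise': 5},
    )

    # Offline phase: memorize the best pipelines per training dataset, extract
    # meta-features, and fit the KNN similarity assessor and the diverse advisor.
    algorithm.fit(train_histories_per_dataset)

    # Online phase: advise initial pipelines for unseen datasets by their ids;
    # each advised Model wraps a FEDOT Pipeline in its `predictor` attribute.
    advised_models_per_dataset = algorithm.predict(test_dataset_ids)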