add MetaLearningApproach and its children
MorrisNein committed Oct 25, 2023
1 parent d83aeaa commit 111ec03

Showing 8 changed files with 227 additions and 109 deletions.
8 changes: 5 additions & 3 deletions experiments/fedot_warm_start/config.yaml
@@ -7,11 +7,13 @@ train_timeout: 15
 test_timeout: 15
 #meta_learning_params:
 n_best_dataset_models_to_memorize: 10
-n_closest_datasets_to_propose: 5
-minimal_distance_between_advised_models: 1
-n_best_models_to_advise: 5
 mf_extractor_params:
   groups: general
+assessor_params:
+  n_neighbors: 5
+advisor_params:
+  minimal_distance: 1
+  n_best_to_advise: 5
 #evaluation_params:
 collect_metrics:
   - f1
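For reference, a minimal sketch of how the regrouped keys are consumed after this change (loading mirrors run.py below; the config path is the file shown above, and the inline values are the ones from this diff):

import yaml

with open('experiments/fedot_warm_start/config.yaml') as config_file:
    config = yaml.load(config_file, yaml.Loader)

# Each component now owns its parameter group, passed through without renaming:
assessor_params = config['assessor_params']           # {'n_neighbors': 5}
advisor_params = config['advisor_params']             # {'minimal_distance': 1, 'n_best_to_advise': 5}
mf_extractor_params = config['mf_extractor_params']   # {'groups': 'general'}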
8 changes: 5 additions & 3 deletions experiments/fedot_warm_start/config_debug.yaml
@@ -7,11 +7,13 @@ train_timeout: 0.01
 test_timeout: 0.01
 #meta_learning_params:
 n_best_dataset_models_to_memorize: 10
-n_closest_datasets_to_propose: 5
-minimal_distance_between_advised_models: 1
-n_best_models_to_advise: 5
 mf_extractor_params:
   groups: general
+assessor_params:
+  n_neighbors: 2
+advisor_params:
+  minimal_distance: 1
+  n_best_to_advise: 5
 #evaluation_params:
 collect_metrics:
   - f1
158 changes: 56 additions & 102 deletions experiments/fedot_warm_start/run.py
@@ -1,47 +1,34 @@
 import functools
 import json
 import logging
+import pickle
 import timeit
-from pathlib import Path
-
-import yaml
-
 from datetime import datetime
-from itertools import chain
-from typing import Dict, List, Tuple, Sequence, Any
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import numpy as np
 import openml
 import pandas as pd
-
+import yaml
 from fedot.api.main import Fedot
 from fedot.core.data.data import InputData
 from fedot.core.optimisers.objective import MetricsObjective, PipelineObjectiveEvaluate
-from fedot.core.pipelines.adapters import PipelineAdapter
 from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
-from fedot.core.repository.quality_metrics_repository import QualityMetricsEnum, MetricsRepository
+from fedot.core.repository.quality_metrics_repository import MetricsRepository, QualityMetricsEnum
 from fedot.core.validation.split import tabular_cv_generator
 from golem.core.log import Log
-from golem.core.optimisers.fitness import SingleObjFitness
-from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
 
-
-from meta_automl.data_preparation.dataset import OpenMLDataset, DatasetData, DatasetBase
-from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.approaches import AdviseFedotPipelinesByNearestOpenmlDatasets
+from meta_automl.data_preparation.dataset import DatasetData, DatasetIDType, OpenMLDataset
 from meta_automl.data_preparation.datasets_train_test_split import openml_datasets_train_test_split
 from meta_automl.data_preparation.file_system import get_cache_dir
-from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
-from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
 
-
 CONFIG_PATH = Path(__file__).parent.joinpath('config.yaml')
 
-
 with open(CONFIG_PATH, 'r') as config_file:
     config = yaml.load(config_file, yaml.Loader)
 
@@ -52,9 +39,8 @@
 TRAIN_TIMEOUT = config['train_timeout']
 TEST_TIMEOUT = config['test_timeout']
 N_BEST_DATASET_MODELS_TO_MEMORIZE = config['n_best_dataset_models_to_memorize']
-N_CLOSEST_DATASETS_TO_PROPOSE = config['n_closest_datasets_to_propose']
-MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS = config['minimal_distance_between_advised_models']
-N_BEST_MODELS_TO_ADVISE = config['n_best_models_to_advise']
+ASSESSOR_PARAMS = config['assessor_params']
+ADVISOR_PARAMS = config['advisor_params']
 MF_EXTRACTOR_PARAMS = config['mf_extractor_params']
 COLLECT_METRICS = config['collect_metrics']
 COMMON_FEDOT_PARAMS = config['common_fedot_params']
@@ -99,27 +85,33 @@ def get_save_dir(time_now_for_path) -> Path:
     return save_dir
 
 
-def fetch_datasets() -> Tuple[pd.DataFrame, pd.DataFrame, Dict[int, OpenMLDataset]]:
-    """Returns dictionary with dataset names and cached datasets downloaded from OpenML."""
-
+def get_dataset_ids() -> List[DatasetIDType]:
     dataset_ids = openml.study.get_suite(99).data
-    if N_DATASETS is not None:
-        dataset_ids = pd.Series(dataset_ids)
-        dataset_ids = dataset_ids.sample(n=N_DATASETS, random_state=SEED)
+    return list(dataset_ids)
+
+
+def split_datasets(dataset_ids, n_datasets: Optional[int] = None, update_train_test_split: bool = False) \
+        -> Tuple[pd.DataFrame, pd.DataFrame]:
     split_path = Path(__file__).parent / 'train_test_datasets_split.csv'
+    if n_datasets is not None:
+        dataset_ids = pd.Series(dataset_ids)
+        dataset_ids = dataset_ids.sample(n=n_datasets, random_state=SEED)
 
-    if UPDATE_TRAIN_TEST_DATASETS_SPLIT:
+    if n_datasets is not None or update_train_test_split:
         df_split_datasets = openml_datasets_train_test_split(dataset_ids, test_size=TEST_SIZE, seed=SEED)
-        df_split_datasets.to_csv(split_path)
     else:
         df_split_datasets = pd.read_csv(split_path, index_col=0)
 
     df_datasets_train = df_split_datasets[df_split_datasets['is_train'] == 1]
     df_datasets_test = df_split_datasets[df_split_datasets['is_train'] == 0]
 
-    datasets = {dataset.id_: dataset for dataset in OpenMLDatasetsLoader().load(dataset_ids)}
-    return df_datasets_train, df_datasets_test, datasets
+    if update_train_test_split:
+        df_split_datasets.to_csv(split_path)
+
+    return df_datasets_train, df_datasets_test
 
 
 def evaluate_pipeline(pipeline: Pipeline,
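A hypothetical use of the two helpers that replace fetch_datasets(), assuming the module constants defined earlier in run.py:

dataset_ids = get_dataset_ids()                  # every dataset id in OpenML suite 99
# Reuse the split cached in train_test_datasets_split.csv:
df_train, df_test = split_datasets(dataset_ids)
# Subsample 20 datasets, recompute the split, and persist it back to the CSV:
df_train, df_test = split_datasets(dataset_ids, n_datasets=20, update_train_test_split=True)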
@@ -145,24 +137,6 @@ def evaluate_pipeline(pipeline: Pipeline,
     return metric_values
 
 
-def fit_offline_meta_learning_components(best_models_per_dataset_id: Dict[int, Sequence[Model]]) \
-        -> (KNeighborsBasedSimilarityAssessor, PymfeExtractor, DiverseFEDOTPipelineAdvisor):
-    dataset_ids = list(best_models_per_dataset_id.keys())
-    # Meta Features
-    extractor = PymfeExtractor(extractor_params=MF_EXTRACTOR_PARAMS)
-    meta_features_train = extractor.extract(dataset_ids, fill_input_nans=True)
-    meta_features_train = meta_features_train.fillna(0)
-    # Datasets similarity
-    data_similarity_assessor = KNeighborsBasedSimilarityAssessor(
-        n_neighbors=min(len(dataset_ids), N_CLOSEST_DATASETS_TO_PROPOSE))
-    data_similarity_assessor.fit(meta_features_train, dataset_ids)
-    # Model advisor
-    model_advisor = DiverseFEDOTPipelineAdvisor(data_similarity_assessor, n_best_to_advise=N_BEST_MODELS_TO_ADVISE,
-                                                minimal_distance=MINIMAL_DISTANCE_BETWEEN_ADVISED_MODELS)
-    model_advisor.fit(best_models_per_dataset_id)
-    return extractor, model_advisor
-
-
 def transform_data_for_fedot(data: DatasetData) -> (np.array, np.array):
     x = data.x.to_numpy()
     y = data.y.to_numpy()
@@ -203,31 +177,6 @@ def get_result_data_row(dataset: OpenMLDataset, run_label: str, pipeline, histor
     return run_results
 
 
-def extract_best_models_from_history(dataset: DatasetBase, history: OptHistory) -> List[Model]:
-    if history.individuals:
-        best_individuals = sorted(chain(*history.individuals),
-                                  key=lambda ind: ind.fitness,
-                                  reverse=True)
-        for individual in history.final_choices:
-            if individual not in best_individuals:
-                best_individuals.insert(0, individual)
-
-        best_individuals = best_individuals[:N_BEST_DATASET_MODELS_TO_MEMORIZE]
-
-        best_individuals = list({ind.graph.descriptive_id: ind for ind in best_individuals}.values())
-        best_models = []
-        for individual in best_individuals:
-            pipeline = PipelineAdapter().restore(individual.graph)
-            fitness = individual.fitness or SingleObjFitness()
-            model = Model(pipeline, fitness, history.objective.metric_names[0], dataset)
-            best_models.append(model)
-    else:
-        pipeline = PipelineAdapter().restore(history.tuning_result)
-        best_models = [Model(pipeline, SingleObjFitness(), history.objective.metric_names[0], dataset)]
-
-    return best_models
-
-
 def save_experiment_params(params_dict: Dict[str, Any], save_dir: Path):
     """ Save the hyperparameters of the experiment """
     params_file_path = save_dir.joinpath('parameters.json')
@@ -278,31 +227,39 @@ def main():
     save_dir = get_save_dir(experiment_date_for_path)
     setup_logging(save_dir)
     progress_file_path = save_dir.joinpath('progress.txt')
+    meta_learner_path = save_dir.joinpath('meta_learner.pkl')
 
-    df_datasets_train, df_datasets_test, datasets_dict = fetch_datasets()
-
-    dataset_ids = list(datasets_dict.keys())
-    dataset_ids_test = df_datasets_train.index.to_list()
+    dataset_ids = get_dataset_ids()
+    df_datasets_train, df_datasets_test = split_datasets(dataset_ids, N_DATASETS, UPDATE_TRAIN_TEST_DATASETS_SPLIT)
+    dataset_ids_test = df_datasets_test.index.to_list()
 
     dataset_names_train = df_datasets_train['dataset_name'].to_list()
     dataset_names_test = df_datasets_test['dataset_name'].to_list()
 
+    algorithm = AdviseFedotPipelinesByNearestOpenmlDatasets(
+        N_BEST_DATASET_MODELS_TO_MEMORIZE,
+        MF_EXTRACTOR_PARAMS,
+        ASSESSOR_PARAMS,
+        ADVISOR_PARAMS
+    )
+    datasets = algorithm.components.datasets_loader.load(dataset_ids)
+    datasets_dict = {d.id_: d for d in datasets}
     datasets_dict_test = dict(filter(lambda item: item[0] in dataset_ids_test, datasets_dict.items()))
+    del datasets
 
     experiment_params_dict = dict(
-            experiment_start_date_iso=experiment_date_iso,
-            input_config=config,
-            dataset_ids=dataset_ids,
-            dataset_ids_train=dataset_ids_test,
-            dataset_names_train=dataset_names_train,
-            dataset_ids_test=dataset_ids_test,
-            dataset_names_test=dataset_names_test,
-            baseline_pipeline=BASELINE_MODEL,
-    )
+        experiment_start_date_iso=experiment_date_iso,
+        input_config=config,
+        dataset_ids=dataset_ids,
+        dataset_ids_train=dataset_ids_test,
+        dataset_names_train=dataset_names_train,
+        dataset_ids_test=dataset_ids_test,
+        dataset_names_test=dataset_names_test,
+        baseline_pipeline=BASELINE_MODEL,
+    )
     save_experiment_params(experiment_params_dict, save_dir)
 
-    best_models_per_dataset = {}
+    # Gathering knowledge base
+    train_histories_per_dataset = {}
     with open(progress_file_path, 'a') as progress_file:
         for dataset_id, dataset in tqdm(datasets_dict.items(), 'FEDOT, all datasets', file=progress_file):
             try:
@@ -311,31 +268,28 @@
                 fedot, run_results = fit_fedot(dataset=dataset, timeout=timeout, run_label='FEDOT')
                 save_evaluation(run_results, run_date, experiment_date, save_dir)
                 # TODO:
-                #   x Turn the tuned pipeline into a model (evaluate its fitness on the data)
-                #   x Evaluate historical pipelines on the data instead of using fitness
-                #   x Start FEDOT `N_BEST_DATASET_MODELS_TO_MEMORIZE` times, but not in one run
-                # Filter out unique individuals with the best fitness
-                history = fedot.history
-                best_models = extract_best_models_from_history(dataset, history)
-                best_models_per_dataset[dataset_id] = best_models
+
+                if dataset_id not in dataset_ids_test:
+                    history = fedot.history
+                    train_histories_per_dataset[dataset] = history
             except Exception:
                 logging.exception(f'Train dataset "{dataset_id}"')
 
-    best_models_per_dataset_test = {dataset_id: best_models_per_dataset[dataset_id] for dataset_id in dataset_ids_test}
-    mf_extractor, model_advisor = fit_offline_meta_learning_components(best_models_per_dataset_test)
+    # Learning
+    algorithm.fit(train_histories_per_dataset)
+    with open(meta_learner_path, 'wb') as meta_learner_file:
+        pickle.dump(algorithm, meta_learner_file)
 
     with open(progress_file_path, 'a') as progress_file:
-        for dataset_id, dataset in tqdm(datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
+        for dataset_id, dataset in tqdm(
+                datasets_dict_test.items(), 'MetaFEDOT, Test datasets', file=progress_file):
             try:
                 # Run meta AutoML
                 # 1
                 time_start = timeit.default_timer()
-                meta_features = mf_extractor.extract([dataset],
-                                                     fill_input_nans=True, use_cached=False, update_cached=True)
-                meta_features = meta_features.fillna(0)
+                initial_assumptions = algorithm.predict([dataset_id])[0]
                 meta_learning_time_sec = timeit.default_timer() - time_start
-                initial_assumptions = model_advisor.predict(meta_features)[0]
 
                 assumption_pipelines = [model.predictor for model in initial_assumptions]
                 # 2
                 run_date = datetime.now()
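A compact restatement of the reworked main() flow, as a sketch: fit_fedot, TRAIN_TIMEOUT and the loaded datasets_dict come from the surrounding script, and timeout handling is simplified here.

import pickle

def run_meta_learning(algorithm, datasets_dict, dataset_ids_test, meta_learner_path):
    train_histories_per_dataset = {}
    for dataset_id, dataset in datasets_dict.items():
        fedot, _ = fit_fedot(dataset=dataset, timeout=TRAIN_TIMEOUT, run_label='FEDOT')
        if dataset_id not in dataset_ids_test:       # knowledge base: train datasets only
            train_histories_per_dataset[dataset] = fedot.history
    algorithm.fit(train_histories_per_dataset)       # meta-features + assessor + advisor in one call
    with open(meta_learner_path, 'wb') as f:
        pickle.dump(algorithm, f)                    # persist the fitted meta-learner for reuse
    # At test time a single predict() call replaces manual meta-feature extraction and advising:
    return {d_id: algorithm.predict([d_id])[0] for d_id in dataset_ids_test}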
3 changes: 3 additions & 0 deletions meta_automl/approaches/__init__.py
@@ -0,0 +1,3 @@
+from .meta_learning_approach import MetaLearningApproach
+from .advise_models_by_nearest_datasets import AdviseModelsByNearestDatasets
+from .advise_fedot_pipelines_by_nearest_openml_datasets import AdviseFedotPipelinesByNearestOpenmlDatasets
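The package now exposes three levels of abstraction, and the import order above matters because each child module imports its parent's class through the package. A sketch of the assumed hierarchy (MetaLearningApproach as the common base is implied by the commit title and by the subclassing in the file below):

from meta_automl.approaches import (
    MetaLearningApproach,                         # abstract base (per the commit title)
    AdviseModelsByNearestDatasets,                # generic advise-by-nearest-datasets strategy
    AdviseFedotPipelinesByNearestOpenmlDatasets,  # FEDOT + OpenML specialization (see below)
)

# The concrete class subclasses AdviseModelsByNearestDatasets, as its definition shows:
assert issubclass(AdviseFedotPipelinesByNearestOpenmlDatasets, AdviseModelsByNearestDatasets)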
66 changes: 66 additions & 0 deletions meta_automl/approaches/advise_fedot_pipelines_by_nearest_openml_datasets.py
@@ -0,0 +1,66 @@
+from dataclasses import dataclass, field
+from typing import Dict
+
+from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
+
+from meta_automl.approaches import AdviseModelsByNearestDatasets
+from meta_automl.data_preparation.dataset import DatasetBase
+from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
+from meta_automl.data_preparation.models_loaders import FedotHistoryLoader
+from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
+
+
+class AdviseFedotPipelinesByNearestOpenmlDatasets(AdviseModelsByNearestDatasets):
+    @dataclass
+    class Parameters:
+        n_best_dataset_models_to_memorize: int
+        mf_extractor_params: dict = field(default_factory=dict)
+        assessor_params: dict = field(default_factory=dict)
+        advisor_params: dict = field(default_factory=dict)
+        advisor_class = DiverseFEDOTPipelineAdvisor
+
+    @dataclass
+    class Components:
+        datasets_loader: OpenMLDatasetsLoader = None
+        models_loader: FedotHistoryLoader = None
+        meta_features_extractor: PymfeExtractor = None
+        datasets_similarity_assessor: KNeighborsBasedSimilarityAssessor = None
+        model_advisor: DiverseFEDOTPipelineAdvisor = None
+
+    def __init__(self, n_best_dataset_models_to_memorize: int,
+                 mf_extractor_params: dict, assessor_params: dict, advisor_params: dict):
+        super().__init__(
+            n_best_dataset_models_to_memorize=n_best_dataset_models_to_memorize,
+            mf_extractor_params=mf_extractor_params,
+            assessor_params=assessor_params,
+            advisor_params=advisor_params,
+        )
+        self.components.datasets_loader = OpenMLDatasetsLoader()
+
+    def fit(self, histories_per_dataset: Dict[DatasetBase, OptHistory]):
+        params = self.parameters
+        self.load_datasets_and_models(histories_per_dataset)
+        self.extract_train_meta_features(**params.mf_extractor_params)
+        self.fit_datasets_similarity_assessor(**params.assessor_params)
+        self.fit_model_advisor(**params.advisor_params)
+
+    def load_datasets_and_models(self, histories_per_dataset):
+        self.data.datasets = list(histories_per_dataset.keys())
+        self.components.models_loader = FedotHistoryLoader()
+        best_models = self.components.models_loader.load(
+            *zip(*histories_per_dataset.items()),
+            self.parameters.n_best_dataset_models_to_memorize
+        )
+        best_models_per_dataset_id = {dataset.id_: models for dataset, models in
+                                      zip(histories_per_dataset.keys(), best_models)}
+        self.data.best_models_per_dataset_id = best_models_per_dataset_id
+
+    def extract_train_meta_features(self, **mf_extractor_params):
+        self.components.meta_features_extractor = PymfeExtractor(extractor_params=mf_extractor_params,
+                                                                 datasets_loader=self.components.datasets_loader)
+        meta_features_train = self.components.meta_features_extractor.extract(
+            self.data.datasets, fill_input_nans=True)
+        meta_features_train = meta_features_train.fillna(0)
+        self.data.meta_features = meta_features_train
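Assuming histories gathered as in run.py (a Dict[DatasetBase, OptHistory] named histories_per_dataset here), the new approach is driven end to end like this. A sketch: parameter values mirror config.yaml, and predict() is inherited from the parent class, as its use in run.py shows.

approach = AdviseFedotPipelinesByNearestOpenmlDatasets(
    n_best_dataset_models_to_memorize=10,
    mf_extractor_params={'groups': 'general'},
    assessor_params={'n_neighbors': 5},
    advisor_params={'minimal_distance': 1, 'n_best_to_advise': 5},
)
approach.fit(histories_per_dataset)           # load models, extract meta-features, fit assessor + advisor
advised = approach.predict([dataset_id])[0]   # advised models for the single requested dataset id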