Fix common issues (#68)
* fix inner components

* fix time series components

* separate interface of DatasetSimilarityAssessor and ModelAdvisor

* fix examples

* fix typos

* fix feature extractors

* make abstract classes inherit ABC

* rename Model to EvaluatedModel

* fix path to surrogate knowledge base

* fix test_file_system.py, add test_cache.py

* use Path instead of str

* pep8

* pep8 & minor fixes

* fix classes inheritance

* fix ts example

* add get_checkpoints_dir(), fix examples

* add test_checkpoints_dir

* pep8

* delete inconsistent example

* fix type hints

* fix logging

* use __init__.py files for time series components

* update the archive

* fix arbitrary path at advise_by_surrogate.py example

* split index with df

* remove unnecessary context

* better mode computation

* test input nans filling by default
MorrisNein authored Nov 10, 2023
1 parent 069719f commit 150d53c
Showing 51 changed files with 526 additions and 403 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -31,7 +31,7 @@ and maintaining their external interfaces.
Automate dataset management, including retrieval, caching, and loading into memory. Optimize experiments by minimizing
calls to the dataset source and conserve memory usage.

-### Models Loader & Model
+### Models Loader & EvaluatedModel

Import and consolidate model evaluation data for datasets. Support experiment selection based on predefined criteria,
currently compatible with FEDOT AutoML framework results.
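
For orientation, the renamed components are used as in the sketch below, which mirrors the updated example scripts in this commit; the dataset names and the `extractor_params` value are illustrative assumptions, not part of the change.

```python
from sklearn.model_selection import train_test_split

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor

# Load datasets by their OpenML names (downloaded once, then cached on disk).
datasets = OpenMLDatasetsLoader().load(
    ['monks-problems-1', 'credit-g', 'blood-transfusion-service-center', 'australian'],
    allow_names=True)

# Extract meta-features and drop columns with NaNs, since KNN cannot handle them.
meta_features = PymfeExtractor(extractor_params={'groups': 'general'}).extract(datasets)
meta_features = meta_features.dropna(axis=1, how='any')

# Fit the similarity assessor on train meta-features labelled by dataset id.
mf_train, mf_test, did_train, did_test = train_test_split(
    meta_features, meta_features.index, train_size=0.75, random_state=42)
assessor = KNeighborsSimilarityAssessor(n_neighbors=2).fit(mf_train, did_train)

# For each test dataset, get the ids of the most similar train datasets.
similar_ids = assessor.predict(mf_test)
```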
Binary file modified data/pymfe_meta_features_and_fedot_pipelines.zip
5 changes: 2 additions & 3 deletions docs/source/gamlet/gamlet.rst
@@ -11,9 +11,8 @@ Datasets Loader & Dataset

| Automate dataset management, including retrieval, caching, and loading into memory.
| Optimize experiments by minimizing calls to the dataset source and conserve memory usage.
-Models Loader & Model
----------------------
+Models Loader & EvaluatedModel
+------------------------------

| Import and consolidate model evaluation data for datasets.
| Support experiment selection based on predefined criteria, currently compatible with FEDOT AutoML framework results.
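
A hedged sketch of the renamed `EvaluatedModel` container and the advisor interface, following the updated `advise_models_from_similar_datasets.py` example; the `PipelineBuilder` and `SingleObjFitness` import paths, the metric name, the fitness value, and the nested-list shape passed to `predict` are assumptions.

```python
from fedot.core.pipelines.pipeline_builder import PipelineBuilder
from golem.core.optimisers.fitness import SingleObjFitness

from meta_automl.data_preparation.dataset import OpenMLDataset
from meta_automl.data_preparation.evaluated_model import EvaluatedModel
from meta_automl.meta_algorithm.model_advisors import DiverseModelAdvisor

dataset_id = 31  # OpenML id of 'credit-g'; any dataset id with evaluated models works here.

# Wrap a FEDOT pipeline together with its evaluation result for that dataset.
pipeline = PipelineBuilder().add_node('scaling').add_node('rf').build()
model = EvaluatedModel(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))

# The advisor keeps the best models per dataset id and returns them
# for the dataset ids that a similarity assessor later predicts.
advisor = DiverseModelAdvisor(minimal_distance=2).fit(dataset_ids=[dataset_id], models=[[model]])
suggested_models = advisor.predict(dataset_ids=[[dataset_id]])
```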
2 changes: 1 addition & 1 deletion examples/0_loading_data/load_time_series_datasets.py
@@ -1,4 +1,4 @@
-from meta_automl.data_preparation.datasets_loaders.timeseries_dataset_loader import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.datasets_loaders import TimeSeriesDatasetsLoader


def get_datasets():
(diff of another changed file; file path not shown in this view)
@@ -1,10 +1,9 @@
import os
from pathlib import Path

-from meta_automl.data_preparation.datasets_loaders.timeseries_dataset_loader import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.datasets_loaders import TimeSeriesDatasetsLoader
from meta_automl.data_preparation.file_system import get_project_root
-from meta_automl.data_preparation.meta_features_extractors.time_series.time_series_meta_features_extractor import \
-TimeSeriesFeaturesExtractor
+from meta_automl.data_preparation.meta_features_extractors import TimeSeriesFeaturesExtractor


def main():
@@ -20,3 +19,4 @@ def main():

if __name__ == '__main__':
result = main()
+print(result)

This file was deleted.

(diff of another changed file; file path not shown in this view)
@@ -2,7 +2,7 @@

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor


def main():
@@ -17,7 +17,7 @@ def main():
# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
y_train = x_train.index
-assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3)
+assessor = KNeighborsSimilarityAssessor(n_neighbors=3)
assessor.fit(x_train, y_train)
# Get models for the best fitting datasets from train.
return x_test.index, assessor.predict(x_test, return_distance=True)
20 changes: 10 additions & 10 deletions examples/3_selecting_similar_datasets/select_similar_ts_by_knn.py
@@ -3,11 +3,10 @@

from sklearn.model_selection import train_test_split

-from meta_automl.data_preparation.datasets_loaders.timeseries_dataset_loader import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.datasets_loaders import TimeSeriesDatasetsLoader
from meta_automl.data_preparation.file_system import get_project_root
-from meta_automl.data_preparation.meta_features_extractors.time_series.time_series_meta_features_extractor import \
-TimeSeriesFeaturesExtractor
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
+from meta_automl.data_preparation.meta_features_extractors import TimeSeriesFeaturesExtractor
+from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor


def main():
@@ -21,13 +20,14 @@ def main():
# Preprocess meta-features, as KNN does not support NaNs.
meta_features = meta_features.dropna(axis=1, how='any')
# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
-x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
-y_train = x_train.index
-assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=3)
-assessor.fit(x_train, y_train)
-# Get models for the best fitting datasets from train.
-return x_test.index, assessor.predict(x_test, return_distance=True)
+mf_train, mf_test, did_train, did_test = train_test_split(meta_features, meta_features.index, train_size=0.75,
+random_state=42)
+assessor = KNeighborsSimilarityAssessor(n_neighbors=3)
+assessor.fit(mf_train, did_train)
+# Get the closest datasets from train.
+return did_test, assessor.predict(mf_test, return_distance=True)


if __name__ == '__main__':
result = main()
+print(result)
(diff of another changed file; file path not shown in this view)
@@ -6,17 +6,16 @@
from sklearn.model_selection import train_test_split
from tqdm import tqdm

-from meta_automl.data_preparation.dataset.time_series_dataset import TimeSeriesDataset
-from meta_automl.data_preparation.datasets_loaders.timeseries_dataset_loader import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.dataset import TimeSeriesDataset
+from meta_automl.data_preparation.datasets_loaders import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.evaluated_model import EvaluatedModel
from meta_automl.data_preparation.file_system import get_project_root
-from meta_automl.data_preparation.meta_features_extractors.time_series.time_series_meta_features_extractor import \
-TimeSeriesFeaturesExtractor
-from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
-from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
+from meta_automl.data_preparation.meta_features_extractors import TimeSeriesFeaturesExtractor
+from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseModelAdvisor


-def dataset_to_pipelines(d_id):
+def dataset_to_models(d_id):
adapter = PipelineAdapter()
dir_to_search = Path(get_project_root(), 'data', 'knowledge_base_time_series_0', 'datasets', d_id)
try:
@@ -31,10 +30,10 @@ def dataset_to_pipelines(d_id):
for ind in gen:
if ind.fitness.value < best_fitness:
pipeline = adapter.restore(ind.graph)
-best_model = Model(pipeline, ind.fitness.value, history.objective.metric_names[0],
-TimeSeriesDataset(d_id))
+best_model = EvaluatedModel(pipeline, ind.fitness.value, history.objective.metric_names[0],
+TimeSeriesDataset(d_id))
best_fitness = ind.fitness.value
-return best_model
+return [best_model]


def main():
@@ -47,22 +46,30 @@ def main():
extractor = TimeSeriesFeaturesExtractor()
meta_features = extractor.extract(datasets)
meta_features = meta_features.dropna(axis=1, how='any')
+dataset_ids = meta_features.index

# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
-x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
-y_train = x_train.index
+mf_train, mf_test, did_train, did_test = train_test_split(meta_features, dataset_ids, train_size=0.75,
+random_state=42)

# Define best models for datasets.
-dataset_names_to_best_pipelines = {}
-for d_id in tqdm(y_train):
-if dataset_to_pipelines(d_id) is not None:
-dataset_names_to_best_pipelines[d_id] = dataset_to_pipelines(d_id)
-x_train = x_train[x_train.index.isin(dataset_names_to_best_pipelines.keys())]
-y_train = y_train[y_train.isin(dataset_names_to_best_pipelines.keys())]
-assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2)
-assessor.fit(x_train, y_train)
-advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
-return advisor.predict(x_test)
+dataset_ids_to_best_models = {}
+for d_id in tqdm(did_train, 'Loading models for train datasets'):
+best_models_train = dataset_to_models(d_id)
+if best_models_train is not None:
+dataset_ids_to_best_models[d_id] = best_models_train
+mf_train = mf_train[mf_train.index.isin(dataset_ids_to_best_models.keys())]
+did_train = did_train[did_train.isin(dataset_ids_to_best_models.keys())]
+dataset_ids_train, best_models_train = zip(*dataset_ids_to_best_models.items())
+
+# Train the component that calculates similarity between datasets
+assessor = KNeighborsSimilarityAssessor(n_neighbors=2).fit(mf_train, did_train)
+# Train the component that remembers the best models for datasets
+advisor = DiverseModelAdvisor(minimal_distance=2).fit(dataset_ids_train, best_models_train)
+# Predict similar datasets from train
+did_pred = assessor.predict(mf_test)
+# Predict models for similar datasets
+return advisor.predict(did_pred)


if __name__ == '__main__':
14 changes: 10 additions & 4 deletions examples/4_advising_models/advise_by_surrogate.py
@@ -1,23 +1,29 @@
import os
import pickle
+from pathlib import Path

import yaml

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.file_system.file_system import get_configs_dir, get_project_root
from meta_automl.meta_algorithm.model_advisors import SurrogateGNNPipelineAdvisor


def main():
# Define datasets
dataset = OpenMLDatasetsLoader().load(["apsfailure"], allow_names=True)
# Extract meta-features and load on demand.

-with open("configs/run_surrogate_model.yml") as f:
+with open(get_configs_dir() / 'run_surrogate_model.yml') as f:
config = yaml.load(f, yaml.Loader)

-with open(os.path.join(config["dataset_params"]["root_path"], "pipelines.pickle"), "rb") as input_file:
+project_root = get_project_root()
+pipelines_data_path = Path(config["dataset_params"]["root_path"], "pipelines.pickle")
+pipelines_fedot_data_path = os.path.join(config["dataset_params"]["root_path"], "pipelines_fedot.pickle")
+pipelines_data_path = project_root / pipelines_data_path
+pipelines_fedot_data_path = project_root / pipelines_fedot_data_path
+with open(pipelines_data_path, "rb") as input_file:
pipelines_data = pickle.load(input_file)
-with open(os.path.join(config["dataset_params"]["root_path"], "pipelines_fedot.pickle"), "rb") as input_file:
+with open(pipelines_fedot_data_path, "rb") as input_file:
pipelines_fedot = pickle.load(input_file)

advisor = SurrogateGNNPipelineAdvisor(config, pipelines_data, pipelines_fedot)
30 changes: 17 additions & 13 deletions examples/4_advising_models/advise_models_from_similar_datasets.py
@@ -4,10 +4,10 @@

from meta_automl.data_preparation.dataset import OpenMLDataset
from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.evaluated_model import EvaluatedModel
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
-from meta_automl.data_preparation.model import Model
-from meta_automl.meta_algorithm.datasets_similarity_assessors import KNeighborsBasedSimilarityAssessor
-from meta_automl.meta_algorithm.model_advisors import DiverseFEDOTPipelineAdvisor
+from meta_automl.meta_algorithm.dataset_similarity_assessors import KNeighborsSimilarityAssessor
+from meta_automl.meta_algorithm.model_advisors import DiverseModelAdvisor


def main():
@@ -19,24 +19,28 @@ def main():
meta_features = extractor.extract(datasets)
# Preprocess meta-features, as KNN does not support NaNs.
meta_features = meta_features.dropna(axis=1, how='any')
+dataset_ids = meta_features.index
# Split datasets to train (preprocessing) and test (actual meta-algorithm objects).
-x_train, x_test = train_test_split(meta_features, train_size=0.75, random_state=42)
-y_train = x_train.index
-assessor = KNeighborsBasedSimilarityAssessor(n_neighbors=2)
-assessor.fit(x_train, y_train)
+mf_train, mf_test, did_train, did_test = train_test_split(meta_features, dataset_ids, train_size=0.75,
+random_state=42)

# Define best models for datasets.
best_pipelines = [
PipelineBuilder().add_node('scaling').add_node('rf').build(),
PipelineBuilder().add_node('normalization').add_node('logit').build(),
PipelineBuilder().add_node('rf').add_node('logit').build()
]
-best_models = [[Model(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
-for dataset_id, pipeline in zip(y_train, best_pipelines)]

-dataset_names_to_best_pipelines = dict(zip(y_train, best_models))
-advisor = DiverseFEDOTPipelineAdvisor(assessor, minimal_distance=2).fit(dataset_names_to_best_pipelines)
+best_models_train = [[EvaluatedModel(pipeline, SingleObjFitness(1), 'some_metric_name', OpenMLDataset(dataset_id))]
+for dataset_id, pipeline in zip(did_train, best_pipelines)]

-return advisor.predict(x_test)
+# Train the component that calculates similarity between datasets
+assessor = KNeighborsSimilarityAssessor(n_neighbors=2).fit(mf_train, did_train)
+# Train the component that remembers the best models for datasets
+advisor = DiverseModelAdvisor(minimal_distance=2).fit(dataset_ids=did_train, models=best_models_train)
+# Predict similar datasets from train
+did_pred = assessor.predict(mf_test)
+# Predict models for similar datasets
+return advisor.predict(dataset_ids=did_pred)


if __name__ == '__main__':
5 changes: 2 additions & 3 deletions examples/5_visualisation/visualisation_ts_embeddings.py
@@ -8,10 +8,9 @@
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import QuantileTransformer

-from meta_automl.data_preparation.datasets_loaders.timeseries_dataset_loader import TimeSeriesDatasetsLoader
+from meta_automl.data_preparation.datasets_loaders import TimeSeriesDatasetsLoader
from meta_automl.data_preparation.file_system import get_project_root
-from meta_automl.data_preparation.meta_features_extractors.time_series.time_series_meta_features_extractor import \
-TimeSeriesFeaturesExtractor
+from meta_automl.data_preparation.meta_features_extractors import TimeSeriesFeaturesExtractor

p = Path(get_project_root(), 'data', 'knowledge_base_time_series_0', 'datasets')
len_d = {i: len(pd.read_csv(Path(p, i, 'data.csv'))) for i in os.listdir(p)}
15 changes: 6 additions & 9 deletions examples/6_gnn_surrogate/accessing_pipeline_dataset_encoders.py
@@ -1,21 +1,18 @@
-import openml

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
+from meta_automl.data_preparation.file_system.file_system import get_checkpoints_dir
from meta_automl.data_preparation.pipeline_features_extractors import FEDOTPipelineFeaturesExtractor
from meta_automl.surrogate.data_pipeline_surrogate import PipelineVectorizer
from meta_automl.surrogate.surrogate_model import RankingPipelineDatasetSurrogateModel

if __name__ == '__main__':
dataset_name = 'higgs'
datasets_loader = OpenMLDatasetsLoader()
-dset = openml.datasets.get_dataset(dataset_name)
-open_ml_dataset_id = dset.id
-train_data = datasets_loader.load_single(open_ml_dataset_id)

+dataset = datasets_loader.load_single(dataset_name, allow_name=True)
+checkpoints_dir = get_checkpoints_dir() / 'tabular'
# Load surrogate model
surrogate_model = RankingPipelineDatasetSurrogateModel.load_from_checkpoint(
-checkpoint_path="./experiments/base/checkpoints/last.ckpt",
-hparams_file="./experiments/base/hparams.yaml"
+checkpoint_path=checkpoints_dir / 'checkpoints/best.ckpt',
+hparams_file=checkpoints_dir / 'hparams.yaml'
)
surrogate_model.eval()

@@ -26,7 +23,7 @@
print(surrogate_model.dataset_encoder)

pipeline_features_extractor = FEDOTPipelineFeaturesExtractor(include_operations_hyperparameters=False,
-operation_encoding="ordinal")
+operation_encoding='ordinal')
pipeline_vectorizer = PipelineVectorizer(
pipeline_features_extractor=pipeline_features_extractor,
pipeline_estimator=surrogate_model
18 changes: 9 additions & 9 deletions examples/6_gnn_surrogate/surrogate_optimizer_example.py
@@ -1,11 +1,12 @@
from functools import partial

-import openml
from fedot.api.main import Fedot
from golem.core.optimisers.meta.surrogate_optimizer import SurrogateEachNgenOptimizer

from meta_automl.data_preparation.datasets_loaders import OpenMLDatasetsLoader
from meta_automl.data_preparation.feature_preprocessors import FeaturesPreprocessor
+from meta_automl.data_preparation.file_system import get_data_dir
+from meta_automl.data_preparation.file_system.file_system import get_checkpoints_dir, get_configs_dir
from meta_automl.data_preparation.meta_features_extractors import PymfeExtractor
from meta_automl.data_preparation.pipeline_features_extractors import FEDOTPipelineFeaturesExtractor
from meta_automl.surrogate.data_pipeline_surrogate import DataPipelineSurrogate, get_extractor_params
@@ -14,14 +15,13 @@
if __name__ == '__main__':
dataset_name = 'sylvine' # Specify your OpenML dataset here to get the dataset meta-features.
datasets_loader = OpenMLDatasetsLoader()
-dset = openml.datasets.get_dataset(dataset_name)
-open_ml_dataset_id = dset.id
-train_data = datasets_loader.load_single(open_ml_dataset_id)
+train_data = datasets_loader.load_single(dataset_name, allow_name=True)
+surrogate_knowledge_base_dir = get_checkpoints_dir() / 'tabular'

# Load surrogate model
surrogate_model = RankingPipelineDatasetSurrogateModel.load_from_checkpoint(
-checkpoint_path="./experiments/base/checkpoints/best.ckpt",
-hparams_file="./experiments/base/hparams.yaml"
+checkpoint_path=surrogate_knowledge_base_dir / "checkpoints/best.ckpt",
+hparams_file=surrogate_knowledge_base_dir / "hparams.yaml"
)
surrogate_model.eval()

@@ -30,16 +30,16 @@
operation_encoding="ordinal")

# Prepare dataset extractor and extract metafeatures
-extractor_params = get_extractor_params('configs/use_features.json')
+extractor_params = get_extractor_params(get_configs_dir() / 'use_features.json')
meta_features_extractor = PymfeExtractor(
extractor_params=extractor_params,
)
meta_features_preprocessor = FeaturesPreprocessor(
-load_path="./data/pymfe_meta_features_and_fedot_pipelines/all/meta_features_preprocessors.pickle",
+load_path=get_data_dir() / "pymfe_meta_features_and_fedot_pipelines/all/meta_features_preprocessors.pickle",
extractor_params=extractor_params)
x_dset = meta_features_extractor.extract([train_data], fill_input_nans=True).fillna(0)

-# Compose extractors and model into joint sturcture
+# Compose extractors and model into joint structure
surrogate_pipeline = DataPipelineSurrogate(
pipeline_features_extractor=pipeline_features_extractor,
dataset_meta_features=x_dset,
(diffs for the remaining changed files were not loaded in this view)