Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: KeyError no longer occurs when using groupfolds for regression tasks. #1385

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions flaml/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def custom_metric(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down Expand Up @@ -739,7 +739,7 @@ def retrain_from_log(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down Expand Up @@ -1358,7 +1358,7 @@ def custom_metric(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down
4 changes: 2 additions & 2 deletions flaml/automl/task/generic_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,8 @@ def prepare_data(
X_train_all, y_train_all = shuffle(X_train_all, y_train_all, random_state=RANDOM_SEED)
if data_is_df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)

X_train, y_train = X_train_all, y_train_all
state.groups_all = state.groups
Expand Down
2 changes: 1 addition & 1 deletion flaml/automl/task/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def prepare_data(
* Valid str options depend on different tasks.
For classification tasks, valid choices are
["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
For regression tasks, valid choices are ["auto", 'uniform', 'time'].
For regression tasks, valid choices are ["auto", 'uniform', 'time', 'group'].
"auto" -> uniform.
For time series forecast tasks, must be "auto" or 'time'.
For ranking task, must be "auto" or 'group'.
Expand Down
36 changes: 33 additions & 3 deletions test/automl/test_split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split

Expand Down Expand Up @@ -48,7 +49,7 @@ def test_time():
_test(split_type="time")


def test_groups():
def test_groups_for_classification_task():
from sklearn.externals._arff import ArffException

try:
Expand Down Expand Up @@ -88,6 +89,35 @@ def test_groups():
automl.fit(X, y, **automl_settings)


def test_groups_for_regression_task():
"""Append nonsensical groups to iris dataset and use it to test that GroupKFold works for regression tasks"""
iris_dict_data = load_iris(as_frame=True) # numpy arrays
iris_data = iris_dict_data["frame"] # pandas dataframe data + target

rng = np.random.default_rng(42)
iris_data["cluster"] = rng.integers(
low=0, high=5, size=iris_data.shape[0]
) # np.random.randint(0, 5, iris_data.shape[0])

automl = AutoML()
X = iris_data[["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]].to_numpy()
y = iris_data["petal width (cm)"]
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
X, y, iris_data["cluster"], random_state=42
)
automl_settings = {
"max_iter": 5,
"time_budget": -1,
"metric": "r2",
"task": "regression",
"estimator_list": ["lgbm", "rf", "xgboost", "kneighbor"],
"eval_method": "cv",
"split_type": "uniform",
"groups": groups_train,
}
automl.fit(X_train, y_train, **automl_settings)


def test_stratified_groupkfold():
from minio.error import ServerError
from sklearn.model_selection import StratifiedGroupKFold
Expand Down Expand Up @@ -204,4 +234,4 @@ def get_n_splits(self, X=None, y=None, groups=None):


if __name__ == "__main__":
test_groups()
test_groups_for_classification_task()
Loading