From 9631bcdb55d0245274fca4d80d7e7f064965c9e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20M=C3=A9ndez=20Civieta?=
Date: Wed, 14 Aug 2024 18:07:06 -0400
Subject: [PATCH] Update readme and user guide

---
 README.md        |   44 +-
 user_guide.ipynb | 1296 +++++++++++++++++++++++++++------------------
 2 files changed, 759 insertions(+), 581 deletions(-)

diff --git a/README.md b/README.md
index fadc224..99d81da 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@
 For users currently utilizing the `ASGL` class, we recommend switching
 to the `Regressor` class to ensure continued support and access to the
 latest functionalities.
 
-## Main parameters:
+## Key features:
 
 The `Regressor` class includes the following list of parameters:
 
@@ -212,9 +212,11 @@
 genes are grouped into genetic pathways.
 
 For scenarios where the regressors have a known grouped structure, this
 information can be passed to the `Regressor` class during model fitting
-using the group_index parameter. The following example demonstrates this
-with a synthetic group_index. The model will be optimized using
-scikit-learn’s `RandomizedSearchCV` function.
+using the `group_index` parameter. This parameter is an array where each
+element indicates the group to which the associated variable belongs.
+The following example demonstrates this with a synthetic group_index.
+The model will be optimized using scikit-learn’s `RandomizedSearchCV`
+function.
 
 ``` python
 import numpy as np
@@ -240,6 +242,40 @@
 This example demonstrates how to fit a quantile regression model with
 Adaptive Sparse Group Lasso penalization, utilizing scikit-learn’s
 `RandomizedSearchCV` to optimize the model’s hyperparameters.
 
+### Example 3: Customizing weights
+
+The `asgl` package offers several built-in methods for estimating
+adaptive weights, controlled via the `weight_technique` parameter. For
+more details on the inner workings of each of these alternatives, refer to the
+[associated research
+paper](https://link.springer.com/article/10.1007/s11634-020-00413-8) or
+to the user guide. However, for users requiring extensive customization,
+the package allows for the direct specification of custom weights
+through the `individual_weights` and `group_weights` parameters. This
+allows users to implement their own weight computation techniques
+and use them within the `asgl` framework.
+
+When using custom weights, ensure that the length of
+`individual_weights` matches the number of variables, and the length of
+`group_weights` matches the number of groups. Below is an example
+demonstrating how to fit a model with custom individual and group
+weights:
+
+``` python
+import numpy as np
+from asgl import Regressor
+
+# Generate custom weights
+custom_individual_weights = np.random.rand(X_train.shape[1])
+custom_group_weights = np.random.rand(len(np.unique(group_index)))
+
+# Create a Regressor object with custom weights
+model = Regressor(model='lm', penalization='asgl', individual_weights=custom_individual_weights, group_weights=custom_group_weights)
+
+# Fit the model
+model.fit(X_train, y_train, group_index=group_index)
+```
+
 ## Contributions
 
 Contributions are welcome! Please submit a pull request or open an issue
diff --git a/user_guide.ipynb b/user_guide.ipynb
index 1f9fae7..4441418 100644
--- a/user_guide.ipynb
+++ b/user_guide.ipynb
@@ -4,246 +4,638 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# `asgl` package"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "# asgl\n",
+    "\n",
+    "[![Downloads](https://pepy.tech/badge/asgl)](https://pepy.tech/project/asgl)\n",
+    "[![Downloads](https://pepy.tech/badge/asgl/month)](https://pepy.tech/project/asgl)\n",
+    "[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)\n",
+    "[![Package Version](https://img.shields.io/badge/version-2.0.0-blue.svg)](https://pypi.org/project/asgl/)\n",
+    "\n",
     "## Introduction\n",
-    "___\n",
     "\n",
-    "`asgl` is a Python package that solves several regression related models for simultaneous variable selection and prediction, in low and high dimensional frameworks. This package is directly related to research work shown on [this paper](https://link.springer.com/article/10.1007/s11634-020-00413-8).\n",
+    "The `asgl` package is a versatile and robust tool designed for fitting a variety of regression models, including linear regression, quantile regression, and various penalized regression models such as Lasso, Group Lasso, Sparse Group Lasso, and their adaptive variants.
The package is especially useful for simultaneous variable selection and prediction in both low and high-dimensional frameworks.\n", "\n", - "The current version of the package supports:\n", - "* Linear regression models\n", - "* Quantile regression models\n", + "The primary class available to users is the `Regressor` class, which is detailed later in this document.\n", "\n", - "And considers the following penalizations for variable selection:\n", + "`asgl` is based on cutting-edge research and methodologies, as outlined in the following papers:\n", "\n", - "* No penalized models \n", - "* lasso\n", - "* group lasso\n", - "* sparse group lasso\n", - "* adaptive sparse group lasso" + "* [Adaptive Sparse Group Lasso in Quantile Regression](https://link.springer.com/article/10.1007/s11634-020-00413-8)\n", + "* [`asgl`: A Python Package for Penalized Linear and Quantile Regression](https://arxiv.org/abs/2111.00472)\n", + "\n", + "For a practical introduction to the package, users can refer to the user guide notebook available in the GitHub repository. Additional accessible explanations can be found on [Towards Data Science: Sparse Group Lasso](https://towardsdatascience.com/sparse-group-lasso-in-python-255e379ab892) and [Towards Data Science: Adaptive Lasso](https://towardsdatascience.com/an-adaptive-lasso-63afca54b80d)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Requirements \n", - "___\n", - "The package makes use of some basic functions from `scikit-learn` and `numpy`, and is built on top of the wonderful `cvxpy` convex optimization module. It is higly encouraged to install `cvxpy` prior of the installation of `asgl` following the instructions from the original authors, that can be found [here](https://www.cvxpy.org/)). Additionally, `asgl` makes use of python `multiprocessing` module, allowing, if requested, for parallel execution of the code highly reducing computation time." + "## Dependencies\n", + "\n", + "asgl requires: \n", + "\n", + "* Python >= 3.9\n", + "* cvxpy >= 1.2.0\n", + "* numpy >= 1.20.0\n", + "* scikit-learn >= 1.0\n", + "* pytest >= 7.1.2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Usage example:\n", - "___\n", - "In the following example we will analyze the `BostonHousing` dataset (available [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston)). Even though the `asgl` package can easily deal with much more complex datasets, we will work using this one so we are not affected by computation time. We will show how to implement cross validation on a grid of possible parameter values for an sparse group lasso linear model, how to find the optimal parameter values and finally, how to compute the test error." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import required packages\n",
-    "import numpy as np\n",
-    "from sklearn.datasets import load_boston\n",
-    "import asgl"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import test data #\n",
-    "boston = load_boston()\n",
-    "x = boston.data\n",
-    "y = boston.target\n",
-    "group_index = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5])"
+    "## User installation\n",
+    "\n",
+    "The easiest way to install asgl is using `pip`:\n"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "raw",
    "metadata": {},
    "source": [
-    "As it will be seen later, group lasso and sparse group lasso based formulations work taking into account a group structure of the data. The `BostonHousing` dataset does not have any group structure, but as part of this example we define a fake one called `group_index`."
+    "pip install asgl"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We define an initial grid of values for parameters $\\lambda$ and $\\alpha$ from the sparse group lasso penalization. More details on the meaning of these parameters as well as the mathematical formulation of the penalization can be found in the **Sparse group lasso** section below."
+    "## Testing\n",
+    "\n",
+    "After installation, you can launch the test suite from the source directory (you will need to have `pytest >= 7.1.2` installed) by running:"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 3,
+   "cell_type": "raw",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Define parameters grid\n",
-    "lambda1 = (10.0 ** np.arange(-3, 1.51, 0.2)) # 23 possible values for lambda\n",
-    "alpha = np.arange(0, 1, 0.05) # 20 possible values for alpha\n",
-    "\n",
-    "# Define model parameters\n",
-    "model = 'lm' # linear model\n",
-    "penalization = 'sgl' # sparse group lasso penalization\n",
-    "parallel = True # Code executed in parallel\n",
-    "error_type = 'MSE' # Error measuremente considered. MSE stands for Mean Squared Error."
+    "pytest"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 4,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "We define a grid of 460 models.\n"
-     ]
-    }
-   ],
    "source": [
-    "num_models = len(lambda1)*len(alpha)\n",
-    "print(f'We define a grid of {num_models} models.')"
+    "## Key features:\n",
+    "\n",
+    "The `Regressor` class includes the following list of parameters:\n",
+    "\n",
+    "* model: str, default='lm'\n",
+    "  * Type of model to fit. Options are 'lm' (linear regression) and 'qr' (quantile regression).\n",
+    "* penalization: str or None, default='lasso'\n",
+    "  * Type of penalization to use. Options are 'lasso', 'gl' (group lasso), 'sgl' (sparse group lasso), 'alasso' (adaptive lasso), 'agl' (adaptive group lasso), 'asgl' (adaptive sparse group lasso), or None.\n",
+    "* quantile: float, default=0.5\n",
+    "  * Quantile level for quantile regression models. Valid values are between 0 and 1.\n",
+    "* fit_intercept: bool, default=True\n",
+    "  * Whether to fit an intercept in the model.\n",
+    "* lambda1: float, default=0.1\n",
+    "  * Constant that multiplies the penalization, controlling its strength. Must be a non-negative float, i.e., in `[0, inf)`. Larger values result in stronger penalization.\n",
+    "* alpha: float, default=0.5\n",
+    "  * Constant that controls the tradeoff between individual and group penalizations in sgl and asgl penalizations. ``alpha=1`` enforces a lasso penalization while ``alpha=0`` enforces a group lasso penalization.\n",
+    "* solver: str, default='default'\n",
+    "  * Solver to be used by `cvxpy`. The default automatically selects a suitable alternative depending on the problem. Users can check available solvers via the command `cvxpy.installed_solvers()`.\n",
+    "* weight_technique: str, default='pca_pct'\n",
+    "  * Technique used to fit adaptive weights. Options include 'pca_1', 'pca_pct', 'pls_1', 'pls_pct', 'lasso', 'unpenalized', and 'sparse_pca'. For low dimensional problems (where the number of variables is smaller than the number of observations) the 'unpenalized' alternative is encouraged. For high dimensional problems (where the number of variables is larger than the number of observations) the default alternative is encouraged.\n",
+    "* individual_power_weight: float, default=1\n",
+    "  * Power to which individual weights are raised. This parameter only has an effect in adaptive penalizations ('alasso' and 'asgl').\n",
+    "* group_power_weight: float, default=1\n",
+    "  * Power to which group weights are raised. This parameter only has an effect in adaptive penalizations with a grouped structure ('agl' and 'asgl').\n",
+    "* variability_pct: float, default=0.9\n",
+    "  * Percentage of variability explained by PCA, PLS, and sparse PCA components. This parameter only has an effect in adaptive penalizations where `weight_technique` is equal to 'pca_pct', 'pls_pct' or 'sparse_pca'.\n",
+    "* lambda1_weights: float, default=0.1\n",
+    "  * The value of the parameter ``lambda1`` used to solve the lasso model if ``weight_technique='lasso'``.\n",
+    "* spca_alpha: float, default=1e-5\n",
+    "  * Sparse PCA parameter. This parameter only has an effect if `weight_technique='sparse_pca'`. See the scikit-learn implementation for more details.\n",
+    "* spca_ridge_alpha: float, default=1e-2\n",
+    "  * Sparse PCA parameter. This parameter only has an effect if `weight_technique='sparse_pca'`. See the scikit-learn implementation for more details.\n",
+    "* individual_weights: array or None, default=None\n",
+    "  * Custom individual weights for adaptive penalizations. If this parameter is provided, it overrides the weight estimation process defined by parameter ``weight_technique`` and allows the user to provide custom weights. It must be either `None` or an array of non-negative floats with length equal to the number of variables.\n",
+    "* group_weights: array or None, default=None\n",
+    "  * Custom group weights for adaptive penalizations. If this parameter is provided, it overrides the weight estimation process defined by parameter ``weight_technique`` and allows the user to provide custom weights. It must be either `None` or an array of non-negative floats with length equal to the number of groups (as defined by `group_index`).\n",
+    "* tol: float, default=1e-4\n",
+    "  * Tolerance for coefficients to be considered zero.\n",
+    "* weight_tol: float, default=1e-4\n",
+    "  * Tolerance value used to avoid ZeroDivision errors when computing the weights."
    ]
   },
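+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick illustration of the `solver` parameter (a sketch on our side; it assumes the solver can be requested by its `cvxpy` name), the installed solvers can be listed and one of them passed explicitly instead of 'default':"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cvxpy\n",
+    "from asgl import Regressor\n",
+    "\n",
+    "# List the solvers cvxpy can use in this environment\n",
+    "print(cvxpy.installed_solvers())\n",
+    "\n",
+    "# Hypothetical example: request one of the listed solvers explicitly\n",
+    "model = Regressor(model='lm', penalization='lasso', solver='ECOS')"
+   ]
+  },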
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "We have defined a grid of 460 possible models based on the combinations of different $\\lambda$ and $\\alpha$ values. In order to find the optimal values of the parameters, we consider 5-fold cross validation, and we will run this in parallel. Additionally, we provide a `random_state` value so that splits are reproducible."
+    "## Examples"
    ]
   },
   {
    "cell_type": "markdown",
-   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Define a cross validation object\n",
-    "cv_class = asgl.CV(model=model, penalization=penalization, lambda1=lambda1, alpha=alpha,\n",
-    "                   nfolds=5, error_type=error_type, parallel=parallel, random_state=99)"
+    "### Example 1: Linear Regression with Lasso Penalization"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "We are considering a grid of 460 models, optimized based on 5-folds cross validation\n"
+      "Mean Squared Error: 29.111050920778137\n"
      ]
     }
    ],
    "source": [
-    "# Compute error using k-fold cross validation\n",
-    "error = cv_class.cross_validation(x=x, y=y, group_index=group_index)\n",
-    "\n",
-    "num_models, k_folds = error.shape\n",
-    "# error is a matrix of shape (number_of_models, k_folds)\n",
-    "print(f'We are considering a grid of {num_models} models, optimized based on {k_folds}-folds cross validation')\n",
-    "\n",
-    "# Obtain the mean error across different folds\n",
-    "error = np.mean(error, axis=1)"
+    "from sklearn.datasets import make_regression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from asgl import Regressor\n",
+    "\n",
+    "# Generate synthetic regression data\n",
+    "X, y = make_regression(n_samples=1000, n_features=50, n_informative=25, bias=10, noise=5, random_state=42)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=250)\n",
+    "\n",
+    "# Create a Regressor object configured for linear regression with Lasso penalization\n",
+    "model = Regressor(model='lm', penalization='lasso', lambda1=0.1)\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# Make predictions on the test data\n",
+    "predictions = model.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model's performance using mean squared error\n",
+    "mse = mean_squared_error(predictions, y_test)\n",
+    "print(f\"Mean Squared Error: {mse}\")"
    ]
   },
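  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Thanks to the lasso penalization, part of the estimated coefficients are exactly zero, which is what performs the variable selection. As a quick check (a sketch on our side that reuses the default `tol=1e-4` threshold described above, and assumes that `coef_` stores the intercept first):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Count how many coefficients were shrunk to (numerical) zero\n",
    "n_zero = np.sum(np.abs(model.coef_[1:]) < 1e-4)  # skip the intercept\n",
    "print(f\"{n_zero} out of {X_train.shape[1]} coefficients are zero\")"
   ]
  },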
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Let's find the parameter values that minimize the cross validation error."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Select the minimum error\n",
-    "minimum_error_idx = np.argmin(error)\n",
-    "\n",
-    "# Select the parameters associated to mininum error values\n",
-    "optimal_parameters = cv_class.retrieve_parameters_value(minimum_error_idx)\n",
-    "optimal_lambda = optimal_parameters.get('lambda1')\n",
-    "optimal_alpha = optimal_parameters.get('alpha')"
+    "### Example 2: Quantile Regression with Adaptive Sparse Group Lasso Penalization\n",
+    "\n",
+    "Group-based penalizations like Group Lasso, Sparse Group Lasso, and their adaptive variants assume that there is a group structure within the regressors. This structure can be useful in various applications, such as when using dummy variables where all the dummies of the same variable belong to the same group, or in genetic data analysis where genes are grouped into genetic pathways.\n",
+    "\n",
+    "For scenarios where the regressors have a known grouped structure, this information can be passed to the `Regressor` class during model fitting using the `group_index` parameter. This parameter is an array where each element indicates the group to which the associated variable belongs. For example, `group_index = np.array([1, 1, 2])` states that the first two variables form group 1 and the third variable forms group 2. The following example demonstrates this with a synthetic group_index. The model will be optimized using scikit-learn's `RandomizedSearchCV` function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
" + ], + "text/plain": [ + "RandomizedSearchCV(estimator=Regressor(model='qr', penalization='asgl'),\n", + " param_distributions={'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1],\n", + " 'lambda1': [0.0001, 0.001, 0.01, 0.1,\n", + " 1]},\n", + " scoring='neg_median_absolute_error')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(f' Minimum cross validation error was {error[minimum_error_idx]}.\\n Optimal parameter values:\\n Lambda: {optimal_lambda}\\n Alpha: {optimal_alpha}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have found that the cross validation error is minimized for the parameter values shown above. Now we will consider a final train / test split in which to train the model for the pair of optimal parameters obtained before and compute the final test error. For this, we define an ASGL object, that is used to fit the model with no cross validation involved." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Define asgl class using optimal values\n", - "asgl_model = asgl.ASGL(model=model, penalization=penalization, lambda1=optimal_lambda, alpha=optimal_alpha)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into train / test\n", - "train_idx, test_idx = asgl.train_test_split(nrows=x.shape[0], train_pct=0.7, random_state=1)\n", + "import numpy as np\n", + "from sklearn.model_selection import RandomizedSearchCV\n", "\n", - "# Solve the model\n", - "asgl_model.fit(x=x[train_idx, :], y=y[train_idx], group_index=group_index)\n", + "# Generate synthetic regression data\n", + "X, y = make_regression(n_samples=1000, n_features=50, n_informative=25, bias=10, noise=5, random_state=42)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=250)\n", "\n", - "# Obtain betas\n", - "final_beta_solution = asgl_model.coef_[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`asgl.coef_` stores the $\\beta$ coefficients of the model found to be optimal based on cross validation. Observe that matrix $X$ has 13 variables and here `final_beta_solution` has length 14. We should take into account that the first element of this array is the intercept of the model. We can turn the intercept off by setting `intercept=False` in the `asgl.ASGL` definition." + "# Define the group structure\n", + "group_index = np.random.randint(1, 5, size=50)\n", + "\n", + "# Create a Regressor object configured for quantile regression with Adaptive Sparse Group Lasso penalization\n", + "model = Regressor(model='qr', penalization='asgl', quantile=0.5)\n", + "\n", + "# Define the parameter grid for RandomizedSearchCV\n", + "param_grid = {'lambda1': [1e-4, 1e-3, 1e-2, 1e-1, 1], 'alpha': [0, 0.2, 0.4, 0.6, 0.8, 1]}\n", + "rscv = RandomizedSearchCV(model, param_grid, scoring='neg_median_absolute_error')\n", + "rscv.fit(X_train, y_train, **{'group_index': group_index})" ] }, { @@ -254,8 +646,7 @@ { "data": { "text/plain": [ - "array([ 34.2, -0.1, 0. , -0. , 3.1, -18.2, 4.3, -0. , -1.6,\n", - " 0.3, -0. , -0.9, 0. 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
+    "### Example 3: Customizing weights\n",
+    "\n",
+    "The `asgl` package offers several built-in methods for estimating adaptive weights, controlled via the `weight_technique` parameter. For more details on the inner workings of each of these alternatives, refer to the [associated research paper](https://link.springer.com/article/10.1007/s11634-020-00413-8) or to the next section for an overview. However, for users requiring extensive customization, the package allows for the direct specification of custom weights through the `individual_weights` and `group_weights` parameters. This allows users to implement their own weight computation techniques and use them within the `asgl` framework.\n",
+    "\n",
+    "When using custom weights, ensure that the length of `individual_weights` matches the number of variables, and the length of `group_weights` matches the number of groups. Below is an example demonstrating how to fit a model with custom individual and group weights:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Generate custom weights\n",
+    "custom_individual_weights = np.random.rand(X_train.shape[1])\n",
+    "custom_group_weights = np.random.rand(len(np.unique(group_index)))\n",
+    "\n",
+    "# Create a Regressor object with custom weights\n",
+    "model = Regressor(model='lm', penalization='asgl', individual_weights=custom_individual_weights, group_weights=custom_group_weights)\n",
+    "\n",
+    "# Fit the model\n",
+    "model.fit(X_train, y_train, group_index=group_index)"
   ]
  },
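+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The random weights above are only a placeholder. As a sketch of a more meaningful choice (our own illustration, in the spirit of the classical adaptive lasso, not a built-in recipe of the package), the weights can be derived from an unpenalized fit so that variables with large coefficients are penalized less:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit an unpenalized model and invert the absolute coefficients to obtain weights.\n",
+    "# Assumption: coef_ stores the intercept first, so it is dropped before inverting.\n",
+    "base_model = Regressor(model='lm', penalization=None)\n",
+    "base_model.fit(X_train, y_train)\n",
+    "eps = 1e-4  # guards against division by zero, mirroring weight_tol\n",
+    "individual_weights = 1 / (np.abs(base_model.coef_[1:]) + eps)\n",
+    "\n",
+    "# A simple choice for group weights: the mean individual weight within each group\n",
+    "group_weights = np.array([individual_weights[group_index == g].mean() for g in np.unique(group_index)])\n",
+    "\n",
+    "model = Regressor(model='lm', penalization='asgl', individual_weights=individual_weights, group_weights=group_weights)\n",
+    "model.fit(X_train, y_train, group_index=group_index)"
+   ]
+  },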
  {
@@ -304,7 +718,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "For an in-depth analysis of the mathematical formulations we highly encourage to read our original [paper](https://link.springer.com/article/10.1007/s11634-020-00413-8), however, here the basics of the formulations wil be covered, including code examples."
+    "For an in-depth analysis of the mathematical formulations we highly encourage you to read our original [paper](https://link.springer.com/article/10.1007/s11634-020-00413-8); here, the basics of the formulations will be covered, including code examples."
   ]
  },
  {
@@ -328,28 +742,30 @@
   "This model can be fit by simply defining\n",
   " * `model=lm`: lm referring to linear model\n",
   " * `penalization=None`: This fits an unpenalized model\n",
-    " * `intercept=True`: Default value is `True`. If the intercept of the model is not required, it can be set to `False`."
+    " * `fit_intercept=True`: Default value is `True`. If the intercept of the model is not required, it can be set to `False`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ 36.5  -0.1   0.    0.    2.7 -17.8   3.8   0.   -1.5   0.3  -0.   -1.\n",
-      "   0.   -0.5]\n"
+      "[10.1  0.3 -0.1  0.2 61.7 -0.  56.5 74.3 91.3  0.2  0.   0.1  0.2  0.1\n",
+      " -0.1  0.1 10.9 69.   0.1 22.7  0.   7.2 94.7 91.5 -0.4 75.4 -0.3 38.1\n",
+      "  5.3 -0.   0.2 -0.2  0.  70.4  0.5 59.2  7.   0.  -0.1 12.2 69.4  1.3\n",
+      " 86.8 21.1 -0.  96.2 -0.2 78.3 -0.   0.  -0.1]\n"
     ]
    }
   ],
   "source": [
-    "lm_model = asgl.ASGL(model='lm', penalization=None)\n",
-    "lm_model.fit(x=x, y=y)\n",
+    "lm_model = Regressor(model='lm', penalization=None)\n",
+    "lm_model.fit(X=X, y=y)\n",
    "\n",
-    "coef = lm_model.coef_[0]\n",
+    "coef = lm_model.coef_\n",
    "print(np.round(coef, 1))"
   ]
  },
  {
@@ -380,25 +796,35 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[14.9 -0.1  0.   0.   1.3 -9.2  5.3 -0.  -1.   0.2 -0.  -0.7  0.  -0.3]\n"
+      "[10.   0.3  0.1  0.2 61.7  0.  56.5 74.3 91.2  0.1  0.1 -0.1 -0.1  0.1\n",
+      "  0.  -0.1 10.8 69.   0.2 22.9  0.2  7.1 94.6 91.6 -0.4 75.2 -0.6 38.3\n",
+      "  5.2  0.1  0.3 -0.2 -0.  70.5  0.7 59.2  7.1 -0.1 -0.  12.3 69.6  1.2\n",
+      " 86.9 21.   0.1 96.2  0.1 78.3  0.1 -0.1 -0.1]\n"
     ]
    }
   ],
   "source": [
-    "qr_model = asgl.ASGL(model='qr', penalization=None, tau=0.5)\n",
-    "qr_model.fit(x=x, y=y)\n",
+    "qr_model = Regressor(model='qr', penalization=None, quantile=0.5)\n",
+    "qr_model.fit(X=X, y=y)\n",
    "\n",
-    "coef = qr_model.coef_[0]\n",
+    "coef = qr_model.coef_\n",
    "print(np.round(coef, 1))"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+    "For the remainder of the document, we will stick to using linear models, but at any time you can switch to quantile regression models by simply stating `model='qr'`."
+  ]
+ },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -420,44 +846,30 @@
   "It can be fit as,\n",
   "\n",
   "* `penalization='lasso'`\n",
-    "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`. This parameter is the $\\lambda$ defined in the problem formulation. It controls the sparsity of the solution. Large $\\lambda$ values are associated with more sparse solutions, since the coefficients are more heavily penalized."
+    "* `lambda1=0.1`. This parameter is the $\\lambda$ defined in the problem formulation. It controls the sparsity of the solution. Large $\\lambda$ values are associated with sparser solutions, since the coefficients are more heavily penalized."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n",
-    "lasso_model = asgl.ASGL(model='lm', penalization='lasso',lambda1=lambda1)\n",
-    "lasso_model.fit(x=x, y=y)\n",
-    "coef = lasso_model.coef_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Observe that `lambda1` is defined as a list of values. We fit a linear model with a lasso penalization for each possible $\\lambda$ value, and the coefficients of these models are all stored in `lasso_model.coef_`. This way, the coefficients associated to the third $\\lambda$ value are stored the third in `coef`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "The model coefficients associated to lambda value 1 (which is the fourth value in the lambda1 array) are:\n",
-      "[32.5 -0.1  0.  -0.   0.   0.   2.5  0.  -0.9  0.3 -0.  -0.8  0.  -0.7]\n"
+      "[10.1  0.2 -0.   0.1 61.7 -0.  56.4 74.3 91.3  0.1 -0.   0.   0.2  0. \n",
+      " -0.   0.  10.8 69.   0.  22.7  0.   7.1 94.6 91.5 -0.3 75.3 -0.2 38. \n",
+      "  5.2 -0.   0.2 -0.2  0.  70.4  0.5 59.1  6.9  0.  -0.  12.1 69.4  1.2\n",
+      " 86.7 21.1 -0.  96.2 -0.2 78.3  0.   0.  -0. ]\n"
     ]
    }
   ],
   "source": [
-    "print(f'The model coefficients associated to lambda value 1 (which is the fourth value in the lambda1 array) are:\\n{np.round(coef[3],1)}')"
+    "lasso_model = Regressor(model='lm', penalization='lasso',lambda1=0.1)\n",
+    "lasso_model.fit(X=X, y=y)\n",
+    "coef = lasso_model.coef_\n",
+    "print(np.round(coef, 1))"
   ]
  },
  {
@@ -473,32 +885,32 @@
   "where $p_{l}$ is the size of the l-th group. This penalization can be fit by simply defining:\n",
   "\n",
   "* `penalization='gl'` where gl refers to group lasso\n",
-    "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
-    "* `group_index=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])`. This should be an array of the same length as the number of variables in matrix $X$. Each element on this array indicates the group at which the associated variable belongs. For example, the first three variables from $X$ belong to group 1, while the next three belong to group 2."
+    "* `lambda1=0.1`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
+    "* `group_index=np.random.randint(1, 5, size=50)`. This should be an array of the same length as the number of variables in matrix $X$. Each element in this array indicates the group to which the associated variable belongs."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "The model coefficients associated to lambda value 10 (which is the fith value in the lambda1 array) are:\n",
-      "[30.9 -0.   0.  -0.   0.  -0.   0.   0.  -0.   0.1 -0.  -0.2  0.  -0.5]\n"
+      "[10.1  0.3 -0.  74.2  0.1 -0.   0.2 22.7 94.6 -0.4 38.  -0.   7.   0. \n",
+      " -0.1 69.3 86.7 78.3 91.2 68.9  0.   0.3 21.1 -0.   0.  -0.1 61.6  0.1\n",
+      " -0.1 10.9  0.1  7.2 91.4 -0.3  5.3 -0.   0.5  1.3 -0.   0.2 56.4  0.1\n",
+      "  0.1 75.3 -0.2 70.4 59.1 12.2 96.1 -0.2 -0.1]\n"
     ]
    }
   ],
   "source": [
-    "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n",
-    "group_index = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])\n",
-    "group_lasso_model = asgl.ASGL(model='lm', penalization='gl',lambda1=lambda1)\n",
-    "group_lasso_model.fit(x=x, y=y, group_index=group_index)\n",
+    "group_index = np.random.randint(1, 5, size=50)\n",
+    "group_lasso_model = Regressor(model='lm', penalization='gl',lambda1=0.1)\n",
+    "group_lasso_model.fit(X=X, y=y, group_index=group_index)\n",
    "coef = group_lasso_model.coef_\n",
-    "\n",
-    "print(f'The model coefficients associated to lambda value 10 (which is the fith value in the lambda1 array) are:\\n{np.round(coef[4],1)}')"
+    "print(np.round(coef, 1))"
   ]
  },
  {
@@ -513,67 +925,32 @@
   "where $\\alpha$ is a parameter defined in $[0,1]$ that balances the penalization applied between lasso and group lasso. Values of $\\alpha$ close to 1 produce lasso-like solutions, while values close to 0 produce group lasso-like solutions. This penalization can be fit by defining:\n",
   "\n",
   "* `penalization='sgl'` where sgl refers to sparse group lasso\n",
-    "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
-    "* `alpha=[0, 0.25, 0.5, 0.75, 1]`, where $\\alpha$ is the parameter described above\n",
-    "* `group_index=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])`, as described in the group lasso."
+ "* `lambda1=0.1`, where $\\lambda$ is the parameter defined in the lasso penalization\n", + "* `alpha=0.5`, where $\\alpha$ is the parameter described above\n", + "* `group_index=np.random.randint(1, 5, size=50)`, as described in the group lasso." ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n", - "alpha = [0, 0.25, 0.5, 0.75, 1]\n", - "group_index = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])\n", - "sgl_model = asgl.ASGL(model='lm', penalization='sgl',lambda1=lambda1, alpha=alpha, parallel=True)\n", - "sgl_model.fit(x=x, y=y, group_index=group_index)\n", - "coef = sgl_model.coef_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We consider a grid with 5 possible $\\lambda$ values and another 5 possible $\\alpha$ values. So we fit 25 models in total. The coefficients for all the 25 models are stored in coef. Using the function `retrieve_parameters_value(idx)` we can recover the parameter values associated to a specific solution stored in `coef`. For example, if we are interested in recovering what are the parameter values that yielded the coefficients stored in the index 20 in coef, we could run," - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "coef_20 = coef[20]\n", - "param_20 = sgl_model.retrieve_parameters_value(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Coefficients value:\n", - "[30.9 -0. 0. -0. 0. -0. 0. 0. -0. 0.1 -0. -0.2 0. -0.5]\n", - "Parameters value:\n", - "{'lambda1': 10, 'alpha': 0, 'lasso_weights': None, 'gl_weights': None}\n" + "[10.1 0.2 0. 74.3 0.1 0. 0.2 22.7 94.6 -0.3 38. 0. 7. 0.\n", + " -0.1 69.4 86.7 78.3 91.3 69. 0. 0.2 21.1 0. 0. -0. 61.7 0.\n", + " -0. 10.8 0.1 7.1 91.5 -0.2 5.3 -0. 0.5 1.2 0. 0.2 56.4 0.\n", + " 0. 75.3 -0.2 70.4 59.1 12.1 96.1 -0.2 -0. ]\n" ] } ], "source": [ - "print(f'Coefficients value:\\n{np.round(coef_20, 1)}\\nParameters value:\\n{param_20}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here `lasso_weights` and `gl_weights` are parameters used in the adaptive penalization described below, and for that reason are shown as `None`." + "sgl_model = Regressor(model='lm', penalization='sgl',lambda1=0.1, alpha=0.5)\n", + "sgl_model.fit(X=X, y=y, group_index=group_index)\n", + "coef = sgl_model.coef_\n", + "print(np.round(coef, 1))" ] }, { @@ -588,41 +965,21 @@ "\n", "where $\\tilde{w_i}$ are weights previously provided by the researcher. This penalization can be fit by defining,\n", "\n", - "* `penalization='alasso'` where sgl refers to sparse group lasso\n", - "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`, where $\\lambda$ is the parameter defined in the lasso penalization\n", - "* `lasso_weights=np.repeat(0.5, 13)`. Here, `lasso_weights` refers to $\\tilde{w}$, and it should be of the same length as the number of predictors in X (the number of columns in X, in this case 13)\n", - "\n", - "As it happended with parameters $\\lambda$ and $\\alpha$, it can be interesting to fit models for different weight values. This can be easily done by simply storing all the weight candidates into a list. For example,\n", - "\n", - "`lasso_weights=[np.repeat(0.5, 13), np.repeat(0.75, 13), np.repeat(1, 13)]`\n", - "\n", - "would fit models for the three candidate values of the lasso weights. 
We will see how to fit this assuming that we know some potential values for $\\tilde{w_i}$ (we will discuss later how those can be obtained)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
+   "cell_type": "code",
+   "execution_count": 32,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n",
-    "lasso_weights = [np.repeat(0.5, 13), np.repeat(0.75, 13), np.repeat(1, 13)]\n",
-    "alasso_model = asgl.ASGL(model='lm', penalization='alasso',lambda1=lambda1, \n",
-    "                         lasso_weights=lasso_weights, parallel=True)\n",
-    "alasso_model.fit(x=x, y=y)\n",
-    "coef = asgl_model.coef_\n",
-    "len(coef)"
+    "individual_weights = np.repeat(0.5, 50)\n",
+    "alasso_model = Regressor(model='lm', penalization='alasso',lambda1=0.1, individual_weights=individual_weights)\n",
+    "alasso_model.fit(X=X, y=y)\n",
+    "coef = alasso_model.coef_"
   ]
  },
  {
@@ -637,38 +994,22 @@
   "where $\\tilde{v_l}$ are also additional weights. This penalization can be fit by defining,\n",
   "\n",
-    "* `penalization='agl'` where sgl refers to sparse group lasso\n",
-    "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
-    "* `group_index=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])`, as described in the group lasso.\n",
-    "* `gl_weights=np.repeat(1.5, 2)`. Here `gl_weights` refers to $\\tilde{v}$ and it should be of the same length as the number of groups considered in `group_index` (in this case, 2)\n"
+    "* `penalization='agl'` where the 'a' before 'gl' stands for adaptive.\n",
+    "* `lambda1=0.1`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
+    "* `group_index=np.random.randint(1, 5, size=50)`, as described in the group lasso.\n",
+    "* `group_weights=np.repeat(1.5, len(np.unique(group_index)))`. Here `group_weights` refers to $\\tilde{v}$ and it should be of the same length as the number of groups considered in `group_index`.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 34,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n",
-    "group_index = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])\n",
-    "\n",
-    "gl_weights = [np.repeat(0.5, 2), np.repeat(0.7, 2), np.repeat(1.3, 2)]\n",
-    "agl_model = asgl.ASGL(model='lm', penalization='agl',lambda1=lambda1, \n",
-    "                      gl_weights=gl_weights, parallel=True)\n",
-    "agl_model.fit(x=x, y=y, group_index=group_index)\n",
-    "coef = asgl_model.coef_\n",
-    "len(coef)"
+    "group_weights=np.repeat(1.5, len(np.unique(group_index)))\n",
+    "agl_model = Regressor(model='lm', penalization='agl',lambda1=0.1, group_weights=group_weights)\n",
+    "agl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = agl_model.coef_"
   ]
  },
  {
@@ -698,48 +1039,22 @@
   "Now we will see how this penalization can be fit assuming that we know some potential values for $\\tilde{w_i}$ and $\\tilde{v_l}$. After that, we will discuss alternatives for the estimation of such weights.\n",
   "\n",
   "* `penalization='asgl'` where asgl refers to adaptive sparse group lasso\n",
-    "* `lambda1 = [0.001, 0.01, 0.1, 1, 10]`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
-    "* `alpha=[0, 0.25, 0.5, 0.75, 1]`, where $\\alpha$ is the parameter described for the sparse group lasso penalization\n",
-    "* `group_index=np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])`, as described in the group lasso.\n",
-    "* `lasso_weights = [np.repeat(0.5, 13), np.repeat(0.75, 13), np.repeat(1, 13)]` where `lasso_weights` is the parameter defined for the \n",
-    "* `gl_weights=np.repeat(1.5, 2)`. where `gl_weights` is the parameter defined for adaptive group lasso."
+    "* `lambda1 = 0.1`, where $\\lambda$ is the parameter defined in the lasso penalization\n",
+    "* `alpha=0.5`, where $\\alpha$ is the parameter described for the sparse group lasso penalization\n",
+    "* `group_index=np.random.randint(1, 5, size=50)`, as described in the group lasso.\n",
+    "* `individual_weights=np.repeat(0.5, 50)`, where `individual_weights` is the parameter defined for adaptive lasso.\n",
+    "* `group_weights=np.repeat(1.5, len(np.unique(group_index)))`, where `group_weights` is the parameter defined for adaptive group lasso."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 38,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "75"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "lambda1 = [0.001, 0.01, 0.1, 1, 10]\n",
-    "alpha = [0, 0.25, 0.5, 0.75, 1]\n",
-    "group_index = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2])\n",
-    "\n",
-    "lasso_weights = [np.repeat(0.5, 13), np.repeat(0.75, 13), np.repeat(1, 13)]\n",
-    "gl_weights = [np.repeat(0.5, 2)]\n",
-    "asgl_model = asgl.ASGL(model='lm', penalization='asgl',lambda1=lambda1, alpha=alpha, \n",
-    "                       lasso_weights=lasso_weights, gl_weights=gl_weights, parallel=True)\n",
-    "asgl_model.fit(x=x, y=y, group_index=group_index)\n",
-    "coef = asgl_model.coef_\n",
-    "len(coef)"
+    "asgl_model = Regressor(model='lm', penalization='asgl',lambda1=0.1, alpha=0.5, individual_weights=individual_weights, group_weights=group_weights)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We are considering here a grid of 5 $\\lambda$ values, 5 $\\alpha$ values, 3 $\\tilde{w}$ values and 1 $\\tilde{v}$ values. A total number of 75 models are fitted."
-   ]
-  },
  {
@@ -770,65 +1085,45 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   "##### PCA based on a subset of components\n",
   "\n",
-    "This is our proposal for the default weight calculation alternative. Simply put, use PCA in order to reduce the number of dimensions of the problem. Fit a non penalized model using the PCA scores (taking advantage of being in a smaller dimension framework) and then project back into the original space. An in-depth explanation of this process can be read in our original [paper](https://link.springer.com/article/10.1007/s11634-020-00413-8). This can be easily done in the package by using the `WEIGHTS` class:\n",
+    "This is our proposal for the default weight calculation alternative. Simply put, use PCA in order to reduce the number of dimensions of the problem, fit a non-penalized model using the PCA scores (taking advantage of being in a smaller dimension framework) and then project back into the original space. An in-depth explanation of this process can be read in our original [paper](https://link.springer.com/article/10.1007/s11634-020-00413-8). This can be easily done as part of the `Regressor` object:\n",
   "\n",
-    "* `model='lm'`\n",
   "* `penalization=asgl`\n",
   "* `weight_technique='pca_pct'`. It refers to PCA percentage (because this technique is based on selecting a percentage of PCA components to fit the weights)\n",
-    "* `lasso_power_weight=[0.6, 0.8, 1]`. This is the $\\gamma_1$ coefficient value. Default value for this parameter is `lasso_power_weight=1`\n",
-    "* `gl_power_weight=[0.6, 0.8, 1]` This is the $\\gamma_2$ coefficient value Default value for this parameter is `lasso_power_weight=1`\n",
-    "* `variability_pct=0.9`. The number of PCA components to use for fitting the weight in terms of the total variability they can explain. Default value for this parameter `variability_pct=0.9`"
+    "* `individual_power_weight=1`. This is the $\\gamma_1$ coefficient value. Default value for this parameter is `1`.\n",
+    "* `group_power_weight=1`. This is the $\\gamma_2$ coefficient value. Default value for this parameter is `1`.\n",
+    "* `variability_pct=0.9`. The number of PCA components to use for fitting the weights, in terms of the total variability they can explain. Default value for this parameter is `variability_pct=0.9`."
   ]
  },
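+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a rough sketch of the idea behind 'pca_pct' (our own simplified illustration, not the package's exact implementation), the weight computation could look as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "\n",
+    "# Keep enough components to explain 90% of the variance (variability_pct)\n",
+    "pca = PCA(n_components=0.9)\n",
+    "scores = pca.fit_transform(X)\n",
+    "\n",
+    "# Fit an unpenalized model in the low dimensional score space\n",
+    "ols = LinearRegression().fit(scores, y)\n",
+    "\n",
+    "# Project the coefficients back to the original variable space\n",
+    "beta = pca.components_.T @ ols.coef_\n",
+    "\n",
+    "# Invert the absolute coefficients (raised to individual_power_weight) to get weights\n",
+    "sketch_weights = 1 / (np.abs(beta) ** 1 + 1e-4)\n",
+    "print(np.round(sketch_weights, 2))"
+   ]
+  },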
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Obtain weight values\n",
-    "model='lm'\n",
-    "penalization = 'asgl'\n",
-    "weight_technique = 'pca_pct'\n",
-    "lasso_power_weight = [0.6, 0.8, 1]\n",
-    "gl_power_weight = [0.6, 0.8, 1]\n",
-    "variability_pct = 0.9\n",
-    "\n",
-    "weights = asgl.WEIGHTS(model=model, penalization=penalization, weight_technique=weight_technique, lasso_power_weight=lasso_power_weight, \n",
-    "                       gl_power_weight=gl_power_weight, variability_pct=variability_pct)\n",
-    "lasso_weights, gl_weights = weights.fit(x, y, group_index=group_index)"
+    "asgl_model = Regressor(model='lm', penalization='asgl',lambda1=0.1, alpha=0.5, weight_technique='pca_pct', individual_power_weight=1, group_power_weight=1, variability_pct=0.9)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Let's see what the weights look like:\n",
-      "[  71.35   59.22   78.99 1938.93  843.08  527.11   40.48  183.18   60.17\n",
-      "    9.96  219.61   12.03   82.16]\n"
+      "Let's see what the individual weights look like:\n",
+      "[0.48 0.08 0.57 0.02 0.29 0.03 0.02 0.01 0.16 0.07 0.2  0.15 0.1  0.08\n",
+      " 0.07 0.04 0.02 0.09 0.04 0.07 0.11 0.01 0.01 0.13 0.02 0.25 0.04 0.19\n",
+      " 0.04 0.14 0.08 0.39 0.02 0.08 0.02 0.04 0.11 4.3  0.05 0.02 0.1  0.01\n",
+      " 0.05 0.06 0.01 0.15 0.01 0.09 1.15 0.56]\n"
     ]
    }
   ],
   "source": [
-    "print(f\"Let's see what the weights look like:\\n{np.round(lasso_weights[0], 2)}\")"
+    "print(f\"Let's see what the individual weights look like:\\n{np.round(asgl_model.individual_weights, 2)}\")"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Use the weights obtained before to fit an asgl model\n",
-    "asgl_model = asgl.ASGL(model='lm', penalization='asgl',lambda1=lambda1, alpha=alpha, \n",
-    "                       lasso_weights=lasso_weights, gl_weights=gl_weights, parallel=True)\n",
-    "asgl_model.fit(x=x, y=y, group_index=group_index)\n",
-    "coef = asgl_model.coef_"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   "##### PLS based on a subset of components\n",
   "\n",
   "This alternative follows the same process as 'pca_pct' but uses PLS components instead of PCA components:\n",
   "\n",
   "* `penalization=asgl`\n",
   "* `weight_technique='pls_pct'`. It refers to PLS percentage (because this technique is based on selecting a percentage of PLS components to fit the weights)\n",
-    "* `lasso_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n",
-    "* `gl_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n",
-    "* `variability_pct=0.9`. As defined for PCA based on a subset of components. "
+    "* `individual_power_weight=1`. This is the $\\gamma_1$ coefficient value. Default value for this parameter is `1`.\n",
+    "* `group_power_weight=1`. This is the $\\gamma_2$ coefficient value. Default value for this parameter is `1`.\n",
+    "* `variability_pct=0.9`. The number of PLS components to use for fitting the weights, in terms of the total variability they can explain. 
Default value for this parameter is `variability_pct=0.9`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Obtain weight values\n",
-    "penalization = 'asgl'\n",
-    "weight_technique = 'pls_pct'\n",
-    "lasso_power_weight = [0.8, 1, 1.2]\n",
-    "gl_power_weight = [0.8, 1, 1.2]\n",
-    "variability_pct = 0.9\n",
-    "\n",
-    "weights = asgl.WEIGHTS(penalization=penalization, weight_technique=weight_technique, lasso_power_weight=lasso_power_weight,\n",
-    "                       gl_power_weight=gl_power_weight, variability_pct=variability_pct)\n",
-    "lasso_weights, gl_weights = weights.fit(x, y, group_index=group_index)"
+    "asgl_model = Regressor(model='lm', penalization='asgl',lambda1=0.1, alpha=0.5, weight_technique='pls_pct', individual_power_weight=1, group_power_weight=1, variability_pct=0.9)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   "##### PCA / PLS based on the first component\n",
   "\n",
-    "Each PCA is built as a linear combination of the original variables. This means that another alternative for estimating the weights can be defined as simply using the weights from the first principal component as weights for the adaptive sparse group lasso model. This alternative produce worse results than the previous ones, but runs faster. In the same way, it is poosible to use the first PLS compponent to obtain weights\n",
+    "Each PCA component is built as a linear combination of the original variables. This means that another alternative for estimating the weights can be defined as simply using the weights from the first principal component as weights for the adaptive sparse group lasso model. In the same way, it is possible to use the first PLS component to obtain weights.\n",
   "\n",
-    "* `model='qr'`\n",
   "* `penalization=asgl`\n",
   "* `weight_technique='pca_1'`. It refers to using the first principal component.\n",
   "* `weight_technique='pls_1'`. It refers to using the first PLS component.\n",
-    "* `lasso_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n",
-    "* `gl_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components."
+    "* `individual_power_weight=1`. As defined for PCA based on a subset of components.\n",
+    "* `group_power_weight=1`. As defined for PCA based on a subset of components."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Obtain weight values\n",
-    "model='qr'\n",
-    "penalization = 'asgl'\n",
-    "weight_technique = 'pca_1'\n",
-    "lasso_power_weight = [0.8, 1, 1.2]\n",
-    "gl_power_weight = [0.8, 1, 1.2]\n",
-    "\n",
-    "weights = asgl.WEIGHTS(model=model, penalization=penalization, weight_technique=weight_technique, \n",
-    "                       lasso_power_weight=lasso_power_weight, gl_power_weight=gl_power_weight)\n",
-    "lasso_weights, gl_weights = weights.fit(x, y, group_index=group_index)\n",
-    "\n",
-    "weight_technique = 'pls_1'\n",
-    "weights_pls = asgl.WEIGHTS(penalization=penalization, weight_technique=weight_technique, \n",
-    "                           lasso_power_weight=lasso_power_weight, gl_power_weight=gl_power_weight)\n",
-    "lasso_weights, gl_weights = weights_pls.fit(x, y, group_index=group_index)"
+    "asgl_model = Regressor(model='lm', penalization='asgl',lambda1=0.1, alpha=0.5, weight_technique='pca_1', individual_power_weight=1, group_power_weight=1)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   "##### Lasso based weights\n",
   "\n",
   "Another alternative consists of running an initial lasso model and using its estimates as the initial weights for a second model.\n",
   "\n",
-    "* `model='lm'`\n",
   "* `penalization=asgl`\n",
   "* `weight_technique='lasso'`. It refers to using lasso to obtain the weights\n",
-    "* `lambda1_weights`. It is the $\\lambda$ value used in the lasso estimation of the weights.\n",
-    "* `lasso_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n",
-    "* `gl_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components."
+    "* `lambda1_weights=1e-2`. It is the $\\lambda$ value used in the lasso estimation of the weights.\n",
+    "* `individual_power_weight=1`. As defined for PCA based on a subset of components.\n",
+    "* `group_power_weight=1`. As defined for PCA based on a subset of components."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Obtain weight values\n",
-    "model='lm'\n",
-    "penalization = 'asgl'\n",
-    "weight_technique = 'lasso'\n",
-    "lasso_power_weight = [0.8, 1, 1.2]\n",
-    "gl_power_weight = [0.8, 1, 1.2]\n",
-    "lambda1_weights = 1e-2\n",
-    "\n",
-    "weights = asgl.WEIGHTS(model=model, penalization=penalization, weight_technique=weight_technique, lasso_power_weight=lasso_power_weight,\n",
-    "                       gl_power_weight=gl_power_weight, lambda1_weights=lambda1_weights)\n",
-    "lasso_weights, gl_weights = weights.fit(x, y, group_index=group_index)"
+    "asgl_model = Regressor(model='lm', penalization='asgl',lambda1=0.1, alpha=0.5, weight_technique='lasso', lambda1_weights=1e-2, individual_power_weight=1, group_power_weight=1)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   "##### Unpenalized model\n",
   "\n",
-    "This alternative can only be used when dealing with a low dimensional dataset (in which the number of observations is larger than the number of variables). In this case, it is possible to fit an initial model with no penalization, and then use this as weights for an adaptive model. We consider two alternatives. An unpenalized linear model and unpenalized quantile regression model.\n",
+    "This alternative can only be used when dealing with a low dimensional dataset (in which the number of observations is larger than the number of variables). 
  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "##### Unpenalized model\n",
-    "This alternative can only be used when dealing with a low dimensional dataset (in which the number of observations is larger than the number of variables). In this case, it is possible to fit an initial model with no penalization, and then use this as weights for an adaptive model. We consider two alternatives. An unpenalized linear model and unpenalized quantile regression model.\n",
+    "This alternative can only be used when dealing with a low dimensional dataset (in which the number of observations is larger than the number of variables). In this case, it is possible to fit an initial model with no penalization and then use its estimates as weights for an adaptive model. We consider two alternatives: an unpenalized linear model and an unpenalized quantile regression model. The usage of either one is determined by the parameter `model` (a quantile regression sketch follows the linear example below).\n",
-    "* `model='qr'`\n",
+    "* `model='lm'`\n",
     "* `penalization=asgl`\n",
     "* `weight_technique='unpenalized'`. It refers to using an unpenalized model.\n",
-    "* `lasso_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n",
-    "* `gl_power_weight=[0.8, 1, 1.2]`. As defined for PCA based on a subset of components.\n"
+    "* `individual_power_weight=1`. As defined for PCA based on a subset of components.\n",
+    "* `group_power_weight=1`. As defined for PCA based on a subset of components.\n"
   ] }, {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Obtain weight values\n",
-    "model='qr'\n",
-    "penalization = 'asgl'\n",
-    "weight_technique = 'unpenalized'\n",
-    "lasso_power_weight = [0.8, 1, 1.2]\n",
-    "gl_power_weight = [0.8, 1, 1.2]\n",
-    "\n",
-    "weights = asgl.WEIGHTS(model=model, penalization=penalization, weight_technique=weight_technique, lasso_power_weight=lasso_power_weight,\n",
-    "                       gl_power_weight=gl_power_weight)\n",
-    "lasso_weights, gl_weights = weights.fit(x, y, group_index=group_index)\n",
-    "\n",
-    "model='lm'\n",
-    "weights_qr = asgl.WEIGHTS(model=model, penalization=penalization, weight_technique=weight_technique, lasso_power_weight=lasso_power_weight,\n",
-    "                       gl_power_weight=gl_power_weight)\n",
-    "lasso_weights, gl_weights = weights_qr.fit(x, y, group_index=group_index)"
+    "asgl_model = Regressor(model='lm', penalization='asgl', lambda1=0.1, alpha=0.5, weight_technique='unpenalized', individual_power_weight=1, group_power_weight=1)\n",
+    "asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = asgl_model.coef_"
   ] },
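+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The quantile regression alternative is sketched below. This is a minimal added sketch, not one of the original examples: it assumes the same data objects as above and the `quantile` parameter introduced earlier in this guide, and the `qr_asgl_model` name is purely illustrative.\n"
+   ] }, {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch: unpenalized quantile regression estimates (median, quantile=0.5) as adaptive weights\n",
+    "qr_asgl_model = Regressor(model='qr', quantile=0.5, penalization='asgl', lambda1=0.1, alpha=0.5, weight_technique='unpenalized', individual_power_weight=1, group_power_weight=1)\n",
+    "qr_asgl_model.fit(X=X, y=y, group_index=group_index)\n",
+    "coef = qr_asgl_model.coef_"
+   ] },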
  {
@@ -995,123 +1249,11 @@
     "\n",
     "* $\gamma_1$ and $\gamma_2$: These parameters are the powers applied to weights in adaptive penalizations. Usually, these are defined in the interval $[0, 2]$"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Main functions on the package\n",
-    "\n",
-    "There are four main functions in this package:\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### `asgl.ASGL.fit`\n",
-    "This function is used for fitting any model (unpenalized or penalized) on a dataset. This function is included inside the `ASGL` class object, whose parameters are:\n",
-    "\n",
-    "* `model`: model to be fit (accepts `'lm'` or `'qr'`)\n",
-    "* `penalization`: penalization to use (accepts `None`, `'lasso'`, `'gl'`, `'sgl'`, `'asgl'`, `'asgl_lasso'`, `'asgl_gl'`)\n",
-    "* `intercept`: boolean, wheter to fit the model including intercept or not\n",
-    "* `tol`: tolerance for a coefficient in the model to be considered as 0\n",
-    "* `lambda1`: parameter value that controls the level of shrinkage applied on penalizations\n",
-    "* `alpha`: parameter value, tradeoff between lasso and group lasso in sgl penalization\n",
-    "* `tau`: quantile level in quantile regression models\n",
-    "* `lasso_weights`: lasso weights in adaptive penalizations\n",
-    "* `gl_weights`: group lasso weights in adaptive penalizations\n",
-    "* `parallel`: boolean, wheter to execute the code in parallel or sequentially\n",
-    "* `num_cores`: if parallel is set to true, the number of cores to use in the execution. Default is (max - 1)\n",
-    "* `solver`: solver to be used by CVXPY. Default uses optimal alternative depending on the problem\n",
-    "* `max_iters`: CVXPY parameter. Default is 500\n",
-    "\n",
-    "The function itself requires (in adition to the class object input parameters), the following input:\n",
-    "* `x`: A covariates data matrix\n",
-    "* `y`: A response vector\n",
-    "* `group_index`: The group structure (required only if a group based penalization is going to be used)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### `asgl.WEIGHTS.fit`\n",
-    "\n",
-    "This function is used for fitting weights used later by adaptive penalizations. This function is included inside the `WEIGHTS` class object, whose input parameter values of this function are:\n",
-    "\n",
-    "* `model`: model to be fit using these weights (accepts `'lm'` or `'qr'`)\n",
-    "* `penalization`: penalization to use (`'asgl'`, `'asgl_lasso'`, `'asgl_gl'`)\n",
-    "* `tau`: quantile level in quantile regression models\n",
-    "* `weight_technique`: weight technique to use for fitting the adaptive weights. Accepts `'pca_1'`, `'pca_pct'`, `'pls_1'`, `'pls_pct'`, `'unpenalized'`, `'spca'`\n",
-    "* `weight_tol`: Tolerance value used for avoiding ZeroDivision errors\n",
-    "* `lasso_power_weight`: parameter value, power at which the lasso weights are risen. Default is 1\n",
-    "* `gl_power_weight`: parameter value, power at which the group lasso weights are risen. Default is 1\n",
-    "* `variability_pct`: parameter value, percentage of variability explained by pca or pls components used in `'pca_pct'`, `'pls_pct'` and `'spca'`. Default is 0.9 (90%)\n",
-    "* `spca_alpha`: sparse PCA parameter. Default is $10^{-5}$\n",
-    "* `spca_ridge_alpha`: sparse PCA parameter. Default is $10^{-2}$\n",
-    "\n",
-    "The function itself requires (in adition to the class object input parameters), the following input:\n",
-    "* `x`: A covariates data matrix\n",
-    "* `y`: A response vector\n",
-    "* `group_index`: The group structure (required only if a group based penalization is going to be used)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### `asgl.CV.cross_validation`\n",
-    "\n",
-    "This function performs cross validation over a dataset in order to obtain optimal parameter values for a model. This function is part of the `CV` clas object whose input parameter values of this function are:\n",
-    "\n",
-    "* All the parameters described for `asgl.ASGL.fit` and `asgl.WEIGHTS.fit` methods\n",
-    "* `error_type`: error measurement to use. Accepts:\n",
-    "    * `'MSE'`: mean squared error\n",
-    "    * `'MAE'`: mean absolute error\n",
-    "    * `'MDAE'`: mean absolute deviation error\n",
-    "    * `'QRE'`: quantile regression error\n",
-    "* `random_state`: random state value in case reproducible data splits are required\n",
-    "* `nfolds`: number of folds in which the dataset should be split. Default value is 5\n",
-    "\n",
-    "The function itself requires (in adition to the class object input parameters), the following input:\n",
-    "* `x`: A covariates data matrix\n",
-    "* `y`: A response vector\n",
-    "* `group_index`: The group structure (required only if a group based penalization is going to be used)\n",
-    "* `split_index`: Default is `None`. A parameter containing a group structure used in the splitting process of the data and used only if `GroupKFold` is required. 
\n", - "\n" - ] - }, - { - "source": [ - "### `asgl.TVT.train_validate_test`\n", - "\n", - "This function performs cross validation over a dataset in order to obtain optimal parameter values for a model. This function is part of the `TVT` clas object whose input parameter values of this function are:\n", - "\n", - "* All the parameters described for `asgl.ASGL.fit` and `asgl.WEIGHTS.fit` methods\n", - "* `error_type`: error measurement to use. Accepts:\n", - " * `'MSE'`: mean squared error\n", - " * `'MAE'`: mean absolute error\n", - " * `'MDAE'`: mean absolute deviation error\n", - " * `'QRE'`: quantile regression error\n", - "* `random_state`: random state value in case reproducible data splits are required\n", - "* `train_pct`: Default$=0.05$. Percentage of data used in the training process.\n", - "* `train_size`: Default is `None`. Number of observations used in the tranining process. This parameter overrides `train_pct`\n", - "* `validate_pct`: Default$=0.05$. Percentage of data used in the validation process.\n", - "* `validate_size`: Default is `None`. Number of observations used in the tranining process. This parameter overrides `validate_pct`\n", - "\n", - "The function itself requires (in adition to the class object input parameters), the following input:\n", - "* `x`: A covariates data matrix\n", - "* `y`: A response vector\n", - "* `group_index`: The group structure (required only if a group based penalization is going to be used) " - ], - "cell_type": "markdown", - "metadata": {} } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1125,9 +1267,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}