diff --git a/02_random_forest_benchmark.ipynb b/02_random_forest_benchmark.ipynb new file mode 100644 index 0000000..d973d5f --- /dev/null +++ b/02_random_forest_benchmark.ipynb @@ -0,0 +1,613 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qs6yQrz3Jjfw" + }, + "source": [ + "# California Housing Random Forest Benchmark\n", + "\n", + "The following notebook tests how quickly a given device can train models on the California Housing dataset (using features of an area to predict median house value) with a Random Forest model, `RandomizedSearchCV` and 5-fold cross-validation.\n", + "\n", + "It's designed to be a simple test for comparing Apple's M1 chips (base, Pro, Max) to each other and to other sources of compute.\n", + "\n", + "| Model | Dataset | Dataset Size |\n", + "| ----- | ----- | ----- |\n", + "| Random Forest (Scikit-Learn) + Random Search + Cross-validation | [California Housing](https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset) | ~20,000 samples, 8 features, 1 target variable |\n", + "\n", + "## Resources\n", + "* Code on GitHub: https://github.com/mrdbourke/m1-machine-learning-test\n", + "* Code in this notebook adapted from: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2BLeMkHQNt6m" + }, + "source": [ + "## Setup Hyperparameters\n", + "\n", + "The main hyperparameter we're concerned with is what device this test is running on.\n", + "\n", + "Since it'll be run on many different machines, we'll note the current one here.\n", + "\n", + "We'll also list the dataset name and other attributes of the data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "k7Eh_M2aN2tc" + }, + "outputs": [], + "source": [ + "BATCH_SIZE = None \n", + "EPOCHS = None \n", + "DATASET_NAME = \"california_housing\" \n", + "DEVICE = \"Intel_Mac\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w-4eniIdKzze" + }, + "source": [ + "## Get helper functions and import dependencies\n", + "\n", + "The cell below downloads the helper functions if necessary (if running this notebook in Google Colab, it's easier to download a single file than clone the whole repo). A minimal stand-in for the one helper we use is sketched below in case the download isn't possible."
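For reference, a minimal stand-in for `print_train_time`, the only helper this notebook imports. Its signature and output format are inferred from how the notebook calls it and from the printed output; the actual implementation in `helper_functions.py` may differ.

```python
# Hypothetical stand-in for helper_functions.print_train_time -- inferred
# from this notebook's usage ("Train time on Intel_Mac: 20.424 seconds");
# the repo's real version may differ.
def print_train_time(start: float, end: float, device: str = None) -> float:
    """Print and return how long a run took on a given device."""
    total_time = end - start
    print(f"\nTrain time on {device}: {total_time:.3f} seconds")
    return total_time
```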
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bhU15T2EKyMH", + "outputId": "550558ac-fb87-40e6-8830-72311293fe01" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Helper functions already downloaded, skipping redownload.\n" + ] + } + ], + "source": [ + "# Get helper functions\n", + "import os\n", + "import requests\n", + "\n", + "if not os.path.exists(\"helper_functions.py\"):\n", + " print(\"Downloading helper functions...\")\n", + " r = requests.get(\"https://raw.githubusercontent.com/mrdbourke/m1-machine-learning-test/main/helper_functions.py\")\n", + " print(\"Writing helper functions to file...\")\n", + " with open(\"helper_functions.py\", \"wb\") as f:\n", + " f.write(r.content)\n", + "else:\n", + " print(\"Helper functions already downloaded, skipping redownload.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "kuxrbACvJ4-W" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from timeit import default_timer as timer \n", + "from helper_functions import print_train_time\n", + "\n", + "# Get California Housing dataset\n", + "from sklearn.datasets import fetch_california_housing\n", + "housing = fetch_california_housing()\n", + "housing; # downloads as a dictionary-like Bunch object (semicolon suppresses the output)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1ZA2nETFLbSD" + }, + "source": [ + "## View data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 223 + }, + "id": "vfdAqhJ-La1F", + "outputId": "62e17d09-dcab-43a5-d2f2-1097fa47fac2" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudetarget
08.325241.06.9841271.023810322.02.55555637.88-122.234.526
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585
27.257452.08.2881361.073446496.02.80226037.85-122.243.521
35.643152.05.8173521.073059558.02.54794537.85-122.253.413
43.846252.06.2818531.081081565.02.18146737.85-122.253.422
\n", + "
" + ], + "text/plain": [ + " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", + "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", + "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", + "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", + "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", + "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", + "\n", + " Longitude target \n", + "0 -122.23 4.526 \n", + "1 -122.22 3.585 \n", + "2 -122.24 3.521 \n", + "3 -122.25 3.413 \n", + "4 -122.25 3.422 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Setup dataframe\n", + "housing_df = pd.DataFrame(housing[\"data\"], columns=housing[\"feature_names\"])\n", + "housing_df[\"target\"] = pd.Series(housing[\"target\"])\n", + "housing_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Apw6iww4Mat9" + }, + "source": [ + "## Setup Random Search Grid\n", + "\n", + "To make the time a little longer, we'll fit 5 random combinations of hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "ENkNlKHSMZaf" + }, + "outputs": [], + "source": [ + "# Hyperparameter grid RandomizedSearchCV will search over\n", + "grid = {\"n_estimators\": [100, 200, 500],\n", + " \"max_depth\": [None, 5, 10, 20],\n", + " \"max_features\": [\"auto\", \"sqrt\"],\n", + " \"min_samples_split\": [2, 4],\n", + " \"min_samples_leaf\": [1, 2]}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eYLiBUFPLfIX" + }, + "source": [ + "## Model data\n", + "\n", + "We'll use Scikit-Learn's Random Forest model to model the data with `n_jobs=-1` to use as many processors as possible.\n", + "\n", + "The model will be:\n", + "* [`RandomForestRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) from Scikit-Learn\n", + "* We'll use [`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) to search for different hyperparameters (this will ensure the modelling takes a little longer)\n", + " * For each different set of hyperparameters, we'll do 5-fold cross-validation (fitting the same model 5x on different splits of data to again take more time)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Fik2wBn0OAU-" + }, + "outputs": [], + "source": [ + "# Import the RandomForestRegressor model class from the ensemble module\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "# Import data splitting and random search CV function\n", + "from sklearn.model_selection import RandomizedSearchCV, train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "58C5PdaOLWk-", + "outputId": "17a286ef-711f-4161-ef1f-7fb035cc3711" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 5 candidates, totalling 25 fits\n", + "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 2.2s\n", + "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 1.2s\n", + "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 1.2s\n", + "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 1.3s\n", + "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 1.2s\n", + "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.5s\n", + "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.5s\n", + "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.6s\n", + "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.6s\n", + "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.6s\n", + "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 0.6s\n", + "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 0.6s\n", + "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 0.6s\n", + "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 0.6s\n", + "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 0.7s\n", + "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.6s\n", + "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.6s\n", + "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.5s\n", + "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.5s\n", + "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.5s\n", + "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.8s\n", + "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.8s\n", + "[CV] END 
max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.8s\n", + "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.7s\n", + "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.7s\n", + "\n", + "Train time on Intel_Mac: 20.424 seconds\n" + ] + } + ], + "source": [ + "# Start time\n", + "start_time = timer()\n", + "\n", + "# Setup random seed\n", + "np.random.seed(42)\n", + "\n", + "# Create the data\n", + "X = housing_df.drop(\"target\", axis=1)\n", + "y = housing_df[\"target\"]\n", + "\n", + "# Split into train and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", + "\n", + "# Instantiate the model (the random search below fits it on the training set)\n", + "model = RandomForestRegressor(n_jobs=-1) # set to use all processors\n", + "\n", + "# Setup RandomizedSearchCV\n", + "rs_model = RandomizedSearchCV(estimator=model,\n", + " param_distributions=grid,\n", + " n_iter=5, # try 5 models total\n", + " cv=5, # 5-fold cross-validation\n", + " verbose=2) # print out results\n", + "\n", + "# Fit the random search model\n", + "rs_model.fit(X_train, y_train)\n", + "\n", + "# End timer\n", + "end_time = timer()\n", + "train_time = print_train_time(start_time, \n", + " end_time, \n", + " device=DEVICE)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "59qrQd_bNjp6", + "outputId": "ca691b85-03e6-498a-a0ee-5cb6c8b8d17b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8128037523299034" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check the score of the model (on the test set)\n", + "# The default score metric of Scikit-Learn regression algorithms is R^2\n", + "# (1.0 = perfect predictions, 0.0 = no better than predicting the mean)\n", + "rs_model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J3NBLNr5OFvG", + "outputId": "19f217cc-eb7b-4213-e26e-ff7623591558" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_estimators': 200,\n", + " 'min_samples_split': 4,\n", + " 'min_samples_leaf': 2,\n", + " 'max_features': 'sqrt',\n", + " 'max_depth': None}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inspect the best hyperparameters found by RandomizedSearchCV\n", + "rs_model.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RkgK2AXhOIdI" + }, + "source": [ + "## Track results and save to file" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 117 + }, + "id": "zVMa2QXpOJ71", + "outputId": "4db47e9a-b3e6-4ebb-c469-01fcdec25d7b" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
devicedataset_nameepochsbatch_sizenum_train_samplesnum_test_samplestotal_train_timetime_per_epochmodel
0Intel_Maccalifornia_housingNoneNone16512412820.424NoneRandomForestCV
\n", + "
" + ], + "text/plain": [ + " device dataset_name epochs batch_size num_train_samples \\\n", + "0 Intel_Mac california_housing None None 16512 \n", + "\n", + " num_test_samples total_train_time time_per_epoch model \n", + "0 4128 20.424 None RandomForestCV " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = {\n", + " \"device\": DEVICE,\n", + " \"dataset_name\": DATASET_NAME,\n", + " \"epochs\": EPOCHS,\n", + " \"batch_size\": BATCH_SIZE,\n", + " \"num_train_samples\": len(X_train),\n", + " \"num_test_samples\": len(X_test),\n", + " \"total_train_time\": round(train_time, 3),\n", + " \"time_per_epoch\": None,\n", + " \"model\": \"RandomForestCV\"\n", + " }\n", + "results_df = pd.DataFrame(results, index=[0])\n", + "results_df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "wygyBdUIQXde" + }, + "outputs": [], + "source": [ + "# Write CSV to file\n", + "if not os.path.exists(\"results/\"):\n", + " os.makedirs(\"results/\")\n", + "\n", + "results_df.to_csv(f\"results/{DATASET_NAME}_{DEVICE}.csv\")" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyP24rAyT5OSamS0ka8ho9C9", + "collapsed_sections": [], + "include_colab_link": true, + "name": "03_random_forest_benchmark.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/03_random_forest_benchmark.ipynb b/03_random_forest_benchmark.ipynb deleted file mode 100644 index 3c0b6b8..0000000 --- a/03_random_forest_benchmark.ipynb +++ /dev/null @@ -1,597 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "03_random_forest_benchmark.ipynb", - "provenance": [], - "collapsed_sections": [], - "authorship_tag": "ABX9TyP24rAyT5OSamS0ka8ho9C9", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qs6yQrz3Jjfw" - }, - "source": [ - "# California Housing Random Forest Benchmark\n", - "\n", - "The following notebook tests the speed at which a given device can perform training iterations on the California Housing dataset (use features of an area to predict median house value) using a Random Forest Model, `RandomizedSearchCV` and 5 folds of cross-validation.\n", - "\n", - "It's designed to be a simple test to compare Apple's M1 (normal, Pro, Max) to each other and other sources of compute.\n", - "\n", - "| Model | Dataset | Dataset Size |\n", - "| ----- | ----- | ----- |\n", - "| Random Forest (Scikit-Learn) + Random Search + Cross-validation | [California Housing](https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset) | ~20,000 samples, 8 features, 1 target variable |\n", - "\n", - "## Resources\n", - "* Code on GitHub: https://github.com/mrdbourke/m1-machine-learning-test\n", - "* Code in this notebook adapted from: 
https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2BLeMkHQNt6m" - }, - "source": [ - "## Setup Hyperparameters\n", - "\n", - "The main hyperparameter we're concerned with is what device this test is running on.\n", - "\n", - "Since it'll be many different machines, we'll note the current one here.\n", - "\n", - "We'll also list the dataset name and other attributes about the data." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "k7Eh_M2aN2tc" - }, - "source": [ - "BATCH_SIZE = None \n", - "EPOCHS = None \n", - "DATASET_NAME = \"california_housing\" \n", - "DEVICE = \"Google Colab\"" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w-4eniIdKzze" - }, - "source": [ - "## Get helper functions and import dependencies\n", - "\n", - "The function below downloads the helper functions if necessary (if running this notebook in Google Colab, it's easier to download a single file than clone the whole repo)." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bhU15T2EKyMH", - "outputId": "550558ac-fb87-40e6-8830-72311293fe01" - }, - "source": [ - "# Get helper functions\n", - "import os\n", - "import requests\n", - "\n", - "if not os.path.exists(\"helper_functions.py\"):\n", - " print(\"Downloading helper functions...\")\n", - " r = requests.get(\"https://raw.githubusercontent.com/mrdbourke/m1-machine-learning-test/main/helper_functions.py\")\n", - " print(\"Writing helper functions to file...\")\n", - " open(\"helper_functions.py\", \"wb\").write(r.content)\n", - "else:\n", - " print(\"Helper functions already downloaded, skipping redownload.\")" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Helper functions already downloaded, skipping redownload.\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kuxrbACvJ4-W" - }, - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from timeit import default_timer as timer \n", - "from helper_functions import print_train_time\n", - "\n", - "# Get California Housing dataset\n", - "from sklearn.datasets import fetch_california_housing\n", - "housing = fetch_california_housing()\n", - "housing; # gets downloaded as dictionary" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1ZA2nETFLbSD" - }, - "source": [ - "## View data" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 223 - }, - "id": "vfdAqhJ-La1F", - "outputId": "62e17d09-dcab-43a5-d2f2-1097fa47fac2" - }, - "source": [ - "# Setup dataframe\n", - "housing_df = pd.DataFrame(housing[\"data\"], columns=housing[\"feature_names\"])\n", - "housing_df[\"target\"] = pd.Series(housing[\"target\"])\n", - "housing_df.head()" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudetarget
08.325241.06.9841271.023810322.02.55555637.88-122.234.526
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585
27.257452.08.2881361.073446496.02.80226037.85-122.243.521
35.643152.05.8173521.073059558.02.54794537.85-122.253.413
43.846252.06.2818531.081081565.02.18146737.85-122.253.422
\n", - "
" - ], - "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms ... AveOccup Latitude Longitude target\n", - "0 8.3252 41.0 6.984127 1.023810 ... 2.555556 37.88 -122.23 4.526\n", - "1 8.3014 21.0 6.238137 0.971880 ... 2.109842 37.86 -122.22 3.585\n", - "2 7.2574 52.0 8.288136 1.073446 ... 2.802260 37.85 -122.24 3.521\n", - "3 5.6431 52.0 5.817352 1.073059 ... 2.547945 37.85 -122.25 3.413\n", - "4 3.8462 52.0 6.281853 1.081081 ... 2.181467 37.85 -122.25 3.422\n", - "\n", - "[5 rows x 9 columns]" - ] - }, - "metadata": {}, - "execution_count": 4 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Apw6iww4Mat9" - }, - "source": [ - "## Setup Random Search Grid\n", - "\n", - "To make the time a little longer, we'll fit 5 random combinations of hyperparameters." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ENkNlKHSMZaf" - }, - "source": [ - "# Hyperparameter grid RandomizedSearchCV will search over\n", - "grid = {\"n_estimators\": [100, 200, 500],\n", - " \"max_depth\": [None, 5, 10, 20],\n", - " \"max_features\": [\"auto\", \"sqrt\"],\n", - " \"min_samples_split\": [2, 4],\n", - " \"min_samples_leaf\": [1, 2]}" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eYLiBUFPLfIX" - }, - "source": [ - "## Model data\n", - "\n", - "We'll use Scikit-Learn's Random Forest model to model the data with `n_jobs=-1` to use as many processors as possible.\n", - "\n", - "The model will be:\n", - "* [`RandomForestRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) from Scikit-Learn\n", - "* We'll use [`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) to search for different hyperparameters (this will ensure the modelling takes a little longer)\n", - " * For each different set of hyperparameters, we'll do 5-fold cross-validation (fitting the same model 5x on different splits of data to again take more time)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Fik2wBn0OAU-" - }, - "source": [ - "# Import the RandomForestRegressor model class from the ensemble module\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "# Import data splitting and random search CV function\n", - "from sklearn.model_selection import RandomizedSearchCV, train_test_split" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "58C5PdaOLWk-", - "outputId": "17a286ef-711f-4161-ef1f-7fb035cc3711" - }, - "source": [ - "# Start time\n", - "start_time = timer()\n", - "\n", - "# Setup random seed\n", - "np.random.seed(42)\n", - "\n", - "# Create the data\n", - "X = housing_df.drop(\"target\", axis=1)\n", - "y = housing_df[\"target\"]\n", - "\n", - "# Split into train and test sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n", - "\n", - "# Institate and fit the model (on the training set)\n", - "model = RandomForestRegressor(n_jobs=-1) # set to use all processors\n", - "\n", - "# Setup RandomizedSearchCV\n", - "rs_model = RandomizedSearchCV(estimator=model,\n", - " param_distributions=grid,\n", - " n_iter=5, # try 5 models total\n", - " cv=5, # 5-fold cross-validation\n", - " verbose=2) # print out results\n", - "\n", - "# Fit the random search model\n", - "rs_model.fit(X_train, y_train)\n", - "\n", - "# End timer\n", - "end_time = timer()\n", - "train_time = print_train_time(start_time, \n", - " end_time, \n", - " device=DEVICE)" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Fitting 5 folds for each of 5 candidates, totalling 25 fits\n", - "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 13.6s\n", - "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 12.4s\n", - "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 12.3s\n", - "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 12.2s\n", - "[CV] END max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 12.4s\n", - "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.3s\n", - "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.2s\n", - "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.2s\n", - "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.3s\n", - "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.3s\n", - "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 4.4s\n", - "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 4.2s\n", - "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 4.3s\n", - "[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 4.3s\n", - "[CV] END max_depth=5, max_features=sqrt, 
min_samples_leaf=1, min_samples_split=4, n_estimators=500; total time= 4.2s\n", - "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.5s\n", - "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.5s\n", - "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.4s\n", - "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.5s\n", - "[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 4.3s\n", - "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 6.7s\n", - "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 6.7s\n", - "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 6.7s\n", - "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 6.8s\n", - "[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 6.7s\n", - "\n", - "Train time on Google Colab: 166.493 seconds\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "59qrQd_bNjp6", - "outputId": "ca691b85-03e6-498a-a0ee-5cb6c8b8d17b" - }, - "source": [ - "# Check the score of the model (on the test set)\n", - "# The default score metirc of regression aglorithms is R^2\n", - "rs_model.score(X_test, y_test)" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0.8128037523299034" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "J3NBLNr5OFvG", - "outputId": "19f217cc-eb7b-4213-e26e-ff7623591558" - }, - "source": [ - "# Find the best hyperparameters found by RandomizedSearchCV\n", - "rs_model.best_params_" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'max_depth': None,\n", - " 'max_features': 'sqrt',\n", - " 'min_samples_leaf': 2,\n", - " 'min_samples_split': 4,\n", - " 'n_estimators': 200}" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RkgK2AXhOIdI" - }, - "source": [ - "## Track results and save to file" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 117 - }, - "id": "zVMa2QXpOJ71", - "outputId": "4db47e9a-b3e6-4ebb-c469-01fcdec25d7b" - }, - "source": [ - "results = {\n", - " \"device\": DEVICE,\n", - " \"dataset_name\": DATASET_NAME,\n", - " \"epochs\": EPOCHS,\n", - " \"batch_size\": BATCH_SIZE,\n", - " \"num_train_samples\": len(X_train),\n", - " \"num_test_samples\": len(X_test),\n", - " \"total_train_time\": round(train_time, 3),\n", - " \"time_per_epoch\": None,\n", - " \"model\": \"RandomForestCV\"\n", - " }\n", - "results_df = pd.DataFrame(results, index=[0])\n", - "results_df" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
devicedataset_nameepochsbatch_sizenum_train_samplesnum_test_samplestotal_train_timetime_per_epochmodel
0Google Colabcalifornia_housingNoneNone165124128166.493NoneRandomForestCV
\n", - "
" - ], - "text/plain": [ - " device dataset_name ... time_per_epoch model\n", - "0 Google Colab california_housing ... None RandomForestCV\n", - "\n", - "[1 rows x 9 columns]" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wygyBdUIQXde" - }, - "source": [ - "# Write CSV to file\n", - "if not os.path.exists(\"results/\"):\n", - " os.makedirs(\"results/\")\n", - "\n", - "results_df.to_csv(f\"results/{DATASET_NAME}_{DEVICE}.csv\")" - ], - "execution_count": 11, - "outputs": [] - } - ] -} \ No newline at end of file