diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..be48cf5
Binary files /dev/null and b/.DS_Store differ
diff --git a/.ipynb_checkpoints/Linear Regression team TS2-Copy1-checkpoint.ipynb b/.ipynb_checkpoints/Linear Regression team TS2-Copy1-checkpoint.ipynb
new file mode 100644
index 0000000..c57dbef
--- /dev/null
+++ b/.ipynb_checkpoints/Linear Regression team TS2-Copy1-checkpoint.ipynb
@@ -0,0 +1,1536 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Regression Model that Predicts Apple Prices Based on Historical Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Table Of Contents\n",
+ "### 1. [Introduction](#introduction)\n",
+ "\n",
+ " 1. Objective\n",
+ " 2. Parameters\n",
+ " 3. Outline\n",
+ " \n",
+ "### 2. [Importing Data and Plotting](#import)\n",
+ "\n",
+ " 1. Import necessary packages\n",
+ " 2. Import the data into a Pandas Dataframe\n",
+ " 3. Show the data\n",
+ " 4. Make an initial plot of the data\n",
+ "\n",
+ "### 3. [Exploratory Data Analysis](#explore)\n",
+ "### 4. [Split Data: Testing and training](#split)\n",
+ "### 5. [Outliers](#outliers)\n",
+ "### 6. [Regression Model](#regress)\n",
+ " 1. Taking estimates\n",
+ " 2. Least squares\n",
+ " 3. sklearn\n",
+ "### 7. [Conclusion](#conclude)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Introduction "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.1. Objective \n",
+ "In this notebook we will design a regression model that will predict the cost of apples based on given parameters."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.2. Parameters\n",
+ "1. Month/Season\n",
+ "2. Distance travelled\n",
+ "3. Supplier cost\n",
+ "4. Grade of apple\n",
+ "5. Demand and Supply\n",
+ "6. Container used?\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Importing Data and Plotting "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.1. Import necessary packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# These packages will be mainly used for data wrangling\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# These packages will be mostly used for plotting the data\n",
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# These packages will be mostly used to build our linear regression model so that we can make predictions from it.\n",
+ "import statsmodels as sm\n",
+ "import sklearn as skl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.2. Import the data into a Pandas Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#import train and test sets into DataFrames\n",
+ "sample_submission = pd.DataFrame(pd.read_csv(\"sample_submission.csv\"))\n",
+ "test_set = pd.DataFrame(pd.read_csv(\"df-test_set.csv\"))\n",
+ "train_set = pd.DataFrame(pd.read_csv(\"df-train_set.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Filter for Commodity of interest in train set\n",
+ "train_set = train_set[train_set[\"Commodities\"]==\"APPLE GOLDEN DELICIOUS\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.3. Show the data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.1 Showing the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-09 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 8.51 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " CAPE | \n",
+ " JG110 | \n",
+ " 2M | \n",
+ " 11.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-14 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " JE090 | \n",
+ " 2S | \n",
+ " 9.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-16 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 6.11 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1S | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 4.51 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " EASTERN CAPE | \n",
+ " IA400 | \n",
+ " 1S | \n",
+ " 400.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-28 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg \\\n",
+ "1 CAPE M4183 1L 18.3 \n",
+ "7 CAPE JG110 2M 11.0 \n",
+ "24 W.CAPE-BERGRIVER ETC JE090 2S 9.0 \n",
+ "40 CAPE M4183 1S 18.3 \n",
+ "69 EASTERN CAPE IA400 1S 400.0 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "1 APPLE GOLDEN DELICIOUS 2020-09-09 150.0 170.0 51710.0 \n",
+ "7 APPLE GOLDEN DELICIOUS 2020-04-14 50.0 50.0 16000.0 \n",
+ "24 APPLE GOLDEN DELICIOUS 2020-04-16 55.0 55.0 990.0 \n",
+ "40 APPLE GOLDEN DELICIOUS 2020-05-04 80.0 120.0 32020.0 \n",
+ "69 APPLE GOLDEN DELICIOUS 2020-09-28 1800.0 1800.0 1800.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "1 332 6075.6 822 8.51 \n",
+ "7 320 3520.0 0 4.55 \n",
+ "24 18 162.0 1506 6.11 \n",
+ "40 388 7100.4 443 4.51 \n",
+ "69 1 400.0 2 4.50 "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our train_set dataframe.\n",
+ "train_set.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(64376, 13)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The dataframe has 64376 rows and 13 columns.\n",
+ "train_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 64376 entries, 0 to 64375\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Province 64376 non-null object \n",
+ " 1 Container 64376 non-null object \n",
+ " 2 Size_Grade 64376 non-null object \n",
+ " 3 Weight_Kg 64376 non-null float64\n",
+ " 4 Commodities 64376 non-null object \n",
+ " 5 Date 64376 non-null object \n",
+ " 6 Low_Price 64376 non-null float64\n",
+ " 7 High_Price 64376 non-null float64\n",
+ " 8 Sales_Total 64376 non-null float64\n",
+ " 9 Total_Qty_Sold 64376 non-null int64 \n",
+ " 10 Total_Kg_Sold 64376 non-null float64\n",
+ " 11 Stock_On_Hand 64376 non-null int64 \n",
+ " 12 avg_price_per_kg 64376 non-null float64\n",
+ "dtypes: float64(6), int64(2), object(5)\n",
+ "memory usage: 6.4+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "train_set.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The info summary above shows 64376 entries with the following data types: six float columns, two integer columns, and five object columns. No column contains null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1M | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-07-09 | \n",
+ " 128.0 | \n",
+ " 136.0 | \n",
+ " 5008.0 | \n",
+ " 38 | \n",
+ " 456.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1X | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-01-20 | \n",
+ " 220.0 | \n",
+ " 220.0 | \n",
+ " 1760.0 | \n",
+ " 8 | \n",
+ " 146.4 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1S | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-08-19 | \n",
+ " 120.0 | \n",
+ " 120.0 | \n",
+ " 720.0 | \n",
+ " 6 | \n",
+ " 72.0 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1M | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-06 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 140.0 | \n",
+ " 160.0 | \n",
+ " 14140.0 | \n",
+ " 100 | \n",
+ " 1830.0 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Province Container Size_Grade Weight_Kg \\\n",
+ "0 1 W.CAPE-BERGRIVER ETC EC120 1M 12.0 \n",
+ "1 2 W.CAPE-BERGRIVER ETC M4183 1X 18.3 \n",
+ "2 3 W.CAPE-BERGRIVER ETC EC120 1S 12.0 \n",
+ "3 4 W.CAPE-BERGRIVER ETC M4183 1M 18.3 \n",
+ "4 5 W.CAPE-BERGRIVER ETC M4183 1L 18.3 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "0 APPLE GOLDEN DELICIOUS 2020-07-09 128.0 136.0 5008.0 \n",
+ "1 APPLE GOLDEN DELICIOUS 2020-01-20 220.0 220.0 1760.0 \n",
+ "2 APPLE GOLDEN DELICIOUS 2020-08-19 120.0 120.0 720.0 \n",
+ "3 APPLE GOLDEN DELICIOUS 2020-05-06 160.0 160.0 160.0 \n",
+ "4 APPLE GOLDEN DELICIOUS 2020-05-04 140.0 160.0 14140.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "0 38 456.0 0 \n",
+ "1 8 146.4 2 \n",
+ "2 6 72.0 45 \n",
+ "3 1 18.3 8 \n",
+ "4 100 1830.0 19 "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our test_set dataframe.\n",
+ "test_set.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 64376.000000 | \n",
+ " 64376.000000 | \n",
+ " 64376.000000 | \n",
+ " 6.437600e+04 | \n",
+ " 64376.000000 | \n",
+ " 64376.000000 | \n",
+ " 64376.000000 | \n",
+ " 64376.00 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 12.781592 | \n",
+ " 75.651938 | \n",
+ " 89.607858 | \n",
+ " 1.939501e+04 | \n",
+ " 446.104402 | \n",
+ " 3336.641295 | \n",
+ " 477.646328 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 35.943052 | \n",
+ " 159.508144 | \n",
+ " 172.223177 | \n",
+ " 4.442192e+04 | \n",
+ " 1184.169758 | \n",
+ " 7682.295441 | \n",
+ " 1453.892091 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.120000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " -5.770000e+04 | \n",
+ " -595.000000 | \n",
+ " -5040.000000 | \n",
+ " -512.000000 | \n",
+ " -inf | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 7.000000 | \n",
+ " 30.000000 | \n",
+ " 35.000000 | \n",
+ " 1.154000e+03 | \n",
+ " 20.000000 | \n",
+ " 175.000000 | \n",
+ " 0.000000 | \n",
+ " 4.02 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 10.000000 | \n",
+ " 46.000000 | \n",
+ " 55.000000 | \n",
+ " 5.400000e+03 | \n",
+ " 107.000000 | \n",
+ " 940.000000 | \n",
+ " 76.000000 | \n",
+ " 6.00 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 11.000000 | \n",
+ " 80.000000 | \n",
+ " 100.000000 | \n",
+ " 1.877200e+04 | \n",
+ " 390.000000 | \n",
+ " 3250.000000 | \n",
+ " 381.000000 | \n",
+ " 8.67 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 500.000000 | \n",
+ " 4400.000000 | \n",
+ " 4400.000000 | \n",
+ " 1.134701e+06 | \n",
+ " 39453.000000 | \n",
+ " 192230.000000 | \n",
+ " 93193.000000 | \n",
+ " inf | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 64376.000000 64376.000000 64376.000000 6.437600e+04 64376.000000 \n",
+ "mean 12.781592 75.651938 89.607858 1.939501e+04 446.104402 \n",
+ "std 35.943052 159.508144 172.223177 4.442192e+04 1184.169758 \n",
+ "min 0.120000 1.000000 1.000000 -5.770000e+04 -595.000000 \n",
+ "25% 7.000000 30.000000 35.000000 1.154000e+03 20.000000 \n",
+ "50% 10.000000 46.000000 55.000000 5.400000e+03 107.000000 \n",
+ "75% 11.000000 80.000000 100.000000 1.877200e+04 390.000000 \n",
+ "max 500.000000 4400.000000 4400.000000 1.134701e+06 39453.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 64376.000000 64376.000000 64376.00 \n",
+ "mean 3336.641295 477.646328 NaN \n",
+ "std 7682.295441 1453.892091 NaN \n",
+ "min -5040.000000 -512.000000 -inf \n",
+ "25% 175.000000 0.000000 4.02 \n",
+ "50% 940.000000 76.000000 6.00 \n",
+ "75% 3250.000000 381.000000 8.67 \n",
+ "max 192230.000000 93193.000000 inf "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " avg_price_per_kg | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " False | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ " 64368 | \n",
+ "
\n",
+ " \n",
+ " True | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg Commodities \\\n",
+ "avg_price_per_kg \n",
+ "False 64368 64368 64368 64368 64368 \n",
+ "True 8 8 8 8 8 \n",
+ "\n",
+ " Date Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "avg_price_per_kg \n",
+ "False 64368 64368 64368 64368 64368 \n",
+ "True 8 8 8 8 8 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "avg_price_per_kg \n",
+ "False 64368 64368 64368 \n",
+ "True 8 8 8 "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Counting the Number of rows with INF\n",
+ "train_set.groupby(np.isinf(train_set['avg_price_per_kg'])).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 64368.000000 | \n",
+ " 64368.000000 | \n",
+ " 64368.000000 | \n",
+ " 6.436800e+04 | \n",
+ " 64368.000000 | \n",
+ " 64368.000000 | \n",
+ " 64368.000000 | \n",
+ " 64368.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 12.775983 | \n",
+ " 75.649130 | \n",
+ " 89.606784 | \n",
+ " 1.939782e+04 | \n",
+ " 446.159847 | \n",
+ " 3337.055990 | \n",
+ " 477.677107 | \n",
+ " 10.465372 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 35.912807 | \n",
+ " 159.508488 | \n",
+ " 172.225175 | \n",
+ " 4.442383e+04 | \n",
+ " 1184.232900 | \n",
+ " 7682.682767 | \n",
+ " 1453.963990 | \n",
+ " 26.126632 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.120000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " -5.770000e+04 | \n",
+ " -595.000000 | \n",
+ " -5040.000000 | \n",
+ " -512.000000 | \n",
+ " -78.670000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 7.000000 | \n",
+ " 30.000000 | \n",
+ " 35.000000 | \n",
+ " 1.155000e+03 | \n",
+ " 20.000000 | \n",
+ " 175.000000 | \n",
+ " 0.000000 | \n",
+ " 4.020000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 10.000000 | \n",
+ " 46.000000 | \n",
+ " 55.000000 | \n",
+ " 5.400000e+03 | \n",
+ " 107.000000 | \n",
+ " 940.000000 | \n",
+ " 76.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 11.000000 | \n",
+ " 80.000000 | \n",
+ " 100.000000 | \n",
+ " 1.877400e+04 | \n",
+ " 390.000000 | \n",
+ " 3253.000000 | \n",
+ " 381.000000 | \n",
+ " 8.670000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 500.000000 | \n",
+ " 4400.000000 | \n",
+ " 4400.000000 | \n",
+ " 1.134701e+06 | \n",
+ " 39453.000000 | \n",
+ " 192230.000000 | \n",
+ " 93193.000000 | \n",
+ " 1250.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 64368.000000 64368.000000 64368.000000 6.436800e+04 64368.000000 \n",
+ "mean 12.775983 75.649130 89.606784 1.939782e+04 446.159847 \n",
+ "std 35.912807 159.508488 172.225175 4.442383e+04 1184.232900 \n",
+ "min 0.120000 1.000000 1.000000 -5.770000e+04 -595.000000 \n",
+ "25% 7.000000 30.000000 35.000000 1.155000e+03 20.000000 \n",
+ "50% 10.000000 46.000000 55.000000 5.400000e+03 107.000000 \n",
+ "75% 11.000000 80.000000 100.000000 1.877400e+04 390.000000 \n",
+ "max 500.000000 4400.000000 4400.000000 1.134701e+06 39453.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 64368.000000 64368.000000 64368.000000 \n",
+ "mean 3337.055990 477.677107 10.465372 \n",
+ "std 7682.682767 1453.963990 26.126632 \n",
+ "min -5040.000000 -512.000000 -78.670000 \n",
+ "25% 175.000000 0.000000 4.020000 \n",
+ "50% 940.000000 76.000000 6.000000 \n",
+ "75% 3253.000000 381.000000 8.670000 \n",
+ "max 192230.000000 93193.000000 1250.000000 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Dropping INF \n",
+ "train_set = train_set.replace([np.inf, -np.inf], np.nan)\n",
+ "\n",
+ "train_set = train_set.dropna(axis = 0)\n",
+ "\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.2. Showing the testing data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(685, 13)"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The dataframe has 685 rows and 13 columns.\n",
+ "test_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 685 entries, 0 to 684\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Index 685 non-null int64 \n",
+ " 1 Province 685 non-null object \n",
+ " 2 Container 685 non-null object \n",
+ " 3 Size_Grade 685 non-null object \n",
+ " 4 Weight_Kg 685 non-null float64\n",
+ " 5 Commodities 685 non-null object \n",
+ " 6 Date 685 non-null object \n",
+ " 7 Low_Price 685 non-null float64\n",
+ " 8 High_Price 685 non-null float64\n",
+ " 9 Sales_Total 685 non-null float64\n",
+ " 10 Total_Qty_Sold 685 non-null int64 \n",
+ " 11 Total_Kg_Sold 685 non-null float64\n",
+ " 12 Stock_On_Hand 685 non-null int64 \n",
+ "dtypes: float64(5), int64(3), object(5)\n",
+ "memory usage: 69.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "test_set.info() "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The info summary above shows 685 entries with the following data types: five float columns, three integer columns, and five object columns. No column contains null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 343.000000 | \n",
+ " 34.142482 | \n",
+ " 164.202891 | \n",
+ " 195.590073 | \n",
+ " 18788.111212 | \n",
+ " 174.883212 | \n",
+ " 2725.402336 | \n",
+ " 439.245255 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 197.886752 | \n",
+ " 87.575995 | \n",
+ " 355.167319 | \n",
+ " 389.109476 | \n",
+ " 33951.586813 | \n",
+ " 299.351142 | \n",
+ " 5059.123311 | \n",
+ " 715.985761 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 1.000000 | \n",
+ " 6.300000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 172.000000 | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 64.000000 | \n",
+ " 1300.000000 | \n",
+ " 13.000000 | \n",
+ " 204.000000 | \n",
+ " 20.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 343.000000 | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 112.000000 | \n",
+ " 5520.000000 | \n",
+ " 62.000000 | \n",
+ " 860.100000 | \n",
+ " 153.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 514.000000 | \n",
+ " 18.300000 | \n",
+ " 128.000000 | \n",
+ " 160.000000 | \n",
+ " 21176.000000 | \n",
+ " 200.000000 | \n",
+ " 3033.000000 | \n",
+ " 516.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 685.000000 | \n",
+ " 400.000000 | \n",
+ " 2400.000000 | \n",
+ " 2400.000000 | \n",
+ " 308010.000000 | \n",
+ " 2774.000000 | \n",
+ " 47200.000000 | \n",
+ " 6827.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Weight_Kg Low_Price High_Price Sales_Total \\\n",
+ "count 685.000000 685.000000 685.000000 685.000000 685.000000 \n",
+ "mean 343.000000 34.142482 164.202891 195.590073 18788.111212 \n",
+ "std 197.886752 87.575995 355.167319 389.109476 33951.586813 \n",
+ "min 1.000000 3.000000 10.000000 10.000000 10.000000 \n",
+ "25% 172.000000 9.000000 50.000000 64.000000 1300.000000 \n",
+ "50% 343.000000 12.000000 80.000000 112.000000 5520.000000 \n",
+ "75% 514.000000 18.300000 128.000000 160.000000 21176.000000 \n",
+ "max 685.000000 400.000000 2400.000000 2400.000000 308010.000000 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "count 685.000000 685.000000 685.000000 \n",
+ "mean 174.883212 2725.402336 439.245255 \n",
+ "std 299.351142 5059.123311 715.985761 \n",
+ "min 1.000000 6.300000 0.000000 \n",
+ "25% 13.000000 204.000000 20.000000 \n",
+ "50% 62.000000 860.100000 153.000000 \n",
+ "75% 200.000000 3033.000000 516.000000 \n",
+ "max 2774.000000 47200.000000 6827.000000 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "test_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After viewing our data, we also viewed the sample submission to confirm that our response variable is the column required in the Kaggle submission file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 13.94 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.30 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index avg_price_per_kg\n",
+ "0 1 13.94\n",
+ "1 2 1.30"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.4. Make an initial plot of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Exploratory Data Analysis \n",
+ "Make at least 7 plots."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.1. Explore the data shape and types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2. Look for null values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Give data descriptions\n",
+ "### 3.3. Is the data univariate or multivariate?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.4. Determine kurtosis and skew"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.5. Consider the distribution of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.6. Look for correlation of multivariate data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Testing the suitability of the data\n",
+ "### 4.1. Testing for linearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.2. Testing for multicollinearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.3. Testing for independence"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.4. Testing for homoscedasticity\n",
+ "Does the magnitude of the residuals increase as the fitted values increase? If so, the residual plot takes on a cone shape, which is called heteroscedasticity. We don’t want that.\n",
+ "### 4.5. Testing for normality"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Check for outliers in residuals \n",
+ "### Plot Cook’s distance\n",
+ "## 6. Build the Regression Model \n",
+ "Consider a tree model\n",
+ "### Results if we follow method 1: Taking Estimates\n",
+ "#### Show our calculations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Slope = -0.04000586109498592\n",
+ "Intercept = 10.976486265786798\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Imported regression model\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "\n",
+ "#Split predictors and response\n",
+ "X = train_set['Weight_Kg']\n",
+ "Y = train_set['avg_price_per_kg']\n",
+ "\n",
+ "#Calculating x bar, y bar\n",
+ "x_bar = np.mean(X)\n",
+ "y_bar = np.mean(Y)\n",
+ "\n",
+ "#Calculating Slope\n",
+ "b = sum((X-x_bar) * (Y-y_bar)) / sum((X-x_bar)**2)\n",
+ "\n",
+ "#Calculating intercept\n",
+ "a = y_bar - b*x_bar\n",
+ "\n",
+ "print(\"Slope = \" + str(b))\n",
+ "print(\"Intercept = \" + str(a))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Plot our results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWQ0lEQVR4nO3df2xdZ33H8fen125qWqo0xKlSJ10CygIpvwpWKOvGqhXmrCCSIVXLNka0dYuGuokfWyBe2WDTIsoyoYK0IoUfI4iuUYCQRqXMZAHEQNDgkJY0TU1M0zW2s8asyujApLbz3R/3ONzY9zrxvdf32uf5vKSre+73nnPP87jN5x4/5/E5igjMzCwNlzW7AWZm1jgOfTOzhDj0zcwS4tA3M0uIQ9/MLCEtzW7AxSxevDhWrFjR7GaYmc0bixcvpqenpyci1k1+b86H/ooVK+jt7W12M8zM5hVJi8vVPbxjZpYQh76ZWUIc+mZmCXHom5klxKFvZpaQOT97pxp7Dw+yvaePoTMjXLewjS1dq9lwY0ezm2Vm1nS5C/29hwfp3nOEkdFxAAbPjNC95wiAg9/Mkpe74Z3tPX3nA3/CyOg423v6mtQiM7O5I3ehP3RmZEZ1M7OU5C70r1vYNqO6mVlKchf6W7pW09ZauKDW1lpgS9fqJrXIzGzuyN2J3ImTtZ69Y2Y2Ve5CH4rB75A3M5vqosM7kj4j6bSkx0pq2yU9IemHkr4saWHJe92S+iX1Seoqqb9W0pHsvY9LUt17Y2Zm07qUMf3PApOvybwfeHlEvBL4EdANIGkNsBG4IdvmXkkTA+yfADYDq7LHlOs8m5nZ7Lpo6EfEt4BnJ9W+FhFj2cvvAcuy5fXArog4GxEngH5graSlwNUR8d2ICOBzwIY69cHMzC5RPWbv/Anw1Wy5AzhZ8t5AVuvIlifXy5K0WVKvpN7h4eE6NNHMzKDG0Jd0FzAG3DdRKrNaTFMvKyJ2RERnRHS2t7fX0kQzMytR9ewdSZuAtwC3ZkM2UDyCX16y2jJgKKsvK1M3M7MGqupIX9I64P3AWyPi5yVv7QM2SlogaSXFE7YHI+IU8Jykm7JZO+8AHqix7WZmNkMXPdKXdD9wC7BY0gDwQYqzdRYA+7OZl9+LiD+PiKOSdgOPUxz2uTMiJq5+9k6KM4HaKJ4D+CpmZtZQ+uXIzNzU2dkZvb29zW6Gmdm8IulQRHROrufu2jtmZlaZQ9/MLCEOfTOzhDj0zcwSksurbPrG6GZm5eUu9H1jdDOzynI3vOMbo5uZVZa70PeN0c3MKstd6PvG6GZmleUu9H1jdDOzynJ3Itc3Rjczqyx3oQ++MbqZWSW5G94xM7PKHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkIuGvqTPSDot6bGS2iJJ+yUdz56vKXmvW1K/pD5JXSX110o6kr33cUmqf3fMzGw6l3Kk/1lg3aTaVuBARKwCDmSvkbQG2AjckG1zr6SJC+F8AtgMrMoekz/TzMxm2UVDPyK+BTw7qbwe2Jkt7wQ2lNR3RcTZiDgB9ANrJS0Fro6I70ZEAJ8r2cbMzBqk2jH9ayPiFED2vCSrdwAnS9YbyGod2fLkelmSNkvqldQ7PDxcZRPNzGyyep/ILTdOH9PUy4qIHRHRGRGd7e3tdWucmVnqqg39Z7IhG7Ln01l9AFhest4yYCirLytTNzOzBqo29PcBm7LlTcADJfWNkhZIWknxhO3BbAjoOUk3ZbN23lGyjZmZNchFr6cv6X7gFmCxpAHgg8DdwG5JdwBPA7cDRMRRSbuBx4Ex4M6ImLhL+TspzgRqA76aPczMrIFUnEwzd3V2dkZvb2+zm2FmNq9IOhQRnZPr/otcM7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS8hF/zhrPtp7eJDtPX0MnRnhuoVtbOlazYYbK17fzcwsGbkL/b2HB+ne
c4SR0eIfAg+eGaF7zxEAB7+ZJS93wzvbe/rOB/6EkdFxtvf0NalFZmZzR+5Cf+jMyIzqZmYpyV3oX7ewbUZ1M7OU5C70t3Stpq21cEGtrbXAlq7VTWqRmdnckbsTuRMnaz17x8xsqtyFPhSD3yFvZjZVLkPf8/TNzMrLXeh7nr6ZWWW5O5HrefpmZpXlLvQ9T9/MrLLchb7n6ZuZVZa70Pc8fTOzymoKfUnvkXRU0mOS7pd0haRFkvZLOp49X1Oyfrekfkl9krpqb/5UG27s4MNvewUdC9sQ0LGwjQ+/7RU+iWtmBigiqttQ6gC+DayJiBFJu4GHgDXAsxFxt6StwDUR8X5Ja4D7gbXAdcB/AL8aEeMVdgFAZ2dn9Pb2VtVGM7NUSToUEZ2T67UO77QAbZJagBcAQ8B6YGf2/k5gQ7a8HtgVEWcj4gTQT/ELwMzMGqTq0I+IQeCfgaeBU8D/RsTXgGsj4lS2zilgSbZJB3Cy5CMGstoUkjZL6pXUOzw8XG0TzcxskqpDPxurXw+spDhcc6Wkt0+3SZla2bGliNgREZ0R0dne3l5tE83MbJJahnfeCJyIiOGIGAX2AL8GPCNpKUD2fDpbfwBYXrL9MorDQWZm1iC1hP7TwE2SXiBJwK3AMWAfsClbZxPwQLa8D9goaYGklcAq4GAN+zczsxmq+to7EfGwpC8CPwDGgMPADuAqYLekOyh+MdyerX80m+HzeLb+nRebuWNmZvVV9ZTNRvGUTTOzmZutKZtmZjaPOPTNzBLi0DczS4hD38wsIQ59M7OE5O52iRN8n1wzs6lyGfq+T66ZWXm5HN7xfXLNzMrLZegP+j65ZmZl5S709x4eLHs5T/B9cs3Mchf623v6yl6vWeD75JpZ8nIX+pWGcAKfxDUzy13oVxrC6fDQjplZ/kJ/S9dq2loLF9TaWgse2jEzI4fz9CeGcPyHWWZmU+Uu9KEY/A55M7Opcje8Y2ZmlTn0zcwS4tA3M0uIQ9/MLCEOfTOzhNQU+pIWSvqipCckHZP0ekmLJO2XdDx7vqZk/W5J/ZL6JHXV3nwzM5uJWo/0Pwb8e0S8FHgVcAzYChyIiFXAgew1ktYAG4EbgHXAvZIKZT/VzMxmRdWhL+lq4A3ApwEi4vmIOAOsB3Zmq+0ENmTL64FdEXE2Ik4A/cDaavdvZmYzV8uR/ouBYeBfJR2W9ClJVwLXRsQpgOx5SbZ+B3CyZPuBrDaFpM2SeiX1Dg8P19BEMzMrVUvotwCvAT4RETcCPyMbyqmg3GXuy10FmYjYERGdEdHZ3t5eQxPNzKxULaE/AAxExMPZ6y9S/BJ4RtJSgOz5dMn6y0u2XwYM1bB/MzOboapDPyL+GzgpaeLylbcCjwP7gE1ZbRPwQLa8D9goaYGklcAq4GC1+zczs5mr9YJrfwncJ+ly4Engjyl+keyWdAfwNHA7QEQclbSb4hfDGHBnRIyX/1gzM5sNNYV+RDwCdJZ569YK628DttWyTzMzq14uL6289/Cgr6dvZlZG7kJ/7+FBuvccYWS0OHI0eGaE7j1HAN8j18wsd9fe2d7Tdz7wJ4yMjrO9p69JLTIzmztyF/pDZ0ZmVDczS0nuQv+6hW0zqpuZpSR3ob+lazVtrRdex62ttcCWrtUVtjAzS0fuTuROnKz17B0zs6lyF/pQDH6HvJnZVLkMfc/TNzMrL3eh73n6ZmaV5e5Erufpm5lVlrvQ9zx9M7PKchf6nqdvZlZZ7kLf8/TNzCrL3Ylcz9M3M6ssd0f6ZmZWWe6O9PceHmTLFx5l9FzxnuuDZ0bY8oVHAU/ZNDPL3ZH+h/YdPR/4E0bPBR/ad7RJLTIzmztyF/pnRkZnVDczS0nuQt/MzCpz6JuZJaTm0JdUkHRY0oPZ60WS9ks6nj1fU7Jut6R+SX2Sumrdt5mZzUw9jvTfBRwreb0VOBARq4AD2WskrQE2AjcA64B7JRUwM7OGqSn0JS0D3gx8qqS8HtiZLe8ENpTU
d0XE2Yg4AfQDa2vZv5mZzUytR/r3AO8DzpXUro2IUwDZ85Ks3gGcLFlvIKtNIWmzpF5JvcPDwzU20czMJlQd+pLeApyOiEOXukmZWpSpERE7IqIzIjrb29urbaKZmU1Sy1/k3gy8VdJtwBXA1ZI+DzwjaWlEnJK0FDidrT8ALC/ZfhkwVMP+zcxshqo+0o+I7ohYFhErKJ6g/XpEvB3YB2zKVtsEPJAt7wM2SlogaSWwCjhYdcvNzGzGZuPaO3cDuyXdATwN3A4QEUcl7QYeB8aAOyNivPLHmJlZvdUl9CPim8A3s+X/AW6tsN42YFs99mlmZjPnv8g1M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0tIUqG/YutX+MNPfrfZzTAza5qkQh/gOz9+1sFvZslKLvShGPxmZimqOvQlLZf0DUnHJB2V9K6svkjSfknHs+drSrbpltQvqU9SVz06YGZml66WI/0x4K8i4mXATcCdktYAW4EDEbEKOJC9JntvI3ADsA64V1KhlsabmdnMVB36EXEqIn6QLT8HHAM6gPXAzmy1ncCGbHk9sCsizkbECaAfWFvt/mtx80sWNWO3ZmZNV5cxfUkrgBuBh4FrI+IUFL8YgCXZah3AyZLNBrJauc/bLKlXUu/w8HA9mnjezS9ZxH1/9vq6fqaZ2XxRc+hLugr4EvDuiPjpdKuWqUW5FSNiR0R0RkRne3t7rU087zLB7Z3X1+3zzMzmm5pCX1IrxcC/LyL2ZOVnJC3N3l8KnM7qA8Dyks2XAUO17H+mzgVs7+lr5C7NzOaUWmbvCPg0cCwiPlry1j5gU7a8CXigpL5R0gJJK4FVwMFq91+twTMjjd6lmdmc0VLDtjcDfwQckfRIVvsb4G5gt6Q7gKeB2wEi4qik3cDjFGf+3BkR4zXsvyoFlRtlMjNLQ9WhHxHfpvw4PcCtFbbZBmyrdp/1MB7Biq1foSDx+69bzj9ueEUzm2Nm1lBJ/kUuFMP/8997mg/sPdLsppiZNUyyoT/h/odPXnwlM7OcSD70x6PsrFEzs1xKPvTNzFLi0AdfatnMkuHQx5daNrN0OPTNzBLi0DczS4hDH7i84L/SNbM0OPSB0XFP2zSzNDj0gSta/WMwszQ47YCzY+ea3QQzs4Zw6FO8zr6ZWQoc+vhyy2aWDoc+8OL2FzS7CWZmDVHLTVRyo//0z3hJ90OMR/g6+2aWaz7Sp3h39omrbfo6+2aWZw79CnydfTPLI4d+Bb7OvpnlkUPfzCwhPpE7jRVbvwLAPb/3ajbc2NHk1pjNfxP/pko9dfeb67Y+wEvveohflFxa5YqCeGLbbTNoZXO96aPf5Pjpn51/vWrJlex/7y11+3xFg4cxJK0DPgYUgE9FxN3Trd/Z2Rm9vb2X/PkT/5Ncdfbn/OaThxgrFBi9rIWxywqMForPLxg9S+HcOGdbLuf5QgtnWy7n561X8NMFVzKWrTN6Wcv5bSmZx3+x/+HMrLxyAT6h3L+rma4PUwN/wnwJ/smBP6Ga4Jd0KCI6J9cbeqQvqQD8C/AmYAD4vqR9EfF4vff1ud1/y2uG+ur9sfCR+n9kQxUK0Nqaj8dlHp20C5UL/Onqc025wJ+uXo1GD++sBfoj4kkASbuA9UDdQ/+vb3sP7/3Pz9N6boyWc+O0jhefW86NsXag/O5OX3lNcZ3xMVqzdVvPjde7ac01Pl58/OIXzW6JlZKa/yVar0ehcMFvxza3NDr0O4DSuZADwOsmryRpM7AZ4Prrr69qR0++aBl/sWFrVdtezLwd4okoBv7oaPnH889Xfm8uPvIkovjzf/75Zrdk1jw13ZtlfoOe6frVbnNRDfzC/NODxxm7rIWxQgvf+ZVXcWJR/c8lNjr0y339T/m9KyJ2ADugOKY/241KhgQtLcVHW1uzW2Ol
pvsynm+Pczm7am0DDzI+MOn1ivc/WPd9NDr0B4DlJa+XAUMNboPZ3FMoFB9XXNHslsyaRpzIrWabaZ07B2NjDfvC/OTX+84PRz/4st+YeXsvQUNn70hqAX4E3AoMAt8H/iAijlbaZqazd2D6//C1mrdDO2ZzQCOmbFazzVxSr/ZXmr3TjCmbtwH3UJyy+ZmI2Dbd+tWEPsDNd3+dwTMjF13vyssLHP2HdTP+fDOzuWxOTNkEiIiHgIdmez9DlxD4ba0Ftv2ur6ZpZunI7UTn6xaWP1FZkBDQsbCND7/tFf5LWzNLSm4vw7ClazXde44wMvrLefZtrQUHvZklLbehPxHs23v6GDozwnUL29jStdqBb2ZJy23oQzH4J4f83sOD/iIws2TlOvQn23t48IIhn8EzI3TvKd4hy8FvZinI7Ynccrb39F0wxg8wMjrO9p5ZuDCbmdkclFToV5rGeSnTO83M8iCp0K80jbNS3cwsb5IK/S1dq2lrLVxQa2stsKVrdZNaZGbWWEmdyPU0TjNLXVKhD+WncZqZpSKp4R0zs9Q59M3MEuLQNzNLiEPfzCwhDn0zs4Q0/M5ZMyVpGPivKjdfDPykjs2ZD9znNKTW59T6C7X1+ScAETHltoBzPvRrIam33O3C8sx9TkNqfU6tvzB7ffbwjplZQhz6ZmYJyXvo72h2A5rAfU5Dan1Orb8wS33O9Zi+mZldKO9H+mZmVsKhb2aWkFyGvqR1kvok9Uva2uz21Iukz0g6LemxktoiSfslHc+eryl5rzv7GfRJ6mpOq2sjabmkb0g6JumopHdl9dz2W9IVkg5KejTr899n9dz2GUBSQdJhSQ9mr3PdXwBJT0k6IukRSb1ZbXb7HRG5egAF4MfAi4HLgUeBNc1uV5369gbgNcBjJbV/ArZmy1uBj2TLa7K+LwBWZj+TQrP7UEWflwKvyZZfCPwo61tu+w0IuCpbbgUeBm7Kc5+zfrwX+Dfgwex1rvub9eUpYPGk2qz2O49H+muB/oh4MiKeB3YB65vcprqIiG8Bz04qrwd2Zss7gQ0l9V0RcTYiTgD9FH8280pEnIqIH2TLzwHHgA5y3O8o+r/sZWv2CHLcZ0nLgDcDnyop57a/FzGr/c5j6HcAJ0teD2S1vLo2Ik5BMSCBJVk9dz8HSSuAGyke+ea639lQxyPAaWB/ROS9z/cA7wPOldTy3N8JAXxN0iFJm7ParPY7j3fOUplaivNSc/VzkHQV8CXg3RHxU6lc94qrlqnNu35HxDjwakkLgS9Levk0q8/rPkt6C3A6Ig5JuuVSNilTmzf9neTmiBiStATYL+mJadatS7/zeKQ/ACwveb0MGGpSWxrhGUlLAbLn01k9Nz8HSa0UA/++iNiTlXPfb4CIOAN8E1hHfvt8M/BWSU9RHI79LUmfJ7/9PS8ihrLn08CXKQ7XzGq/8xj63wdWSVop6XJgI7CvyW2aTfuATdnyJuCBkvpGSQskrQRWAQeb0L6aqHhI/2ngWER8tOSt3PZbUnt2hI+kNuCNwBPktM8R0R0RyyJiBcV/r1+PiLeT0/5OkHSlpBdOLAO/DTzGbPe72WevZ+mM+G0UZ3n8GLir2e2pY7/uB04BoxS/9e8AXgQcAI5nz4tK1r8r+xn0Ab/T7PZX2edfp/gr7A+BR7LHbXnuN/BK4HDW58eAv8vque1zST9u4Zezd3LdX4ozDB/NHkcnsmq2++3LMJiZJSSPwztmZlaBQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhPw/jznwxD6w4A0AAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "def gen_y(x_list, a, b):\n",
+ " y_gen = []\n",
+ " for x_i in x_list:\n",
+ " y_i = a + b*x_i\n",
+ " y_gen.append(y_i)\n",
+ "\n",
+ " return(y_gen)\n",
+ "\n",
+ "y_gen2 = gen_y(X, a, b)\n",
+ "\n",
+ "plt.scatter(X, Y)\n",
+ "plt.plot(X, y_gen2, color='red')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "#### Assess our results\n",
+    "### Results if we follow method 2: Least Squares Method\n",
+ "#### Show our calculations\n",
+ "#### Plot our results\n",
+ "#### Assess our results\n",
+ "### Results if we build our model using sklearn:\n",
+ "#### Show our calculations\n",
+ "#### Plot our results\n",
+ "#### Assess our results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Conclusion "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### What we accomplished. \n",
+ "### What we learnt.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/.ipynb_checkpoints/Linear Regression team TS2-checkpoint.ipynb b/.ipynb_checkpoints/Linear Regression team TS2-checkpoint.ipynb
new file mode 100644
index 0000000..b2b7ac9
--- /dev/null
+++ b/.ipynb_checkpoints/Linear Regression team TS2-checkpoint.ipynb
@@ -0,0 +1,2115 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Regression Model that Predicts Apple Prices Based on Historical Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Table Of Contents\n",
+ "### 1. [Introduction](#introduction)\n",
+ "\n",
+ " 1. Objective\n",
+ " 2. Parameters\n",
+ " 3. Outline\n",
+ " \n",
+ "### 2. [Importing Data and Plotting](#import)\n",
+ "\n",
+ " 1. Import necessary packages\n",
+ " 2. Import the data into a Pandas Dataframe\n",
+ " 3. Show the data\n",
+    "    4. Make an initial plot of the data\n",
+ "\n",
+ "### 3. [Exploratory Data Analysis](#explore)\n",
+ "### 4. [Split Data: Testing and training](#split)\n",
+ "### 5. [Outliers](#outliers)\n",
+ "### 6. [Regression Model](#regress)\n",
+ " 1. Taking estimates\n",
+ " 2. Least squares\n",
+ " 3. sklearn\n",
+ "### 7. [Conclusion](#conclude)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Introduction "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.1. Objective \n",
+ "In this notebook we will design a regression model that will predict the cost of apples based on given parameters."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.2. Parameters\n",
+ "1. Month/Season\n",
+ "2. Distance travelled\n",
+ "3. Supplier cost\n",
+ "4. Grade of apple\n",
+ "5. Demand and Supply\n",
+ "6. Container used?\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Importing Data and Plotting "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.1. Import necessary packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# These packages will be mainly used for data wrangling\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# These packages will be mostly used for plotting the data\n",
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# These packages will be mostly used to build our linear regression model so that we can make predictions from it.\n",
+ "import statsmodels as sm\n",
+ "import sklearn as skl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.2. Import the data into a Pandas Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#import train and test sets into DataFrames\n",
+ "sample_submission = pd.DataFrame(pd.read_csv(\"sample_submission.csv\"))\n",
+ "test_set = pd.DataFrame(pd.read_csv(\"df-test_set.csv\"))\n",
+ "train_set = pd.DataFrame(pd.read_csv(\"df-train_set.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Filter for Commodity of interest in train set\n",
+ "train_set = train_set[train_set[\"Commodities\"]==\"APPLE GOLDEN DELICIOUS\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.3. Show the data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.1 Showing the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-09 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 8.51 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " CAPE | \n",
+ " JG110 | \n",
+ " 2M | \n",
+ " 11.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-14 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " JE090 | \n",
+ " 2S | \n",
+ " 9.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-16 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 6.11 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1S | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 4.51 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " EASTERN CAPE | \n",
+ " IA400 | \n",
+ " 1S | \n",
+ " 400.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-28 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg \\\n",
+ "1 CAPE M4183 1L 18.3 \n",
+ "7 CAPE JG110 2M 11.0 \n",
+ "24 W.CAPE-BERGRIVER ETC JE090 2S 9.0 \n",
+ "40 CAPE M4183 1S 18.3 \n",
+ "69 EASTERN CAPE IA400 1S 400.0 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "1 APPLE GOLDEN DELICIOUS 2020-09-09 150.0 170.0 51710.0 \n",
+ "7 APPLE GOLDEN DELICIOUS 2020-04-14 50.0 50.0 16000.0 \n",
+ "24 APPLE GOLDEN DELICIOUS 2020-04-16 55.0 55.0 990.0 \n",
+ "40 APPLE GOLDEN DELICIOUS 2020-05-04 80.0 120.0 32020.0 \n",
+ "69 APPLE GOLDEN DELICIOUS 2020-09-28 1800.0 1800.0 1800.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "1 332 6075.6 822 8.51 \n",
+ "7 320 3520.0 0 4.55 \n",
+ "24 18 162.0 1506 6.11 \n",
+ "40 388 7100.4 443 4.51 \n",
+ "69 1 400.0 2 4.50 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our train_set dataframe.\n",
+ "train_set.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1952, 13)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "#The dataframe has 1952 rows and 13 columns after filtering for APPLE GOLDEN DELICIOUS.\n",
+ "train_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 1952 entries, 1 to 64310\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Province 1952 non-null object \n",
+ " 1 Container 1952 non-null object \n",
+ " 2 Size_Grade 1952 non-null object \n",
+ " 3 Weight_Kg 1952 non-null float64\n",
+ " 4 Commodities 1952 non-null object \n",
+ " 5 Date 1952 non-null object \n",
+ " 6 Low_Price 1952 non-null float64\n",
+ " 7 High_Price 1952 non-null float64\n",
+ " 8 Sales_Total 1952 non-null float64\n",
+ " 9 Total_Qty_Sold 1952 non-null int64 \n",
+ " 10 Total_Kg_Sold 1952 non-null float64\n",
+ " 11 Stock_On_Hand 1952 non-null int64 \n",
+ " 12 avg_price_per_kg 1952 non-null float64\n",
+ "dtypes: float64(6), int64(2), object(5)\n",
+ "memory usage: 213.5+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "train_set.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "The info summary above shows 1952 entries with the following data types: six float columns, two integer columns and five object columns. All columns show zero null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1M | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-07-09 | \n",
+ " 128.0 | \n",
+ " 136.0 | \n",
+ " 5008.0 | \n",
+ " 38 | \n",
+ " 456.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1X | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-01-20 | \n",
+ " 220.0 | \n",
+ " 220.0 | \n",
+ " 1760.0 | \n",
+ " 8 | \n",
+ " 146.4 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1S | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-08-19 | \n",
+ " 120.0 | \n",
+ " 120.0 | \n",
+ " 720.0 | \n",
+ " 6 | \n",
+ " 72.0 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1M | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-06 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 140.0 | \n",
+ " 160.0 | \n",
+ " 14140.0 | \n",
+ " 100 | \n",
+ " 1830.0 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Province Container Size_Grade Weight_Kg \\\n",
+ "0 1 W.CAPE-BERGRIVER ETC EC120 1M 12.0 \n",
+ "1 2 W.CAPE-BERGRIVER ETC M4183 1X 18.3 \n",
+ "2 3 W.CAPE-BERGRIVER ETC EC120 1S 12.0 \n",
+ "3 4 W.CAPE-BERGRIVER ETC M4183 1M 18.3 \n",
+ "4 5 W.CAPE-BERGRIVER ETC M4183 1L 18.3 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "0 APPLE GOLDEN DELICIOUS 2020-07-09 128.0 136.0 5008.0 \n",
+ "1 APPLE GOLDEN DELICIOUS 2020-01-20 220.0 220.0 1760.0 \n",
+ "2 APPLE GOLDEN DELICIOUS 2020-08-19 120.0 120.0 720.0 \n",
+ "3 APPLE GOLDEN DELICIOUS 2020-05-06 160.0 160.0 160.0 \n",
+ "4 APPLE GOLDEN DELICIOUS 2020-05-04 140.0 160.0 14140.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "0 38 456.0 0 \n",
+ "1 8 146.4 2 \n",
+ "2 6 72.0 45 \n",
+ "3 1 18.3 8 \n",
+ "4 100 1830.0 19 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our test_set dataframe.\n",
+ "test_set.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 40.460912 | \n",
+ " 174.307377 | \n",
+ " 215.648053 | \n",
+ " 20053.533811 | \n",
+ " 174.510758 | \n",
+ " 2960.176332 | \n",
+ " 408.393955 | \n",
+ " 6.778893 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 99.655169 | \n",
+ " 373.553578 | \n",
+ " 433.546159 | \n",
+ " 39005.069445 | \n",
+ " 308.810797 | \n",
+ " 6097.416527 | \n",
+ " 724.450582 | \n",
+ " 2.248744 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 3.000000 | \n",
+ " 2.000000 | \n",
+ " 5.000000 | \n",
+ " 5.000000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 0.250000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 60.000000 | \n",
+ " 1325.000000 | \n",
+ " 12.000000 | \n",
+ " 219.600000 | \n",
+ " 9.000000 | \n",
+ " 5.460000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 108.000000 | \n",
+ " 5495.000000 | \n",
+ " 64.000000 | \n",
+ " 853.500000 | \n",
+ " 126.500000 | \n",
+ " 6.670000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 18.300000 | \n",
+ " 127.250000 | \n",
+ " 160.000000 | \n",
+ " 21082.500000 | \n",
+ " 200.000000 | \n",
+ " 3093.525000 | \n",
+ " 468.000000 | \n",
+ " 8.280000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 400.000000 | \n",
+ " 2300.000000 | \n",
+ " 3300.000000 | \n",
+ " 369464.000000 | \n",
+ " 4237.000000 | \n",
+ " 74000.000000 | \n",
+ " 6400.000000 | \n",
+ " 21.240000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 1952.000000 1952.000000 1952.000000 1952.000000 1952.000000 \n",
+ "mean 40.460912 174.307377 215.648053 20053.533811 174.510758 \n",
+ "std 99.655169 373.553578 433.546159 39005.069445 308.810797 \n",
+ "min 3.000000 2.000000 5.000000 5.000000 1.000000 \n",
+ "25% 9.000000 50.000000 60.000000 1325.000000 12.000000 \n",
+ "50% 12.000000 80.000000 108.000000 5495.000000 64.000000 \n",
+ "75% 18.300000 127.250000 160.000000 21082.500000 200.000000 \n",
+ "max 400.000000 2300.000000 3300.000000 369464.000000 4237.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 1952.000000 1952.000000 1952.000000 \n",
+ "mean 2960.176332 408.393955 6.778893 \n",
+ "std 6097.416527 724.450582 2.248744 \n",
+ "min 3.000000 0.000000 0.250000 \n",
+ "25% 219.600000 9.000000 5.460000 \n",
+ "50% 853.500000 126.500000 6.670000 \n",
+ "75% 3093.525000 468.000000 8.280000 \n",
+ "max 74000.000000 6400.000000 21.240000 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " avg_price_per_kg | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " False | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg Commodities \\\n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 1952 1952 \n",
+ "\n",
+ " Date Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 1952 1952 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Counting the Number of rows with INF\n",
+ "train_set.groupby(np.isinf(train_set['avg_price_per_kg'])).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 40.460912 | \n",
+ " 174.307377 | \n",
+ " 215.648053 | \n",
+ " 20053.533811 | \n",
+ " 174.510758 | \n",
+ " 2960.176332 | \n",
+ " 408.393955 | \n",
+ " 6.778893 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 99.655169 | \n",
+ " 373.553578 | \n",
+ " 433.546159 | \n",
+ " 39005.069445 | \n",
+ " 308.810797 | \n",
+ " 6097.416527 | \n",
+ " 724.450582 | \n",
+ " 2.248744 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 3.000000 | \n",
+ " 2.000000 | \n",
+ " 5.000000 | \n",
+ " 5.000000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 0.250000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 60.000000 | \n",
+ " 1325.000000 | \n",
+ " 12.000000 | \n",
+ " 219.600000 | \n",
+ " 9.000000 | \n",
+ " 5.460000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 108.000000 | \n",
+ " 5495.000000 | \n",
+ " 64.000000 | \n",
+ " 853.500000 | \n",
+ " 126.500000 | \n",
+ " 6.670000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 18.300000 | \n",
+ " 127.250000 | \n",
+ " 160.000000 | \n",
+ " 21082.500000 | \n",
+ " 200.000000 | \n",
+ " 3093.525000 | \n",
+ " 468.000000 | \n",
+ " 8.280000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 400.000000 | \n",
+ " 2300.000000 | \n",
+ " 3300.000000 | \n",
+ " 369464.000000 | \n",
+ " 4237.000000 | \n",
+ " 74000.000000 | \n",
+ " 6400.000000 | \n",
+ " 21.240000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 1952.000000 1952.000000 1952.000000 1952.000000 1952.000000 \n",
+ "mean 40.460912 174.307377 215.648053 20053.533811 174.510758 \n",
+ "std 99.655169 373.553578 433.546159 39005.069445 308.810797 \n",
+ "min 3.000000 2.000000 5.000000 5.000000 1.000000 \n",
+ "25% 9.000000 50.000000 60.000000 1325.000000 12.000000 \n",
+ "50% 12.000000 80.000000 108.000000 5495.000000 64.000000 \n",
+ "75% 18.300000 127.250000 160.000000 21082.500000 200.000000 \n",
+ "max 400.000000 2300.000000 3300.000000 369464.000000 4237.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 1952.000000 1952.000000 1952.000000 \n",
+ "mean 2960.176332 408.393955 6.778893 \n",
+ "std 6097.416527 724.450582 2.248744 \n",
+ "min 3.000000 0.000000 0.250000 \n",
+ "25% 219.600000 9.000000 5.460000 \n",
+ "50% 853.500000 126.500000 6.670000 \n",
+ "75% 3093.525000 468.000000 8.280000 \n",
+ "max 74000.000000 6400.000000 21.240000 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "#Replace INF values with NaN, then drop every row that contains a NaN\n",
+ "train_set = train_set.replace([np.inf, -np.inf], np.nan)\n",
+ "\n",
+ "train_set = train_set.dropna(axis = 0)\n",
+ "\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.2. Showing the testing data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(685, 13)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The dataframe has 685 rows and 13 columns.\n",
+ "test_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 685 entries, 0 to 684\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Index 685 non-null int64 \n",
+ " 1 Province 685 non-null object \n",
+ " 2 Container 685 non-null object \n",
+ " 3 Size_Grade 685 non-null object \n",
+ " 4 Weight_Kg 685 non-null float64\n",
+ " 5 Commodities 685 non-null object \n",
+ " 6 Date 685 non-null object \n",
+ " 7 Low_Price 685 non-null float64\n",
+ " 8 High_Price 685 non-null float64\n",
+ " 9 Sales_Total 685 non-null float64\n",
+ " 10 Total_Qty_Sold 685 non-null int64 \n",
+ " 11 Total_Kg_Sold 685 non-null float64\n",
+ " 12 Stock_On_Hand 685 non-null int64 \n",
+ "dtypes: float64(5), int64(3), object(5)\n",
+ "memory usage: 69.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "test_set.info() "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "The info summary above shows 685 entries with the following data types: five float columns, three integer columns and five object columns. All columns show zero null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 343.000000 | \n",
+ " 34.142482 | \n",
+ " 164.202891 | \n",
+ " 195.590073 | \n",
+ " 18788.111212 | \n",
+ " 174.883212 | \n",
+ " 2725.402336 | \n",
+ " 439.245255 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 197.886752 | \n",
+ " 87.575995 | \n",
+ " 355.167319 | \n",
+ " 389.109476 | \n",
+ " 33951.586813 | \n",
+ " 299.351142 | \n",
+ " 5059.123311 | \n",
+ " 715.985761 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 1.000000 | \n",
+ " 6.300000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 172.000000 | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 64.000000 | \n",
+ " 1300.000000 | \n",
+ " 13.000000 | \n",
+ " 204.000000 | \n",
+ " 20.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 343.000000 | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 112.000000 | \n",
+ " 5520.000000 | \n",
+ " 62.000000 | \n",
+ " 860.100000 | \n",
+ " 153.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 514.000000 | \n",
+ " 18.300000 | \n",
+ " 128.000000 | \n",
+ " 160.000000 | \n",
+ " 21176.000000 | \n",
+ " 200.000000 | \n",
+ " 3033.000000 | \n",
+ " 516.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 685.000000 | \n",
+ " 400.000000 | \n",
+ " 2400.000000 | \n",
+ " 2400.000000 | \n",
+ " 308010.000000 | \n",
+ " 2774.000000 | \n",
+ " 47200.000000 | \n",
+ " 6827.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Weight_Kg Low_Price High_Price Sales_Total \\\n",
+ "count 685.000000 685.000000 685.000000 685.000000 685.000000 \n",
+ "mean 343.000000 34.142482 164.202891 195.590073 18788.111212 \n",
+ "std 197.886752 87.575995 355.167319 389.109476 33951.586813 \n",
+ "min 1.000000 3.000000 10.000000 10.000000 10.000000 \n",
+ "25% 172.000000 9.000000 50.000000 64.000000 1300.000000 \n",
+ "50% 343.000000 12.000000 80.000000 112.000000 5520.000000 \n",
+ "75% 514.000000 18.300000 128.000000 160.000000 21176.000000 \n",
+ "max 685.000000 400.000000 2400.000000 2400.000000 308010.000000 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "count 685.000000 685.000000 685.000000 \n",
+ "mean 174.883212 2725.402336 439.245255 \n",
+ "std 299.351142 5059.123311 715.985761 \n",
+ "min 1.000000 6.300000 0.000000 \n",
+ "25% 13.000000 204.000000 20.000000 \n",
+ "50% 62.000000 860.100000 153.000000 \n",
+ "75% 200.000000 3033.000000 516.000000 \n",
+ "max 2774.000000 47200.000000 6827.000000 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "test_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "After viewing our data, we also viewed the sample submission to confirm that our response variable is the column required by the submission file on Kaggle."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 13.94 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.30 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index avg_price_per_kg\n",
+ "0 1 13.94\n",
+ "1 2 1.30"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.4. Make an initial plot of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Exploratory Data Analysis \n",
+ "Make at least 7 plots."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.1. Explore the data shape and types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2. Look for null values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.3. Is the data univariate or multivariate?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.4. Determine kurtosis and skew"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.5. Consider the distribution of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.6. Look for correlation of multivariate data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Testing the suitability of the data\n",
+ "### 4.1. Testing for linearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.2. Testing for multicollinearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.3. Testing for independence"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.4. Testing for homoscedasticity\n",
+ "Does the magnitude of the residuals increase as the fitted values increase? If it does, the residual plot takes on a cone shape, which is called heteroscedasticity. We don’t want that."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.5. Testing for normality"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.6. Check for outliers in residuals \n",
+ "#### Plot Cook’s distance"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Transforming the data to be most suitable to use for building a multivariate linear regression model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 5.1. Transforming categorical data to numerical data "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Convert categorical data to numerical data\n",
+ "train=pd.get_dummies(train_set,drop_first=True)\n",
+ "\n",
+ "#Replace spaces in column names with underscores\n",
+ "train.columns = train.columns.str.replace(' ','_')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ " Province_EASTERN_CAPE | \n",
+ " Province_NATAL | \n",
+ " ... | \n",
+ " Date_2020-09-07 | \n",
+ " Date_2020-09-09 | \n",
+ " Date_2020-09-16 | \n",
+ " Date_2020-09-17 | \n",
+ " Date_2020-09-19 | \n",
+ " Date_2020-09-21 | \n",
+ " Date_2020-09-23 | \n",
+ " Date_2020-09-28 | \n",
+ " Date_2020-10-01 | \n",
+ " Date_2020-10-03 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 8.51 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11.0 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 9.0 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 6.11 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 18.3 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 4.51 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 400.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 4.50 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 179 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "1 18.3 150.0 170.0 51710.0 332 \n",
+ "7 11.0 50.0 50.0 16000.0 320 \n",
+ "24 9.0 55.0 55.0 990.0 18 \n",
+ "40 18.3 80.0 120.0 32020.0 388 \n",
+ "69 400.0 1800.0 1800.0 1800.0 1 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg Province_EASTERN_CAPE \\\n",
+ "1 6075.6 822 8.51 0 \n",
+ "7 3520.0 0 4.55 0 \n",
+ "24 162.0 1506 6.11 0 \n",
+ "40 7100.4 443 4.51 0 \n",
+ "69 400.0 2 4.50 1 \n",
+ "\n",
+ " Province_NATAL ... Date_2020-09-07 Date_2020-09-09 Date_2020-09-16 \\\n",
+ "1 0 ... 0 1 0 \n",
+ "7 0 ... 0 0 0 \n",
+ "24 0 ... 0 0 0 \n",
+ "40 0 ... 0 0 0 \n",
+ "69 0 ... 0 0 0 \n",
+ "\n",
+ " Date_2020-09-17 Date_2020-09-19 Date_2020-09-21 Date_2020-09-23 \\\n",
+ "1 0 0 0 0 \n",
+ "7 0 0 0 0 \n",
+ "24 0 0 0 0 \n",
+ "40 0 0 0 0 \n",
+ "69 0 0 0 0 \n",
+ "\n",
+ " Date_2020-09-28 Date_2020-10-01 Date_2020-10-03 \n",
+ "1 0 0 0 \n",
+ "7 0 0 0 \n",
+ "24 0 0 0 \n",
+ "40 0 0 0 \n",
+ "69 1 0 0 \n",
+ "\n",
+ "[5 rows x 179 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1952, 179)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols = list(train)\n",
+ "#Move column to last index\n",
+ "cols.insert(len(cols)-1,cols.pop(cols.index(\"avg_price_per_kg\")))\n",
+ "\n",
+ "#Reorder the columns so the response variable is last\n",
+ "train = train.loc[:, cols]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " Province_EASTERN_CAPE | \n",
+ " Province_NATAL | \n",
+ " Province_ORANGE_FREE_STATE | \n",
+ " ... | \n",
+ " Date_2020-09-09 | \n",
+ " Date_2020-09-16 | \n",
+ " Date_2020-09-17 | \n",
+ " Date_2020-09-19 | \n",
+ " Date_2020-09-21 | \n",
+ " Date_2020-09-23 | \n",
+ " Date_2020-09-28 | \n",
+ " Date_2020-10-01 | \n",
+ " Date_2020-10-03 | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.51 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11.0 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 9.0 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6.11 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 18.3 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.51 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 400.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 179 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "1 18.3 150.0 170.0 51710.0 332 \n",
+ "7 11.0 50.0 50.0 16000.0 320 \n",
+ "24 9.0 55.0 55.0 990.0 18 \n",
+ "40 18.3 80.0 120.0 32020.0 388 \n",
+ "69 400.0 1800.0 1800.0 1800.0 1 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand Province_EASTERN_CAPE Province_NATAL \\\n",
+ "1 6075.6 822 0 0 \n",
+ "7 3520.0 0 0 0 \n",
+ "24 162.0 1506 0 0 \n",
+ "40 7100.4 443 0 0 \n",
+ "69 400.0 2 1 0 \n",
+ "\n",
+ " Province_ORANGE_FREE_STATE ... Date_2020-09-09 Date_2020-09-16 \\\n",
+ "1 0 ... 1 0 \n",
+ "7 0 ... 0 0 \n",
+ "24 0 ... 0 0 \n",
+ "40 0 ... 0 0 \n",
+ "69 0 ... 0 0 \n",
+ "\n",
+ " Date_2020-09-17 Date_2020-09-19 Date_2020-09-21 Date_2020-09-23 \\\n",
+ "1 0 0 0 0 \n",
+ "7 0 0 0 0 \n",
+ "24 0 0 0 0 \n",
+ "40 0 0 0 0 \n",
+ "69 0 0 0 0 \n",
+ "\n",
+ " Date_2020-09-28 Date_2020-10-01 Date_2020-10-03 avg_price_per_kg \n",
+ "1 0 0 0 8.51 \n",
+ "7 0 0 0 4.55 \n",
+ "24 0 0 0 6.11 \n",
+ "40 0 0 0 4.51 \n",
+ "69 1 0 0 4.50 \n",
+ "\n",
+ "[5 rows x 179 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Building the regression models \n",
+ "### Method 1: A simple linear regression model following the Least Squares Method\n",
+ "#### Calculating the slope and the intercept "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Slope = -0.0076244934807224145\n",
+ "Intercept = 7.0873874015160885\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Imported regression model\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "\n",
+ "#Split predictors and response\n",
+ "X = train_set['Weight_Kg']\n",
+ "Y = train_set['avg_price_per_kg']\n",
+ "\n",
+ "#Calculating x bar, y bar\n",
+ "x_bar = np.mean(X)\n",
+ "y_bar = np.mean(Y)\n",
+ "\n",
+ "#Calculating Slope\n",
+ "b = sum((X-x_bar) * (Y-y_bar)) / sum((X-x_bar)**2)\n",
+ "\n",
+ "#Calculating intercept\n",
+ "a = y_bar - b*x_bar\n",
+ "\n",
+ "print(\"Slope = \" + str(b))\n",
+ "print(\"Intercept = \" + str(a))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Plot our regression line on a scatter plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXQklEQVR4nO3df3BV5Z3H8c+XyxUD/ggIYkjBgCIqBkQz6qy7rrbV+GOrKbWtbd060864f7QzdbrLFlpna3fd0d3suv1nZ2ft1pZpra1TMWptN2Ws1W6nxYYmEChGVFC8QYhKLGIKITz7xzmBJPdXbu6Pc55736+ZZ3LvycO93xzjJ0++OT/MOScAgH+mRV0AAGBqCHAA8BQBDgCeIsABwFMEOAB4anol32zu3Lmuqampkm8JAN7bvHnzW865eRO3VzTAm5qa1NXVVcm3BADvmdlrmbbTQgEATxHgAOApAhwAPEWAA4CnCHAA8FRFj0KphI7ulNo7+9Q/OKQF9XVa07pMbasaoy4LAEquqgK8ozuldRt6NTQ8IklKDQ5p3YZeSSLEAVSdqmqhtHf2HQ/vUUPDI2rv7IuoIgAon6oK8P7BoYK2A4DPqirAF9TXFbQdAHxWVQG+pnWZ6pKJcdvqkgmtaV0WUUUAUD5V9UfM0T9UchQKgFpQVQEuBSFOYAOoBVXVQgGAWkKAA4CnCHAA8BQBDgCeyhvgZrbQzJ41sx1mtt3MvhRun2NmG81sZ/hxdvnLBQCMmswK/Kikv3XOXSDpCklfMLMLJa2V9IxzbqmkZ8LnAIAKyRvgzrm9zrnfh48PStohqVHSLZLWh9PWS2orU40AgAwK6oGbWZOkVZI2SZrvnNsrBSEv6cws/+ZOM+sys66BgYEiywUAjJp0gJvZKZIek3SXc+6Pk/13zrkHnXMtzrmWefPmTaVGAEAGkwpwM0sqCO+HnXMbws37zKwh/HyDpP3lKREAkMlkjkIxSd+WtMM598CYTz0p6Y7w8R2Snih9eQCAbCZzLZQrJf21pF4z6wm3fVXS/ZIeNbPPS3pd0sfLUiEAIKO8Ae6c+z9JluXTHyptOQCAyeJMTADwFAEOAJ4iwAHAUwQ4AHiKAAcATxHgAOApAhwAPEWAA4CnCHAA8BQBDgCeIsABwFMEOAB4igAHAE8R4ADgKQIcADxFgAOApwhwAPAUAQ4AniLAAcBTBDgAeIoABwBPEeAA4CkCHAA8RYADgKcIcADwFAEOAJ4iwAHAUwQ4AHiKAAcATxHgAOApAhwAPEWAA4CnCHAA8BQBDgCeIsABwFPToy6g1Dq6U2rv7FP/4JAW1NdpTesyta1qjLosACi5qgrwju6U1m3o1dDwiCQpNTikdRt6JYkQB1B1qqqF0t7Zdzy8Rw0Nj6i9sy+iigCgfPIGuJk9ZGb7zWzbmG33mFnKzHrCcWN5y5yc/sGhgrYDgM8mswL/rqTrM2z/D+fcxeH4aWnLmpoF9XUFbQcAn+UNcOfc85LeqUAtRVvTukx1ycS4bXXJhNa0LouoIgAon2J64F80s61hi2V2tklmdqeZdZlZ18DAQBFvl1/bqkbdt7pZjfV1MkmN9XW6b3Uzf8AEUJXMOZd/klmTpJ845y4Kn8+X9JYkJ+mfJDU45z6X73VaWlpcV1dXUQUDQK0xs83OuZaJ26e0AnfO7XPOjTjnjkn6lqTLii0QAFCYKQW4mTWMefpRSduyzQUAlEfeE3nM7BFJV0uaa2ZvSPq6pKvN7GIFLZTdkv6mfCUW5toHfqmd+w8df770zFna+OWroysIAMpkUj3wUil3D3xieI8ixAH4LFsP3OtT6e/u6NUjm/ZoxDklzDSS5YdRplAHAN95G+B3d/Tq+799/fjzbOENANXK22uhPLJpT9QlAECkvA3wQlbc06dZGSsBgGh4G+AJm3wo/9vHV5axEgCIhrcB/qnLF2bcPjHWkwlW3wCqk7cBfm9bs26/YtHxlXjCTDOT
0zSxsTI84rgeOICq5O1RKFIQ4ve2NR9/vnjt0xnncT1wANXI2xV4JlwPHEAtqaoAX9O6LK3nnUwY1wMHUJWqKsAlKa0Jzvk9AKpUVQV4e2efho+NT+zhY/wRE0B1qqoA56bGAGpJVQU4f8QEUEuqKsC5qTGAWlJVAd62qlEfu7Rx3Mk9H7u0kZsaA6hKVRXgHd0pPbY5dfxCVyPO6bHNKXV0pyKuDABKr6oCvL2zT0PDI+O2DQ2PcBQKgKpUVQHOUSgAaonX10KZeEu1k5PTNDR8LG0eR6EAqEbeBnimW6oNDWc+7TLFChxAFfK2hcIt1QDUOm8DnJsYA6h13gY499kBUOu8DXDW3wBqnbcBXojbr1gUdQkAUHLeBngBN6VXy9lzylcIAETE2wBPTpt8gn/lsa1lrAQAouFtgB8ZmXwX/PDR9JN7AMB33gY4ANQ6AhwAPEWAA4CnCHAA8BQBDgCeIsABwFNeXU62ozul9s4+btAAAPIowDu6U1q3oTftlmkAUKu8aaFkut8lANSyvAFuZg+Z2X4z2zZm2xwz22hmO8OPs8tbJve1BICJJrMC/66k6ydsWyvpGefcUknPhM/LivtaAsB4eQPcOfe8pHcmbL5F0vrw8XpJbaUtK92a1mWqSybK/TYA4I2p9sDnO+f2SlL48cxsE83sTjPrMrOugYGBKb6d1LaqUfetblZjfR134wEAVeAoFOfcg5IelKSWlpaibqTTtqpRbasaJUlNa58uvjgA8NhUA3yfmTU45/aaWYOk/aUsKhuOAweAE6baQnlS0h3h4zskPVGacrIbPQ48NTjE/TABQJM7jPARSb+RtMzM3jCzz0u6X9K1ZrZT0rXh87LiOHAAGC9vC8U596ksn/pQiWvJibYJAIznzZmYHAcOAON5E+AcBw4A43lzMavRwwfv+lFPtIUAQEx4swKXToQ4AMCzAAcAnECAA4CnCHAA8BQBDgCe8i7A6+uSUZcAALHgXYDfc/Ny/4oGgDLwMgsTCa4IDgDeBXh7Z5+GR7geIQB4F+ApLmoFAJI8DHAAQIAABwBPEeAA4CkCHAA8RYADgKcIcADwlDc3dAAAH3V0p9Te2af+wSEtqK/TmtZlJbu3gT8B7pxknIEJwB8d3Smt29CroeERScF5LOs29EoqzQ1q/Ajw73xH+tznJEm7M3z6yLTpem7JJXp+8SV6bvGlen12Q0XLA4BM2jv7jof3qKHhEbV39tVGgN/d0avnfmd6TqZpynwK/UnHjural1/QtS+/kP2FvjlDuvji8aO5WZo1qwxVA4DUn+XM8WzbCxXrAL+7o1ff/+3r0mnztOQrT6V9fvrIUV2a2qFrXu3SX766WRcM7M7+YocPS5s2BSOf+fPTw37pUimRmNoXAqAmLaivy3j5jwX1dSV5fXOucheGamlpcV1dXZOev3jd0ypVebu/+hdSb6/U0zN+HD1a3Aufd1562J91Fv16AGk9cEmqSyZ03+rmglooZrbZOdcycXusV+Al/dly2mnSlVcGI9+b9vefCPgtW4KPO3dmnv/SS8F49NHcr3vSSeNDfuVKacUK6ZRTCv1KAHhiNKQ5CqVSzKTGxmDcdFPuuUePBuE9cVU/MJA+98gR6YUXgpHPvHnpq/rzzpOm858L8E3bqsaSBfZEsU6Emclpen/4WNRlZDd9unThhcH49Kdzzz14cHwLZ3Rlf+RI+tyBAWnjxmDks3Rpetg3NNDCAWpArHvgHd0pffnRHh0rQYm778+zmo4L56Q330xf1b/0UnGvO316etCvWCGdempxrwug7LzsgbetatRdP+qJuozKMgtW0A0N0g035J47MhL05ieG/b596XOPHpW6uoKRzxlnpIf9+efTwgFiJtb/RzatfTrqEuItkQiC9fzzpdtuyz330KHMR+EcPpw+9+23pWeeCUY+55yTHvaNjbRwgAqIdYCjhGbNkq64Ihj5vPnmiR796HjxxcxzX3klGI89lvs1E4nMLZzT
TivgiwD8w7VQUFlnnRWM1tbc80ZGpJdfTl/Vv/lm5rmbNwcjnzlzMrdwksnCvg4gYh3dKa358ZbjN2JPDQ5pzY+3SKqla6EgnhIJadmyYHzyk7nnvv++tG1betgPZTil+J13pF/8Ihj5LF6cHvYLF9LCQSx846ntx8N71PCI0zee2k6AwyMzZ0qXXRaMfPbtS2/h7NiRee6uXcF4/PHcr2mWHvQrV0qnn17AFwEU5sD7wwVtLxQBjviZP1+67rpg5DIyEvTfxx5X39MTnEk7kXNSd3cw8pk9Oz3sL7iAFg5iJ9YBblbi0+lRXRKJ4AzV886TPvGJ3HOHhqTt29NbOIcOpc89cEB69tlg5NPUlB72ixbRwkFFxDrAP3P5ouBqhECx6uqklpZg5DMwkN7C2b4989zdu4PR0ZH/dVeuTG/hzJ49qfKBTIoKcDPbLemgpBFJRzOdKVSMe9uaJUmPbNqjEeeUMNMIS3KU27x50oc/HIxcjh2TXn01fVWfSmWev2VLMNavz/26p5+evqq/8MLggmjAGKVYgV/jnHurBK+T0b1tzceDXOLkHsTItGnSuecG49Zbc88d28IZu7p/7730ue++Kz33XDDyOfvs9FV9UxMtnBoR6xYKUDUKaeG89Za0dev4VX1vb+a5r70WjCeeyP+6K1akt3HmzJnsV4AYKjbAnaSfm5mT9N/OuQcnTjCzOyXdKUmLFi0q8u2AGjB3rvTBDwYjl2PHgv77xBbOnj2Z52/dGozvfS/36556anoLZ/lyacaMgr4MlF+xAX6lc67fzM6UtNHMXnTOPT92QhjqD0rB1QiLfD8Ao6ZNk5YsCcbq1bnn/ulP0h/+kB72Bw+mzz14UPrVr4KRz8KF6WG/eDEtnAopKsCdc/3hx/1m9rikyyQ9n/tfAai4k0+WLrkkGPm8/XZ6C2fr1sxz9+wJxlPp96xN09yc3q8/44zJfgXIYMoBbmazJE1zzh0MH18n6R9LVhmAaJxxhnTNNcHIxbmg/z5xVf/aa5nn9/YGI18LZ9as9FX9RRcFP4QwTjEr8PmSHrfgV6Xpkn7gnPvfklQFIP7MgiNempqktrbccw8fDi6HMDHs3303fe6hQ9Kvfx2MfBob08N+yZKgvVQDphzgzrlXJa0sYS0AqtWMGScCNp8DB9JPpNqyJfPcVCoYT0/i8OLly9PDfu7cSZUfVxxGCCBeZs+Wrr46GLmMtnBGT5AaDftduzLP3749GA8/nPt1M7Vwli8PDgWNGQIcgJ/GtnBuuSX33CNHgpuSTGzhHDiQPreQFk5DQ3rYn3tuxVo4NRHgtdENA5DVSScFJzKtWCF99rO55w4Oph+F09OT+cp6e/cG42c/y/hSu8OPXY0X6DO3/bMOTy/t5RBqIsCPRV0AAH/U10tXXRWMXJwLDqGcGPQZWjgtqR2ad+iA3jh9fklLrYkAT3BSAYBSMwsuHbxokXTzzRmnlPvaTTXRXeAKhgCqUU0E+DQW4ACqUOxbKNc+8Evt3J/hrikFOMYCHEAVivUKvBThDQDVKtYBTngD8Fm2AyhKdWBFrAMcAHz2qcsXFrS9UDUR4PV1yahLAICSi3WALz1zVtGvkZxmuufm5SWoBgAK88imzHdHyra9ULEO8MuXFH+x96a5M9W2qrEE1QBAYbKdg1Kqc1NiHeCl+Cm1c/8hfeZbvylBNQBQmJr+I2apfkr9+pV3SvI6AFCImv4jJtcwAeCzXQPvFbS9ULEO8FL9lAKAKGT77b9UXYFYB/i9bc26/YpFx1fiU12Rz5ge6y8TAKYk9tdCubetWfe2NR9/PpXLMx45yhXBAVSfmliaLqiP373sAKBYVR/gdcmE1rQui7oMACi52LdQClVfl9SsGdPVPzikBfV1WtO6jBN5AFSl2Af44rVPa7JHg9clE7rn5uUENoCaEOsWSiHhLUn3rW4mvAHUjFgHeKHnYRLeAGpJrAMcAJBd1QQ4p90DqDWxDvBCIpnT7gHUmlgH
+K77b5p0iD/829fLWgsAxE3sDyPcdf9N455nO5W+NBeeBQB/xHoFXqiO7lTUJQBAxVRVgLd39kVdAgBUTOxbKBOZsrdL+geH1NGdUntnH6fSA6h63q3AJ/bEx6qfmdS6Db1KDQ7JSUoNDmndhl5aKwCqkncBLknf/OTFqksmxm2rSybknDQ0PDJu+9DwCK0VAJHIdhRdqc5a8TLA21Y16r7VzWqsr5NJaqyv032rm/Xu0HDG+f2DQ5UtEAAk/dk5cwraXijveuCj2lY1pvW22zv7lMoQ1tzQAUAUtvcfLGh7oYpagZvZ9WbWZ2Yvm9naklRUhDWtyzK2VrihA4AoDGbpCmTbXqgpr8DNLCHpPyVdK+kNSb8zsyedc38oSWVTMLoi5ygUALWgmBbKZZJeds69Kklm9kNJt0iKLMClzK0VAIjC7JlJHXg/fbU9e2ayJK9fTAulUdKeMc/fCLeNY2Z3mlmXmXUNDAwU8XYA4Jevf2S5konxx5wkE6avf2R5SV6/mADPdCRM2jk2zrkHnXMtzrmWefPmFfF2AOCXtlWNar915bgj5tpvXVmyLkExLZQ3JI29husHJPUXVw4AVJdytnWLWYH/TtJSM1tsZidJuk3Sk6UpCwCQz5RX4M65o2b2RUmdkhKSHnLObS9ZZQCAnIo6kcc591NJPy1RLQCAAnh5Kj0AgAAHAG+Zc5W7GZmZDUh6rcB/NlfSW2Uop1hxrUuKb23UVbi41kZdhSumtrOdc2nHYVc0wKfCzLqccy1R1zFRXOuS4lsbdRUurrVRV+HKURstFADwFAEOAJ7yIcAfjLqALOJalxTf2qircHGtjboKV/LaYt8DBwBk5sMKHACQAQEOAJ6KbYDH7XZtZrbbzHrNrMfMusJtc8xso5ntDD/OrkAdD5nZfjPbNmZb1jrMbF24D/vMrLXCdd1jZqlwn/WY2Y0R1LXQzJ41sx1mtt3MvhRuj8M+y1ZbpPvNzE42sxfMbEtY1zfC7ZHusxx1Rf59Fr5Xwsy6zewn4fPy7y/nXOyGgotjvSJpiaSTJG2RdGHENe2WNHfCtn+VtDZ8vFbSv1SgjqskXSJpW746JF0Y7rsZkhaH+zRRwbrukfR3GeZWsq4GSZeEj0+V9FL4/nHYZ9lqi3S/KbjW/ynh46SkTZKuiHqf5agr8u+z8P2+LOkHkn4SPi/7/orrCvz47dqcc0ckjd6uLW5ukbQ+fLxeUlu539A597ykdyZZxy2SfuicO+yc2yXpZQX7tlJ1ZVPJuvY6534fPj4oaYeCO0fFYZ9lqy2bitTmAu+FT5PhcIp4n+WoK5uK/bc0sw9IuknS/0x4/7Lur7gG+KRu11ZhTtLPzWyzmd0ZbpvvnNsrBf8zSjozotqy1RGH/fhFM9satlhGf4WMpC4za5K0SsHKLVb7bEJtUsT7LWwH9EjaL2mjcy4W+yxLXVL032fflPT3ko6N2Vb2/RXXAJ/U7doq7Ern3CWSbpD0BTO7KuJ6JiPq/fhfks6RdLGkvZL+Pdxe8brM7BRJj0m6yzn3x1xTM2yrdG2R7zfn3Ihz7mIFd9q6zMwuyjE96roi3V9m9leS9jvnNk/2n2TYNqW64hrgsbtdm3OuP/y4X9LjCn7l2WdmDZIUftwfUXnZ6oh0Pzrn9oX/wx2T9C2d+DWxonWZWVJBQD7snNsQbo7FPstUW1z2W1jLoKRfSrpeMdlnE+uKwf66UtLNZrZbQbv3g2b2fVVgf8U1wGN1uzYzm2Vmp44+lnSdpG1hTXeE0+6Q9EQ0FWat40lJt5nZDDNbLGmppBcqVdToN2/oowr2WUXrMjOT9G1JO5xzD4z5VOT7LFttUe83M5tnZvXh4zpJH5b0oiLeZ9nqinp/OefWOec+4JxrUpBVv3DO3a5K7K9y/UW22CHpRgV/lX9F0tcirmWJgr8ab5G0fbQeSWdIekbSzvDjnArU8oiCXxOHFfwk/3yuOiR9LdyHfZJuqHBd35PUK2lr+E3b
EEFdf67g19OtknrCcWNM9lm22iLdb5JWSOoO33+bpH/I9/0ecV2Rf5+Neb+rdeIolLLvL06lBwBPxbWFAgDIgwAHAE8R4ADgKQIcADxFgAOApwhwAPAUAQ4Anvp/sPu7UJWpz1MAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Define a function to generate values of y from a list of x, \n",
+ "# Given parameters a and b\n",
+ "\n",
+ "def gen_y(x_list, a, b):\n",
+ " y_gen = []\n",
+ " for x_i in x_list:\n",
+ " y_i = a + b*x_i\n",
+ " y_gen.append(y_i)\n",
+ "\n",
+ " return(y_gen)\n",
+ "\n",
+ "#Generates y-values for given x-values based on parameters a, b\n",
+ "y_gen2 = gen_y(X, a, b)\n",
+ "\n",
+ "#Plot the graph\n",
+ "plt.scatter(X, Y)\n",
+ "plt.plot(X, y_gen2, color='red')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Assess our results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 1.56, -2.45, -0.91, ..., -1.35, -4.22, 0.61])"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "errors = np.array(Y - y_gen2)\n",
+ "np.round(errors, 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAD7CAYAAABgzo9kAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQI0lEQVR4nO3db4hdeX3H8ffHrK6tVUy6k5AmoUlLapstuMoQLQuyNurGbjHbB4EILUMJxAexKBRq0ie2DwJpof8edIVUrQNVw9R2SVCxxrSLFKxxVre6yRqSumsyJk3GFbG2EMn22wdztl6Tmbl3/tydyS/vFwznnO/9nXO+c5h8cubce86kqpAktedlK92AJGk4DHhJapQBL0mNMuAlqVEGvCQ1yoCXpEb1Dfgkr0vyVM/XD5K8P8m6JKeSXOima3vWOZzkYpLzSR4e7rcgSZpNFvI5+CRrgO8AbwIOAt+rqqNJDgFrq+oDSXYAnwR2Aj8HfAH4pap6Ydm7lyTN6Z4Fjt8F/EdVfTvJHuChrj4OPAF8ANgDHK+qG8CzSS4yE/Zfmmuj9913X23dunWBrUjS3e3JJ5/8blWNzPX6QgN+HzNn5wAbquoqQFVdTbK+q28C/q1nnamuNqetW7cyOTm5wFYk6e6W5NvzvT7wm6xJXgG8C/j7fkNnqd12HSjJgSSTSSanp6cHbUOSNKCFfIrmncBXq+pat3wtyUaAbnq9q08BW3rW2wxcuXVjVXWsqkaranRkZM7fMCRJi7SQgH83P748A3ASGOvmx4ATPfV9Se5Nsg3YDpxZaqOSpIUZ6Bp8kp8G3g68p6d8FJhIsh+4BOwFqKqzSSaAc8BN4KCfoJGkl95AAV9V/wP87C2155n5VM1s448AR5bcnSRp0byTVZIaZcBLUqMMeElqlAEvSY1a6J2susttPfSZFdnvc0cfWZH9Sncyz+AlqVEGvCQ1yoCXpEYZ8JLUKN9kvQOt1Budku4snsFLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEDBXyS1yb5VJJvJnkmya8lWZfkVJIL3XRtz/jDSS4mOZ/k4eG1L0may6Bn8H8FfK6qfhl4PfAMcAg4XVXbgdPdMkl2APuA+4HdwGNJ1ix345Kk+fUN+CSvAd4CfASgqn5UVd8H9gDj3bBx4NFufg9wvKpuVNWzwEVg5/K2LUnqZ5Az+F8ApoG/TfK1JB9O8ipgQ1VdBeim67vxm4DLPetPdTVJ0ktokIC/B3gj8KGqegPw33SXY+aQWWp126DkQJLJJJPT09MDNStJGtwgAT8FTFXVl7vlTzET+NeSbAToptd7xm/pWX8zcOXWjVbVsaoararRkZGRxfYvSZpD34Cvqv8ELid5XVfaBZwDTgJjXW0MONHNnwT2Jbk3yTZgO3BmWbuWJPU16N9k/T3g40leAXwL+F1m/nOYSLIfuATsBaiqs0kmmPlP4CZwsKpeWPbOJUnzGijgq+opYHSWl3bNMf4IcGTxbUmSlso7WSWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMGCvgkzyX5RpKnkkx2tXVJTiW50E3X9ow/nORikvNJHh5W85KkuS3kDP6tVfVAVY12y4eA01W1HTjdLZNkB7APuB/YDTyWZM0y9ixJGsBSLtHsAca7+XHg0Z768aq6UVXPAheBnUvYjyRpEQYN+AI+n+TJJAe62oaqugrQTdd39U3A5Z51p7raT0hyIMlkksnp6enFdS9JmtM9A457sKquJFkPnEryzXnGZpZa3VaoOgYcAxgdHb3tdUnS0gx0Bl9VV7rpdeBxZi65XEuyEaCbXu+GTwFbelbfDFxZroYlSYPp
G/BJXpXk1S/OA+8AngZOAmPdsDHgRDd/EtiX5N4k24DtwJnlblySNL9BLtFsAB5P8uL4T1TV55J8BZhIsh+4BOwFqKqzSSaAc8BN4GBVvTCU7iVJc+ob8FX1LeD1s9SfB3bNsc4R4MiSu5MkLZp3skpSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYNHPBJ1iT5WpJPd8vrkpxKcqGbru0ZezjJxSTnkzw8jMYlSfNbyBn8+4BnepYPAaerajtwulsmyQ5gH3A/sBt4LMma5WlXkjSogQI+yWbgEeDDPeU9wHg3Pw482lM/XlU3qupZ4CKwc1m6lSQNbNAz+L8E/gD4357ahqq6CtBN13f1TcDlnnFTXe0nJDmQZDLJ5PT09EL7liT10Tfgk/wmcL2qnhxwm5mlVrcVqo5V1WhVjY6MjAy4aUnSoO4ZYMyDwLuS/AbwSuA1Sf4OuJZkY1VdTbIRuN6NnwK29Ky/GbiynE1LkvrrewZfVYeranNVbWXmzdN/rqrfBk4CY92wMeBEN38S2Jfk3iTbgO3AmWXvXJI0r0HO4OdyFJhIsh+4BOwFqKqzSSaAc8BN4GBVvbDkTiVJC7KggK+qJ4AnuvnngV1zjDsCHFlib5KkJfBOVklqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1Ki+AZ/klUnOJPn3JGeT/HFXX5fkVJIL3XRtzzqHk1xMcj7Jw8P8BiRJsxvkDP4G8OtV9XrgAWB3kjcDh4DTVbUdON0tk2QHsA+4H9gNPJZkzRB6lyTNo2/A14wfdosv774K2AOMd/Vx4NFufg9wvKpuVNWzwEVg53I2LUnqb6Br8EnWJHkKuA6cqqovAxuq6ipAN13fDd8EXO5Zfaqr3brNA0kmk0xOT08v4VuQJM3mnkEGVdULwANJXgs8nuRX5xme2TYxyzaPAccARkdHb3td6rX10GdWbN/PHX1kxfYtLcWCPkVTVd8HnmDm2vq1JBsBuun1btgUsKVntc3AlaU2KklamEE+RTPSnbmT5KeAtwHfBE4CY92wMeBEN38S2Jfk3iTbgO3AmWXuW5LUxyCXaDYC490nYV4GTFTVp5N8CZhIsh+4BOwFqKqzSSaAc8BN4GB3iUeS9BLqG/BV9XXgDbPUnwd2zbHOEeDIkruTJC2ad7JKUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekho10B/d1uxW8g9BS1I/nsFLUqMMeElqlAEvSY0y4CWpUQa8JDWqb8An2ZLkX5I8k+Rskvd19XVJTiW50E3X9qxzOMnFJOeTPDzMb0CSNLtBzuBvAr9fVb8CvBk4mGQHcAg4XVXbgdPdMt1r+4D7gd3AY0nWDKN5SdLc+gZ8VV2tqq928/8FPANsAvYA492wceDRbn4PcLyqblTVs8BFYOcy9y1J6mNB1+CTbAXeAHwZ2FBVV2HmPwFgfTdsE3C5Z7Wprnbrtg4kmUwyOT09vYjWJUnzGTjgk/wM8A/A+6vqB/MNnaVWtxWqjlXVaFWNjoyMDNqGJGlAAwV8kpczE+4fr6p/7MrXkmzsXt8IXO/qU8CWntU3A1eWp11J0qAG+RRNgI8Az1TVn/e8dBIY6+bHgBM99X1J7k2yDdgOnFm+liVJgxjkYWMPAr8DfCPJU13tD4GjwESS/cAlYC9AVZ1NMgGcY+YTOAer6oXlblySNL++AV9V/8rs19UBds2xzhHgyBL6kiQtkXeySlKjDHhJapQBL0mN
MuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDWqb8An+WiS60me7qmtS3IqyYVuurbntcNJLiY5n+ThYTUuSZrfIGfwHwN231I7BJyuqu3A6W6ZJDuAfcD93TqPJVmzbN1KkgbWN+Cr6ovA924p7wHGu/lx4NGe+vGqulFVzwIXgZ3L06okaSEWew1+Q1VdBeim67v6JuByz7ipriZJeokt95usmaVWsw5MDiSZTDI5PT29zG1IkhYb8NeSbATopte7+hSwpWfcZuDKbBuoqmNVNVpVoyMjI4tsQ5I0l8UG/ElgrJsfA0701PcluTfJNmA7cGZpLUqSFuOefgOSfBJ4CLgvyRTwQeAoMJFkP3AJ2AtQVWeTTADngJvAwap6YUi9S5Lm0Tfgq+rdc7y0a47xR4AjS2lKkrR03skqSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1qu+zaO4EWw99ZqVbkKRVxzN4SWpUE2fw0jCt1G+Izx19ZEX2q3Z4Bi9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElq1NACPsnuJOeTXExyaFj7kSTNbih3siZZA/w18HZgCvhKkpNVdW4Y+5NatJLPWPIu2jYM6wx+J3Cxqr5VVT8CjgN7hrQvSdIshvUsmk3A5Z7lKeBNQ9qXpGV2tz1/p9XfloYV8JmlVj8xIDkAHOgWf5jk/JB6WYz7gO+udBN3AI9Tfx6jwdwHfDd/stJtvPQW8D3P9rP08/OtMKyAnwK29CxvBq70DqiqY8CxIe1/SZJMVtXoSvex2nmc+vMYDcbj1N9ijtGwrsF/BdieZFuSVwD7gJND2pckaRZDOYOvqptJ3gv8E7AG+GhVnR3GviRJsxvaH/yoqs8Cnx3W9odsVV46WoU8Tv15jAbjcepvwccoVdV/lCTpjuOjCiSpUQb8HJL8UZLvJHmq+/qNle5ptfAxFINJ8lySb3Q/P5Mr3c9qkeSjSa4nebqnti7JqSQXuunalexxpc1xjBacSQb8/P6iqh7ovu7U9xOWVc9jKN4J7ADenWTHyna1qr21+/nxI4A/9jFg9y21Q8DpqtoOnO6W72Yf4/ZjBAvMJANeC+VjKLQkVfVF4Hu3lPcA4938OPDoS9nTajPHMVowA35+703y9e7Xpbv6V8Yesz2GYtMK9bLaFfD5JE92d25rbhuq6ipAN12/wv2sVgvKpLs64JN8IcnTs3ztAT4E/CLwAHAV+LOV7HUV6fsYCv2/B6vqjcxczjqY5C0r3ZDuaAvOpKF9Dv5OUFVvG2Rckr8BPj3kdu4UfR9DoRlVdaWbXk/yODOXt764sl2tWteSbKyqq0k2AtdXuqHVpqquvTg/aCbd1Wfw8+l+yF70W8DTc429y/gYigEkeVWSV784D7wDf4bmcxIY6+bHgBMr2MuqtJhMuqvP4Pv40yQPMHP54TngPSvazSrhYygGtgF4PAnM/Dv7RFV9bmVbWh2SfBJ4CLgvyRTwQeAoMJFkP3AJ2LtyHa68OY7RQwvNJO9klaRGeYlGkhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1Kj/AyUAO9UuIWpNAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.hist(errors)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Method 2: A multivariate linear regression model using sklearn"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Show our calculations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Plot our results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Assess our results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Method 3: A multivariate linear regression model using statsmodels"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Show our calculations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Plot our results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Assess our results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Conclusion "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### What we accomplished. \n",
+ "### What we learnt.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Linear Regression team TS2.ipynb b/Linear Regression team TS2.ipynb
new file mode 100644
index 0000000..b2b7ac9
--- /dev/null
+++ b/Linear Regression team TS2.ipynb
@@ -0,0 +1,2115 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Regression Model that Predicts Apple Prices Based on Historical Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Table Of Contents\n",
+ "### 1. [Introduction](#introduction)\n",
+ "\n",
+ " 1. Objective\n",
+ " 2. Parameters\n",
+ " 3. Outline\n",
+ " \n",
+ "### 2. [Importing Data and Plotting](#import)\n",
+ "\n",
+ " 1. Import necessary packages\n",
+ " 2. Import the data into a Pandas Dataframe\n",
+ " 3. Show the data\n",
+ " 4. Make an initial plot of the data\n",
+ "\n",
+ "### 3. [Exploratory Data Analysis](#explore)\n",
+ "### 4. [Split Data: Testing and training](#split)\n",
+ "### 5. [Outliers](#outliers)\n",
+ "### 6. [Regression Model](#regress)\n",
+ " 1. Taking estimates\n",
+ " 2. Least squares\n",
+ " 3. sklearn\n",
+ "### 7. [Conclusion](#conclude)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Introduction "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.1. Objective \n",
+ "In this notebook we will design a regression model that will predict the cost of apples based on given parameters."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.2. Parameters\n",
+ "1. Month/Season\n",
+ "2. Distance travelled\n",
+ "3. Supplier cost\n",
+ "4. Grade of apple\n",
+ "5. Demand and Supply\n",
+ "6. Container used?\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Importing Data and Plotting "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.1. Import necessary packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# These packages will be mainly used for data wrangling\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# These packages will be mostly used for plotting the data\n",
+ "from matplotlib import pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# These packages will be mostly used to build our linear regression model so that we can make predictions from it.\n",
+ "import statsmodels as sm\n",
+ "import sklearn as skl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.2. Import the data into a Pandas Dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#import train and test sets into DataFrames\n",
+ "sample_submission = pd.DataFrame(pd.read_csv(\"sample_submission.csv\"))\n",
+ "test_set = pd.DataFrame(pd.read_csv(\"df-test_set.csv\"))\n",
+ "train_set = pd.DataFrame(pd.read_csv(\"df-train_set.csv\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Filter for Commodity of interest in train set\n",
+ "train_set = train_set[train_set[\"Commodities\"]==\"APPLE GOLDEN DELICIOUS\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.3. Show the data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.1 Showing the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-09 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 8.51 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " CAPE | \n",
+ " JG110 | \n",
+ " 2M | \n",
+ " 11.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-14 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " JE090 | \n",
+ " 2S | \n",
+ " 9.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-04-16 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 6.11 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " CAPE | \n",
+ " M4183 | \n",
+ " 1S | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 4.51 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " EASTERN CAPE | \n",
+ " IA400 | \n",
+ " 1S | \n",
+ " 400.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-09-28 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg \\\n",
+ "1 CAPE M4183 1L 18.3 \n",
+ "7 CAPE JG110 2M 11.0 \n",
+ "24 W.CAPE-BERGRIVER ETC JE090 2S 9.0 \n",
+ "40 CAPE M4183 1S 18.3 \n",
+ "69 EASTERN CAPE IA400 1S 400.0 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "1 APPLE GOLDEN DELICIOUS 2020-09-09 150.0 170.0 51710.0 \n",
+ "7 APPLE GOLDEN DELICIOUS 2020-04-14 50.0 50.0 16000.0 \n",
+ "24 APPLE GOLDEN DELICIOUS 2020-04-16 55.0 55.0 990.0 \n",
+ "40 APPLE GOLDEN DELICIOUS 2020-05-04 80.0 120.0 32020.0 \n",
+ "69 APPLE GOLDEN DELICIOUS 2020-09-28 1800.0 1800.0 1800.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "1 332 6075.6 822 8.51 \n",
+ "7 320 3520.0 0 4.55 \n",
+ "24 18 162.0 1506 6.11 \n",
+ "40 388 7100.4 443 4.51 \n",
+ "69 1 400.0 2 4.50 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our train_set dataframe.\n",
+ "train_set.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1952, 13)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The filtered dataframe has 1952 rows and 13 columns.\n",
+ "train_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 1952 entries, 1 to 64310\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Province 1952 non-null object \n",
+ " 1 Container 1952 non-null object \n",
+ " 2 Size_Grade 1952 non-null object \n",
+ " 3 Weight_Kg 1952 non-null float64\n",
+ " 4 Commodities 1952 non-null object \n",
+ " 5 Date 1952 non-null object \n",
+ " 6 Low_Price 1952 non-null float64\n",
+ " 7 High_Price 1952 non-null float64\n",
+ " 8 Sales_Total 1952 non-null float64\n",
+ " 9 Total_Qty_Sold 1952 non-null int64 \n",
+ " 10 Total_Kg_Sold 1952 non-null float64\n",
+ " 11 Stock_On_Hand 1952 non-null int64 \n",
+ " 12 avg_price_per_kg 1952 non-null float64\n",
+ "dtypes: float64(6), int64(2), object(5)\n",
+ "memory usage: 213.5+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "train_set.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The info summary above shows 1952 entries with the following data types: six float columns, two integer columns and five object columns. All columns show zero null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1M | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-07-09 | \n",
+ " 128.0 | \n",
+ " 136.0 | \n",
+ " 5008.0 | \n",
+ " 38 | \n",
+ " 456.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1X | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-01-20 | \n",
+ " 220.0 | \n",
+ " 220.0 | \n",
+ " 1760.0 | \n",
+ " 8 | \n",
+ " 146.4 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " EC120 | \n",
+ " 1S | \n",
+ " 12.0 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-08-19 | \n",
+ " 120.0 | \n",
+ " 120.0 | \n",
+ " 720.0 | \n",
+ " 6 | \n",
+ " 72.0 | \n",
+ " 45 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1M | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-06 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 160.0 | \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " W.CAPE-BERGRIVER ETC | \n",
+ " M4183 | \n",
+ " 1L | \n",
+ " 18.3 | \n",
+ " APPLE GOLDEN DELICIOUS | \n",
+ " 2020-05-04 | \n",
+ " 140.0 | \n",
+ " 160.0 | \n",
+ " 14140.0 | \n",
+ " 100 | \n",
+ " 1830.0 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Province Container Size_Grade Weight_Kg \\\n",
+ "0 1 W.CAPE-BERGRIVER ETC EC120 1M 12.0 \n",
+ "1 2 W.CAPE-BERGRIVER ETC M4183 1X 18.3 \n",
+ "2 3 W.CAPE-BERGRIVER ETC EC120 1S 12.0 \n",
+ "3 4 W.CAPE-BERGRIVER ETC M4183 1M 18.3 \n",
+ "4 5 W.CAPE-BERGRIVER ETC M4183 1L 18.3 \n",
+ "\n",
+ " Commodities Date Low_Price High_Price Sales_Total \\\n",
+ "0 APPLE GOLDEN DELICIOUS 2020-07-09 128.0 136.0 5008.0 \n",
+ "1 APPLE GOLDEN DELICIOUS 2020-01-20 220.0 220.0 1760.0 \n",
+ "2 APPLE GOLDEN DELICIOUS 2020-08-19 120.0 120.0 720.0 \n",
+ "3 APPLE GOLDEN DELICIOUS 2020-05-06 160.0 160.0 160.0 \n",
+ "4 APPLE GOLDEN DELICIOUS 2020-05-04 140.0 160.0 14140.0 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "0 38 456.0 0 \n",
+ "1 8 146.4 2 \n",
+ "2 6 72.0 45 \n",
+ "3 1 18.3 8 \n",
+ "4 100 1830.0 19 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Viewing the first five rows of our test_set dataframe.\n",
+ "test_set.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 40.460912 | \n",
+ " 174.307377 | \n",
+ " 215.648053 | \n",
+ " 20053.533811 | \n",
+ " 174.510758 | \n",
+ " 2960.176332 | \n",
+ " 408.393955 | \n",
+ " 6.778893 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 99.655169 | \n",
+ " 373.553578 | \n",
+ " 433.546159 | \n",
+ " 39005.069445 | \n",
+ " 308.810797 | \n",
+ " 6097.416527 | \n",
+ " 724.450582 | \n",
+ " 2.248744 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 3.000000 | \n",
+ " 2.000000 | \n",
+ " 5.000000 | \n",
+ " 5.000000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 0.250000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 60.000000 | \n",
+ " 1325.000000 | \n",
+ " 12.000000 | \n",
+ " 219.600000 | \n",
+ " 9.000000 | \n",
+ " 5.460000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 108.000000 | \n",
+ " 5495.000000 | \n",
+ " 64.000000 | \n",
+ " 853.500000 | \n",
+ " 126.500000 | \n",
+ " 6.670000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 18.300000 | \n",
+ " 127.250000 | \n",
+ " 160.000000 | \n",
+ " 21082.500000 | \n",
+ " 200.000000 | \n",
+ " 3093.525000 | \n",
+ " 468.000000 | \n",
+ " 8.280000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 400.000000 | \n",
+ " 2300.000000 | \n",
+ " 3300.000000 | \n",
+ " 369464.000000 | \n",
+ " 4237.000000 | \n",
+ " 74000.000000 | \n",
+ " 6400.000000 | \n",
+ " 21.240000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 1952.000000 1952.000000 1952.000000 1952.000000 1952.000000 \n",
+ "mean 40.460912 174.307377 215.648053 20053.533811 174.510758 \n",
+ "std 99.655169 373.553578 433.546159 39005.069445 308.810797 \n",
+ "min 3.000000 2.000000 5.000000 5.000000 1.000000 \n",
+ "25% 9.000000 50.000000 60.000000 1325.000000 12.000000 \n",
+ "50% 12.000000 80.000000 108.000000 5495.000000 64.000000 \n",
+ "75% 18.300000 127.250000 160.000000 21082.500000 200.000000 \n",
+ "max 400.000000 2300.000000 3300.000000 369464.000000 4237.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 1952.000000 1952.000000 1952.000000 \n",
+ "mean 2960.176332 408.393955 6.778893 \n",
+ "std 6097.416527 724.450582 2.248744 \n",
+ "min 3.000000 0.000000 0.250000 \n",
+ "25% 219.600000 9.000000 5.460000 \n",
+ "50% 853.500000 126.500000 6.670000 \n",
+ "75% 3093.525000 468.000000 8.280000 \n",
+ "max 74000.000000 6400.000000 21.240000 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Province | \n",
+ " Container | \n",
+ " Size_Grade | \n",
+ " Weight_Kg | \n",
+ " Commodities | \n",
+ " Date | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " avg_price_per_kg | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " False | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ " 1952 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Province Container Size_Grade Weight_Kg Commodities \\\n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 1952 1952 \n",
+ "\n",
+ " Date Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 1952 1952 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "avg_price_per_kg \n",
+ "False 1952 1952 1952 "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Counting the Number of rows with INF\n",
+ "train_set.groupby(np.isinf(train_set['avg_price_per_kg'])).count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ " 1952.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 40.460912 | \n",
+ " 174.307377 | \n",
+ " 215.648053 | \n",
+ " 20053.533811 | \n",
+ " 174.510758 | \n",
+ " 2960.176332 | \n",
+ " 408.393955 | \n",
+ " 6.778893 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 99.655169 | \n",
+ " 373.553578 | \n",
+ " 433.546159 | \n",
+ " 39005.069445 | \n",
+ " 308.810797 | \n",
+ " 6097.416527 | \n",
+ " 724.450582 | \n",
+ " 2.248744 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 3.000000 | \n",
+ " 2.000000 | \n",
+ " 5.000000 | \n",
+ " 5.000000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 0.250000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 60.000000 | \n",
+ " 1325.000000 | \n",
+ " 12.000000 | \n",
+ " 219.600000 | \n",
+ " 9.000000 | \n",
+ " 5.460000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 108.000000 | \n",
+ " 5495.000000 | \n",
+ " 64.000000 | \n",
+ " 853.500000 | \n",
+ " 126.500000 | \n",
+ " 6.670000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 18.300000 | \n",
+ " 127.250000 | \n",
+ " 160.000000 | \n",
+ " 21082.500000 | \n",
+ " 200.000000 | \n",
+ " 3093.525000 | \n",
+ " 468.000000 | \n",
+ " 8.280000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 400.000000 | \n",
+ " 2300.000000 | \n",
+ " 3300.000000 | \n",
+ " 369464.000000 | \n",
+ " 4237.000000 | \n",
+ " 74000.000000 | \n",
+ " 6400.000000 | \n",
+ " 21.240000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "count 1952.000000 1952.000000 1952.000000 1952.000000 1952.000000 \n",
+ "mean 40.460912 174.307377 215.648053 20053.533811 174.510758 \n",
+ "std 99.655169 373.553578 433.546159 39005.069445 308.810797 \n",
+ "min 3.000000 2.000000 5.000000 5.000000 1.000000 \n",
+ "25% 9.000000 50.000000 60.000000 1325.000000 12.000000 \n",
+ "50% 12.000000 80.000000 108.000000 5495.000000 64.000000 \n",
+ "75% 18.300000 127.250000 160.000000 21082.500000 200.000000 \n",
+ "max 400.000000 2300.000000 3300.000000 369464.000000 4237.000000 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg \n",
+ "count 1952.000000 1952.000000 1952.000000 \n",
+ "mean 2960.176332 408.393955 6.778893 \n",
+ "std 6097.416527 724.450582 2.248744 \n",
+ "min 3.000000 0.000000 0.250000 \n",
+ "25% 219.600000 9.000000 5.460000 \n",
+ "50% 853.500000 126.500000 6.670000 \n",
+ "75% 3093.525000 468.000000 8.280000 \n",
+ "max 74000.000000 6400.000000 21.240000 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Dropping rows that contain INF (or NaN) values\n",
+ "train_set = train_set.replace([np.inf, -np.inf], np.nan)\n",
+ "\n",
+ "train_set = train_set.dropna(axis = 0)\n",
+ "\n",
+ "train_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### 2.3.2. Showing the testing data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(685, 13)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The dataframe has 685 rows and 13 columns.\n",
+ "test_set.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 685 entries, 0 to 684\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Index 685 non-null int64 \n",
+ " 1 Province 685 non-null object \n",
+ " 2 Container 685 non-null object \n",
+ " 3 Size_Grade 685 non-null object \n",
+ " 4 Weight_Kg 685 non-null float64\n",
+ " 5 Commodities 685 non-null object \n",
+ " 6 Date 685 non-null object \n",
+ " 7 Low_Price 685 non-null float64\n",
+ " 8 High_Price 685 non-null float64\n",
+ " 9 Sales_Total 685 non-null float64\n",
+ " 10 Total_Qty_Sold 685 non-null int64 \n",
+ " 11 Total_Kg_Sold 685 non-null float64\n",
+ " 12 Stock_On_Hand 685 non-null int64 \n",
+ "dtypes: float64(5), int64(3), object(5)\n",
+ "memory usage: 69.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "#The info method displays the nature of our data i.e datatypes and non-null count.\n",
+ "test_set.info() "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The info summary above shows 685 entries with the following data types: five float columns, three integer columns and five object columns. All columns show zero null values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ " 685.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 343.000000 | \n",
+ " 34.142482 | \n",
+ " 164.202891 | \n",
+ " 195.590073 | \n",
+ " 18788.111212 | \n",
+ " 174.883212 | \n",
+ " 2725.402336 | \n",
+ " 439.245255 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 197.886752 | \n",
+ " 87.575995 | \n",
+ " 355.167319 | \n",
+ " 389.109476 | \n",
+ " 33951.586813 | \n",
+ " 299.351142 | \n",
+ " 5059.123311 | \n",
+ " 715.985761 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 10.000000 | \n",
+ " 1.000000 | \n",
+ " 6.300000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 172.000000 | \n",
+ " 9.000000 | \n",
+ " 50.000000 | \n",
+ " 64.000000 | \n",
+ " 1300.000000 | \n",
+ " 13.000000 | \n",
+ " 204.000000 | \n",
+ " 20.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 343.000000 | \n",
+ " 12.000000 | \n",
+ " 80.000000 | \n",
+ " 112.000000 | \n",
+ " 5520.000000 | \n",
+ " 62.000000 | \n",
+ " 860.100000 | \n",
+ " 153.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 514.000000 | \n",
+ " 18.300000 | \n",
+ " 128.000000 | \n",
+ " 160.000000 | \n",
+ " 21176.000000 | \n",
+ " 200.000000 | \n",
+ " 3033.000000 | \n",
+ " 516.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 685.000000 | \n",
+ " 400.000000 | \n",
+ " 2400.000000 | \n",
+ " 2400.000000 | \n",
+ " 308010.000000 | \n",
+ " 2774.000000 | \n",
+ " 47200.000000 | \n",
+ " 6827.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index Weight_Kg Low_Price High_Price Sales_Total \\\n",
+ "count 685.000000 685.000000 685.000000 685.000000 685.000000 \n",
+ "mean 343.000000 34.142482 164.202891 195.590073 18788.111212 \n",
+ "std 197.886752 87.575995 355.167319 389.109476 33951.586813 \n",
+ "min 1.000000 3.000000 10.000000 10.000000 10.000000 \n",
+ "25% 172.000000 9.000000 50.000000 64.000000 1300.000000 \n",
+ "50% 343.000000 12.000000 80.000000 112.000000 5520.000000 \n",
+ "75% 514.000000 18.300000 128.000000 160.000000 21176.000000 \n",
+ "max 685.000000 400.000000 2400.000000 2400.000000 308010.000000 \n",
+ "\n",
+ " Total_Qty_Sold Total_Kg_Sold Stock_On_Hand \n",
+ "count 685.000000 685.000000 685.000000 \n",
+ "mean 174.883212 2725.402336 439.245255 \n",
+ "std 299.351142 5059.123311 715.985761 \n",
+ "min 1.000000 6.300000 0.000000 \n",
+ "25% 13.000000 204.000000 20.000000 \n",
+ "50% 62.000000 860.100000 153.000000 \n",
+ "75% 200.000000 3033.000000 516.000000 \n",
+ "max 2774.000000 47200.000000 6827.000000 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Summary statistic of each column in the dataframe.\n",
+ "test_set.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After viewing our data, we also viewed the sample submission to confirm that our response variable is the column required by the submission file on Kaggle."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Index | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 13.94 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.30 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Index avg_price_per_kg\n",
+ "0 1 13.94\n",
+ "1 2 1.30"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sample_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.4. Make an initial plot of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Exploratory Data Analysis \n",
+ "Make at least 7 plots."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.1. Explore the data shape and types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2. Look for null values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.3. Is the data univariate or multivariate?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.4. Determine kurtosis and skew"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.5. Consider the distribution of the data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.6. Look for correlation of multivariate data "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Testing the suitability of the data\n",
+ "### 4.1. Testing for linearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.2. Testing for multicollinearity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.3. Testing for independence"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.4. Testing for homoscedasticity\n",
+ "Does the magnitude of the residuals increase as the fitted values increase? This results in a cone shape, which is called heteroscedasticity. We don’t want that."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.5. Testing for normality"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4.6. Check for outliers in residuals \n",
+ "#### Plot Cook’s distance"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Transforming the data to be most suitable to use for building a multivariate linear regression model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 5.1. Transforming categorical data to numerical data "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Convert categorical data to numerical data\n",
+ "train=pd.get_dummies(train_set,drop_first=True)\n",
+ "\n",
+ "#Replace spaces in column names with underscores\n",
+ "train.columns = train.columns.str.replace(' ','_')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " avg_price_per_kg | \n",
+ " Province_EASTERN_CAPE | \n",
+ " Province_NATAL | \n",
+ " ... | \n",
+ " Date_2020-09-07 | \n",
+ " Date_2020-09-09 | \n",
+ " Date_2020-09-16 | \n",
+ " Date_2020-09-17 | \n",
+ " Date_2020-09-19 | \n",
+ " Date_2020-09-21 | \n",
+ " Date_2020-09-23 | \n",
+ " Date_2020-09-28 | \n",
+ " Date_2020-10-01 | \n",
+ " Date_2020-10-03 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 8.51 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11.0 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 9.0 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 6.11 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 18.3 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 4.51 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 400.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 4.50 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 179 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "1 18.3 150.0 170.0 51710.0 332 \n",
+ "7 11.0 50.0 50.0 16000.0 320 \n",
+ "24 9.0 55.0 55.0 990.0 18 \n",
+ "40 18.3 80.0 120.0 32020.0 388 \n",
+ "69 400.0 1800.0 1800.0 1800.0 1 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand avg_price_per_kg Province_EASTERN_CAPE \\\n",
+ "1 6075.6 822 8.51 0 \n",
+ "7 3520.0 0 4.55 0 \n",
+ "24 162.0 1506 6.11 0 \n",
+ "40 7100.4 443 4.51 0 \n",
+ "69 400.0 2 4.50 1 \n",
+ "\n",
+ " Province_NATAL ... Date_2020-09-07 Date_2020-09-09 Date_2020-09-16 \\\n",
+ "1 0 ... 0 1 0 \n",
+ "7 0 ... 0 0 0 \n",
+ "24 0 ... 0 0 0 \n",
+ "40 0 ... 0 0 0 \n",
+ "69 0 ... 0 0 0 \n",
+ "\n",
+ " Date_2020-09-17 Date_2020-09-19 Date_2020-09-21 Date_2020-09-23 \\\n",
+ "1 0 0 0 0 \n",
+ "7 0 0 0 0 \n",
+ "24 0 0 0 0 \n",
+ "40 0 0 0 0 \n",
+ "69 0 0 0 0 \n",
+ "\n",
+ " Date_2020-09-28 Date_2020-10-01 Date_2020-10-03 \n",
+ "1 0 0 0 \n",
+ "7 0 0 0 \n",
+ "24 0 0 0 \n",
+ "40 0 0 0 \n",
+ "69 1 0 0 \n",
+ "\n",
+ "[5 rows x 179 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1952, 179)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols = list(train)\n",
+ "#Move column to last index\n",
+ "cols.insert(len(cols)-1,cols.pop(cols.index(\"avg_price_per_kg\")))\n",
+ "\n",
+ "#Reorder the columns so that avg_price_per_kg comes last\n",
+ "train = train.loc[:, cols]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Weight_Kg | \n",
+ " Low_Price | \n",
+ " High_Price | \n",
+ " Sales_Total | \n",
+ " Total_Qty_Sold | \n",
+ " Total_Kg_Sold | \n",
+ " Stock_On_Hand | \n",
+ " Province_EASTERN_CAPE | \n",
+ " Province_NATAL | \n",
+ " Province_ORANGE_FREE_STATE | \n",
+ " ... | \n",
+ " Date_2020-09-09 | \n",
+ " Date_2020-09-16 | \n",
+ " Date_2020-09-17 | \n",
+ " Date_2020-09-19 | \n",
+ " Date_2020-09-21 | \n",
+ " Date_2020-09-23 | \n",
+ " Date_2020-09-28 | \n",
+ " Date_2020-10-01 | \n",
+ " Date_2020-10-03 | \n",
+ " avg_price_per_kg | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 18.3 | \n",
+ " 150.0 | \n",
+ " 170.0 | \n",
+ " 51710.0 | \n",
+ " 332 | \n",
+ " 6075.6 | \n",
+ " 822 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.51 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11.0 | \n",
+ " 50.0 | \n",
+ " 50.0 | \n",
+ " 16000.0 | \n",
+ " 320 | \n",
+ " 3520.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.55 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 9.0 | \n",
+ " 55.0 | \n",
+ " 55.0 | \n",
+ " 990.0 | \n",
+ " 18 | \n",
+ " 162.0 | \n",
+ " 1506 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6.11 | \n",
+ "
\n",
+ " \n",
+ " 40 | \n",
+ " 18.3 | \n",
+ " 80.0 | \n",
+ " 120.0 | \n",
+ " 32020.0 | \n",
+ " 388 | \n",
+ " 7100.4 | \n",
+ " 443 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.51 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 400.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1800.0 | \n",
+ " 1 | \n",
+ " 400.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4.50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 179 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Weight_Kg Low_Price High_Price Sales_Total Total_Qty_Sold \\\n",
+ "1 18.3 150.0 170.0 51710.0 332 \n",
+ "7 11.0 50.0 50.0 16000.0 320 \n",
+ "24 9.0 55.0 55.0 990.0 18 \n",
+ "40 18.3 80.0 120.0 32020.0 388 \n",
+ "69 400.0 1800.0 1800.0 1800.0 1 \n",
+ "\n",
+ " Total_Kg_Sold Stock_On_Hand Province_EASTERN_CAPE Province_NATAL \\\n",
+ "1 6075.6 822 0 0 \n",
+ "7 3520.0 0 0 0 \n",
+ "24 162.0 1506 0 0 \n",
+ "40 7100.4 443 0 0 \n",
+ "69 400.0 2 1 0 \n",
+ "\n",
+ " Province_ORANGE_FREE_STATE ... Date_2020-09-09 Date_2020-09-16 \\\n",
+ "1 0 ... 1 0 \n",
+ "7 0 ... 0 0 \n",
+ "24 0 ... 0 0 \n",
+ "40 0 ... 0 0 \n",
+ "69 0 ... 0 0 \n",
+ "\n",
+ " Date_2020-09-17 Date_2020-09-19 Date_2020-09-21 Date_2020-09-23 \\\n",
+ "1 0 0 0 0 \n",
+ "7 0 0 0 0 \n",
+ "24 0 0 0 0 \n",
+ "40 0 0 0 0 \n",
+ "69 0 0 0 0 \n",
+ "\n",
+ " Date_2020-09-28 Date_2020-10-01 Date_2020-10-03 avg_price_per_kg \n",
+ "1 0 0 0 8.51 \n",
+ "7 0 0 0 4.55 \n",
+ "24 0 0 0 6.11 \n",
+ "40 0 0 0 4.51 \n",
+ "69 1 0 0 4.50 \n",
+ "\n",
+ "[5 rows x 179 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Building the regression models \n",
+    "### Method 1: A simple linear regression model following the Least Squares Method\n",
+ "#### Calculating the slope and the intercept "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Slope = -0.0076244934807224145\n",
+ "Intercept = 7.0873874015160885\n"
+ ]
+ }
+ ],
+ "source": [
+    "# LinearRegression is imported here but not used by the manual calculation below\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "\n",
+    "# Predictor (Weight_Kg) and response (avg_price_per_kg) taken from train_set\n",
+    "X = train_set['Weight_Kg']\n",
+    "Y = train_set['avg_price_per_kg']\n",
+    "\n",
+    "# Sample means of the predictor and the response\n",
+    "x_bar = np.mean(X)\n",
+    "y_bar = np.mean(Y)\n",
+    "\n",
+    "# Deviation of every observation from its mean\n",
+    "x_dev = X - x_bar\n",
+    "y_dev = Y - y_bar\n",
+    "\n",
+    "# Least-squares slope: sum of cross-deviations over sum of squared x-deviations\n",
+    "b = sum(x_dev * y_dev) / sum(x_dev**2)\n",
+    "\n",
+    "# Least-squares intercept: a = y_bar - b * x_bar\n",
+    "a = y_bar - b*x_bar\n",
+    "\n",
+    "print(f\"Slope = {b}\")\n",
+    "print(f\"Intercept = {a}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Plot our regression line on a scatter plot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "