-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
258 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": { | ||
"collapsed": true, | ||
"pycharm": { | ||
"is_executing": false | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import jenkspy\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"from sklearn import datasets\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"from scipy.cluster.hierarchy import dendrogram, linkage\n", | ||
"import numpy as np\n", | ||
"from scipy.cluster.hierarchy import fcluster\n", | ||
"import matplotlib.cm as cm\n", | ||
"import plot_utils\n", | ||
"import seaborn as sns\n", | ||
"import statistics\n", | ||
"from sklearn import tree, metrics, preprocessing\n", | ||
"from sklearn.datasets import load_iris\n", | ||
"from sklearn.metrics import accuracy_score, silhouette_score, silhouette_samples\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.naive_bayes import MultinomialNB\n", | ||
"from sklearn.tree import DecisionTreeClassifier, export_graphviz\n", | ||
"from sklearn.cluster import KMeans\n", | ||
"from sklearn.decomposition import PCA\n", | ||
"from scipy import stats\n", | ||
"from sklearn.cluster import KMeans" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol quality sugar\n0 8.40 4 5.9\n1 8.50 6 5.9\n2 8.50 6 18.0\n3 8.50 6 18.0\n4 8.50 5 9.1\n... ... ... ...\n1804 13.80 6 4.6\n1805 13.90 7 2.6\n1806 14.00 7 1.9\n1807 14.00 7 1.9\n1808 14.05 7 8.4\n\n[1809 rows x 3 columns]", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>quality</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8.40</td>\n <td>4</td>\n <td>5.9</td>\n </tr>\n <tr>\n <th>1</th>\n <td>8.50</td>\n <td>6</td>\n <td>5.9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>8.50</td>\n <td>6</td>\n <td>18.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>8.50</td>\n <td>6</td>\n <td>18.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>8.50</td>\n <td>5</td>\n <td>9.1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1804</th>\n <td>13.80</td>\n <td>6</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>1805</th>\n <td>13.90</td>\n <td>7</td>\n <td>2.6</td>\n </tr>\n <tr>\n <th>1806</th>\n <td>14.00</td>\n <td>7</td>\n <td>1.9</td>\n </tr>\n <tr>\n <th>1807</th>\n <td>14.00</td>\n <td>7</td>\n <td>1.9</td>\n </tr>\n <tr>\n <th>1808</th>\n <td>14.05</td>\n <td>7</td>\n <td>8.4</td>\n </tr>\n </tbody>\n</table>\n<p>1809 rows × 3 columns</p>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 19 | ||
} | ||
], | ||
"source": [ | ||
"wine_train = pd.read_csv(\"white_wine_training\")\n", | ||
"wine_train" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol sugar\n0 -1.826971 0.011679\n1 -1.743463 0.011679\n2 -1.743463 2.601695\n3 -1.743463 2.601695\n4 -1.743463 0.696642\n... ... ...\n1804 2.682434 -0.266588\n1805 2.765941 -0.694689\n1806 2.849449 -0.844525\n1807 2.849449 -0.844525\n1808 2.891203 0.546806\n\n[1809 rows x 2 columns]", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>-1.826971</td>\n <td>0.011679</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-1.743463</td>\n <td>0.011679</td>\n </tr>\n <tr>\n <th>2</th>\n <td>-1.743463</td>\n <td>2.601695</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-1.743463</td>\n <td>2.601695</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1.743463</td>\n <td>0.696642</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1804</th>\n <td>2.682434</td>\n <td>-0.266588</td>\n </tr>\n <tr>\n <th>1805</th>\n <td>2.765941</td>\n <td>-0.694689</td>\n </tr>\n <tr>\n <th>1806</th>\n <td>2.849449</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>1807</th>\n <td>2.849449</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>1808</th>\n <td>2.891203</td>\n <td>0.546806</td>\n </tr>\n </tbody>\n</table>\n<p>1809 rows × 2 columns</p>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 20 | ||
} | ||
], | ||
"source": [ | ||
"X = wine_train[['alcohol', 'sugar']]\n", | ||
"Xz = pd.DataFrame(stats.zscore(X), columns=['alcohol', 'sugar'])\n", | ||
"Xz" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": "array([1, 1, 1, ..., 0, 0, 0])" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 21 | ||
} | ||
], | ||
"source": [ | ||
"kmeans01 = KMeans(n_clusters = 2).fit(Xz)\n", | ||
"cluster = kmeans01.labels_\n", | ||
"cluster" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol sugar\ncount 1097.000000 1097.000000\nmean 0.490305 -0.623752\nstd 0.905663 0.475694\nmin -1.576448 -1.122791\n25% -0.156821 -0.951551\n50% 0.427732 -0.844525\n75% 1.179299 -0.352208\nmax 2.891203 1.477928", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>1097.000000</td>\n <td>1097.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.490305</td>\n <td>-0.623752</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.905663</td>\n <td>0.475694</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.576448</td>\n <td>-1.122791</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-0.156821</td>\n <td>-0.951551</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.427732</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.179299</td>\n <td>-0.352208</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.891203</td>\n <td>1.477928</td>\n </tr>\n </tbody>\n</table>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 22 | ||
} | ||
], | ||
"source": [ | ||
"Cluster1 = Xz.loc[cluster == 0]\n", | ||
"Cluster2 = Xz.loc[cluster == 1]\n", | ||
"Cluster1.describe()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol sugar\ncount 712.000000 712.000000\nmean -0.755428 0.961034\nstd 0.580989 0.818726\nmin -1.826971 -0.908740\n25% -1.158911 0.354160\n50% -0.908388 0.867883\n75% -0.407343 1.488630\nmax 2.014374 5.512788", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>712.000000</td>\n <td>712.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>-0.755428</td>\n <td>0.961034</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.580989</td>\n <td>0.818726</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.826971</td>\n <td>-0.908740</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-1.158911</td>\n <td>0.354160</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>-0.908388</td>\n <td>0.867883</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>-0.407343</td>\n <td>1.488630</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.014374</td>\n <td>5.512788</td>\n </tr>\n </tbody>\n</table>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 23 | ||
} | ||
], | ||
"source": [ | ||
"Cluster2.describe()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol sugar\ncount 1122.000000 1122.000000\nmean 0.456397 -0.605782\nstd 0.903287 0.459740\nmin -1.675754 -1.089453\n25% -0.218729 -0.945241\n50% 0.368129 -0.821632\n75% 1.157351 -0.285988\nmax 2.776268 1.423949", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>1122.000000</td>\n <td>1122.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.456397</td>\n <td>-0.605782</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.903287</td>\n <td>0.459740</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.675754</td>\n <td>-1.089453</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-0.218729</td>\n <td>-0.945241</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.368129</td>\n <td>-0.821632</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.157351</td>\n <td>-0.285988</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.776268</td>\n <td>1.423949</td>\n </tr>\n </tbody>\n</table>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 24 | ||
} | ||
], | ||
"source": [ | ||
"wine_test = pd.read_csv(\"white_wine_test\")\n", | ||
"X_test = wine_test[['alcohol', 'sugar']]\n", | ||
"Xz_test = pd.DataFrame(stats.zscore(X_test), \n", | ||
"columns=['alcohol', 'sugar'])\n", | ||
"kmeans_test = KMeans(n_clusters = 2).fit(Xz_test)\n", | ||
"cluster_test = kmeans_test.labels_ # Cluster membership\n", | ||
"Cluster1_test = Xz_test.loc[cluster_test == 0]\n", | ||
"Cluster2_test = Xz_test.loc[cluster_test == 1]\n", | ||
"Cluster1_test.describe()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 25, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": " alcohol sugar\ncount 638.000000 638.000000\nmean -0.802630 1.065341\nstd 0.561207 0.779670\nmin -2.080483 -1.037949\n25% -1.190079 0.396441\n50% -0.947241 1.032518\n75% -0.542512 1.583612\nmax 1.562080 3.298700", | ||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>638.000000</td>\n <td>638.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>-0.802630</td>\n <td>1.065341</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.561207</td>\n <td>0.779670</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-2.080483</td>\n <td>-1.037949</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-1.190079</td>\n <td>0.396441</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>-0.947241</td>\n <td>1.032518</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>-0.542512</td>\n <td>1.583612</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.562080</td>\n <td>3.298700</td>\n </tr>\n </tbody>\n</table>\n</div>" | ||
}, | ||
"metadata": {}, | ||
"output_type": "execute_result", | ||
"execution_count": 25 | ||
} | ||
], | ||
"source": [ | ||
"Cluster2_test.describe()" | ||
], | ||
"metadata": { | ||
"collapsed": false, | ||
"pycharm": { | ||
"name": "#%%\n", | ||
"is_executing": false | ||
} | ||
} | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
}, | ||
"pycharm": { | ||
"stem_cell": { | ||
"cell_type": "raw", | ||
"source": [], | ||
"metadata": { | ||
"collapsed": false | ||
} | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |