Skip to content

Commit

Permalink
Create Clustering_BK2_ch10.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
GhazaleZe committed May 18, 2021
1 parent 2046ff6 commit f99dcc1
Showing 1 changed file with 258 additions and 0 deletions.
258 changes: 258 additions & 0 deletions Practices/Clustering_BK2_ch10.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import jenkspy\n",
"\n",
"import pandas as pd\n",
"from sklearn import datasets\n",
"import matplotlib.pyplot as plt\n",
"from scipy.cluster.hierarchy import dendrogram, linkage\n",
"import numpy as np\n",
"from scipy.cluster.hierarchy import fcluster\n",
"import matplotlib.cm as cm\n",
"import plot_utils\n",
"import seaborn as sns\n",
"import statistics\n",
"from sklearn import tree, metrics, preprocessing\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.metrics import accuracy_score, silhouette_score, silhouette_samples\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.tree import DecisionTreeClassifier, export_graphviz\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from scipy import stats\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": " alcohol quality sugar\n0 8.40 4 5.9\n1 8.50 6 5.9\n2 8.50 6 18.0\n3 8.50 6 18.0\n4 8.50 5 9.1\n... ... ... ...\n1804 13.80 6 4.6\n1805 13.90 7 2.6\n1806 14.00 7 1.9\n1807 14.00 7 1.9\n1808 14.05 7 8.4\n\n[1809 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>quality</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>8.40</td>\n <td>4</td>\n <td>5.9</td>\n </tr>\n <tr>\n <th>1</th>\n <td>8.50</td>\n <td>6</td>\n <td>5.9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>8.50</td>\n <td>6</td>\n <td>18.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>8.50</td>\n <td>6</td>\n <td>18.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>8.50</td>\n <td>5</td>\n <td>9.1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1804</th>\n <td>13.80</td>\n <td>6</td>\n <td>4.6</td>\n </tr>\n <tr>\n <th>1805</th>\n <td>13.90</td>\n <td>7</td>\n <td>2.6</td>\n </tr>\n <tr>\n <th>1806</th>\n <td>14.00</td>\n <td>7</td>\n <td>1.9</td>\n </tr>\n <tr>\n <th>1807</th>\n <td>14.00</td>\n <td>7</td>\n <td>1.9</td>\n </tr>\n <tr>\n <th>1808</th>\n <td>14.05</td>\n <td>7</td>\n <td>8.4</td>\n </tr>\n </tbody>\n</table>\n<p>1809 rows × 3 columns</p>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 19
}
],
"source": [
"wine_train = pd.read_csv(\"white_wine_training\")\n",
"wine_train"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"data": {
"text/plain": " alcohol sugar\n0 -1.826971 0.011679\n1 -1.743463 0.011679\n2 -1.743463 2.601695\n3 -1.743463 2.601695\n4 -1.743463 0.696642\n... ... ...\n1804 2.682434 -0.266588\n1805 2.765941 -0.694689\n1806 2.849449 -0.844525\n1807 2.849449 -0.844525\n1808 2.891203 0.546806\n\n[1809 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>-1.826971</td>\n <td>0.011679</td>\n </tr>\n <tr>\n <th>1</th>\n <td>-1.743463</td>\n <td>0.011679</td>\n </tr>\n <tr>\n <th>2</th>\n <td>-1.743463</td>\n <td>2.601695</td>\n </tr>\n <tr>\n <th>3</th>\n <td>-1.743463</td>\n <td>2.601695</td>\n </tr>\n <tr>\n <th>4</th>\n <td>-1.743463</td>\n <td>0.696642</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>1804</th>\n <td>2.682434</td>\n <td>-0.266588</td>\n </tr>\n <tr>\n <th>1805</th>\n <td>2.765941</td>\n <td>-0.694689</td>\n </tr>\n <tr>\n <th>1806</th>\n <td>2.849449</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>1807</th>\n <td>2.849449</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>1808</th>\n <td>2.891203</td>\n <td>0.546806</td>\n </tr>\n </tbody>\n</table>\n<p>1809 rows × 2 columns</p>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 20
}
],
"source": [
"X = wine_train[['alcohol', 'sugar']]\n",
"Xz = pd.DataFrame(stats.zscore(X), columns=['alcohol', 'sugar'])\n",
"Xz"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": "array([1, 1, 1, ..., 0, 0, 0])"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 21
}
],
"source": [
"kmeans01 = KMeans(n_clusters = 2).fit(Xz)\n",
"cluster = kmeans01.labels_\n",
"cluster"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": " alcohol sugar\ncount 1097.000000 1097.000000\nmean 0.490305 -0.623752\nstd 0.905663 0.475694\nmin -1.576448 -1.122791\n25% -0.156821 -0.951551\n50% 0.427732 -0.844525\n75% 1.179299 -0.352208\nmax 2.891203 1.477928",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>1097.000000</td>\n <td>1097.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.490305</td>\n <td>-0.623752</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.905663</td>\n <td>0.475694</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.576448</td>\n <td>-1.122791</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-0.156821</td>\n <td>-0.951551</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.427732</td>\n <td>-0.844525</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.179299</td>\n <td>-0.352208</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.891203</td>\n <td>1.477928</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 22
}
],
"source": [
"Cluster1 = Xz.loc[cluster == 0]\n",
"Cluster2 = Xz.loc[cluster == 1]\n",
"Cluster1.describe()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": " alcohol sugar\ncount 712.000000 712.000000\nmean -0.755428 0.961034\nstd 0.580989 0.818726\nmin -1.826971 -0.908740\n25% -1.158911 0.354160\n50% -0.908388 0.867883\n75% -0.407343 1.488630\nmax 2.014374 5.512788",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>712.000000</td>\n <td>712.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>-0.755428</td>\n <td>0.961034</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.580989</td>\n <td>0.818726</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.826971</td>\n <td>-0.908740</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-1.158911</td>\n <td>0.354160</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>-0.908388</td>\n <td>0.867883</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>-0.407343</td>\n <td>1.488630</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.014374</td>\n <td>5.512788</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 23
}
],
"source": [
"Cluster2.describe()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": " alcohol sugar\ncount 1122.000000 1122.000000\nmean 0.456397 -0.605782\nstd 0.903287 0.459740\nmin -1.675754 -1.089453\n25% -0.218729 -0.945241\n50% 0.368129 -0.821632\n75% 1.157351 -0.285988\nmax 2.776268 1.423949",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>1122.000000</td>\n <td>1122.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.456397</td>\n <td>-0.605782</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.903287</td>\n <td>0.459740</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.675754</td>\n <td>-1.089453</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-0.218729</td>\n <td>-0.945241</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.368129</td>\n <td>-0.821632</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.157351</td>\n <td>-0.285988</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2.776268</td>\n <td>1.423949</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 24
}
],
"source": [
"wine_test = pd.read_csv(\"white_wine_test\")\n",
"X_test = wine_test[['alcohol', 'sugar']]\n",
"Xz_test = pd.DataFrame(stats.zscore(X_test), \n",
"columns=['alcohol', 'sugar'])\n",
"kmeans_test = KMeans(n_clusters = 2).fit(Xz_test)\n",
"cluster_test = kmeans_test.labels_ # Cluster membership\n",
"Cluster1_test = Xz_test.loc[cluster_test == 0]\n",
"Cluster2_test = Xz_test.loc[cluster_test == 1]\n",
"Cluster1_test.describe()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [
{
"data": {
"text/plain": " alcohol sugar\ncount 638.000000 638.000000\nmean -0.802630 1.065341\nstd 0.561207 0.779670\nmin -2.080483 -1.037949\n25% -1.190079 0.396441\n50% -0.947241 1.032518\n75% -0.542512 1.583612\nmax 1.562080 3.298700",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alcohol</th>\n <th>sugar</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>638.000000</td>\n <td>638.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>-0.802630</td>\n <td>1.065341</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.561207</td>\n <td>0.779670</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-2.080483</td>\n <td>-1.037949</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-1.190079</td>\n <td>0.396441</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>-0.947241</td>\n <td>1.032518</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>-0.542512</td>\n <td>1.583612</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.562080</td>\n <td>3.298700</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"output_type": "execute_result",
"execution_count": 25
}
],
"source": [
"Cluster2_test.describe()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n",
"is_executing": false
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit f99dcc1

Please sign in to comment.