-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
David Arroyo Menéndez
committed
May 15, 2019
1 parent
97f5dc7
commit bebf534
Showing
1 changed file
with
92 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright (C) 2018 David Arroyo Menéndez | ||
|
||
# Author: David Arroyo Menéndez <[email protected]> | ||
# Maintainer: David Arroyo Menéndez <[email protected]> | ||
|
||
# This file is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 3, or (at your option) | ||
# any later version. | ||
|
||
# This file is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
|
||
# You should have received a copy of the GNU General Public License | ||
# along with GNU Emacs; see the file COPYING. If not, write to | ||
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | ||
# Boston, MA 02110-1301 USA, | ||
|
||
import pandas as pd | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
from sklearn.preprocessing import StandardScaler | ||
import matplotlib.pyplot as plt | ||
from matplotlib import style | ||
from sklearn.model_selection import train_test_split | ||
style.use('fivethirtyeight') | ||
from sklearn.neighbors import KNeighborsClassifier | ||
# 0. Load the wine data and separate the descriptive features from the class label.
#    Column 0 of the CSV is the class (1..3); the rest are chemical measurements.
column_names = ['target', 'Alcohol', 'Malic_acid', 'Ash', 'Akcakinity', 'Magnesium',
                'Total_pheonols', 'Flavanoids', 'Nonflavanoids', 'Proanthocyanins',
                'Color_intensity', 'Hue', 'OD280', 'Proline']
df = pd.read_csv('data/Wine.txt', sep=',', names=column_names)
X = df.iloc[:, 1:].copy()
target = df['target'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=0)

# 1. Standardize the training data: each feature column is rescaled to
#    zero mean / unit variance (fit independently per column).
for column in X_train.columns:
    scaled_column = StandardScaler().fit_transform(X_train[column].values.reshape(-1, 1))
    X_train[column] = scaled_column
# 2. Compute the overall mean vector mu and the per-class mean vectors mu_k.
#    mu has shape (13, 1): one entry per feature column of X_train.
mu = np.mean(X_train,axis=0).values.reshape(13,1) # Mean vector mu --> Since the data has been standardized, the data means are zero
mu_k = []
for i,orchid in enumerate(np.unique(df['target'])):
    # X_train.where(...) relies on pandas aligning df['target'] to X_train's
    # (shuffled) index: rows of other classes become NaN and are skipped by mean.
    # NOTE(review): mixes the full-frame label column with the train split —
    # works only because train_test_split preserves the original index; confirm.
    mu_k.append(np.mean(X_train.where(df['target']==orchid),axis=0))
# Stack class means as columns: mu_k has shape (13, n_classes).
mu_k = np.array(mu_k).T

# 3. Compute the within-class (SW) and between-class (SB) scatter matrices.
data_SW = []
Nc = []
for i,orchid in enumerate(np.unique(df['target'])):
    # Deviations of this class's training rows from its class mean: (n_c, 13).
    a = np.array(X_train.where(df['target']==orchid).dropna().values-mu_k[:,i].reshape(1,13))
    data_SW.append(np.dot(a.T,a))
    # NOTE(review): class counts are taken over the FULL dataset, while the
    # class means come from the training split only — verify this is intended.
    Nc.append(np.sum(df['target']==orchid))
# SW: sum of per-class scatter matrices, shape (13, 13).
SW = np.sum(data_SW,axis=0)
# SB: class-count-weighted outer products of (mu_k - mu), shape (13, 13).
SB = np.dot(Nc*np.array(mu_k-mu),np.array(mu_k-mu).T)
|
||
# 4. Eigendecomposition of SW^-1 SB — the LDA discriminant directions.
eigval, eigvec = np.linalg.eig(np.dot(np.linalg.inv(SW), SB))

# 5. Keep the eigenvectors belonging to the two largest eigenvalues
#    (by absolute value). A stable sort preserves the original order of
#    any tied eigenvalues, matching a stable reverse sort on |eigval|.
descending = np.argsort(-np.abs(eigval), kind='stable')
w = eigvec[:, descending[:2]].real  # projection matrix, shape (13, 2)

# 6. Project the standardized training data onto the new axes: Y = X * w.
Y = X_train.dot(w)
# Plot the 2-D LDA projection, one color/marker per class.
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
ax0.set_xlim(-3,3)
ax0.set_ylim(-4,3)
for l, c, m in zip(np.unique(y_train), ['r','g','b'], ['s','x','o']):
    ax0.scatter(Y[0][y_train==l],
                Y[1][y_train==l],
                c=c, marker=m, label=l, edgecolors='black')
ax0.legend(loc='upper right')

# Plot the per-class means (black markers); they also seed the Voronoi
# regions drawn below.
# Fix: the loop variable was named `target`, shadowing the module-level
# `target` Series; renamed to `label`. The class mean was also recomputed
# three times per iteration — now computed once and reused.
means = []
for m, label in zip(['s','x','o'], np.unique(y_train)):
    class_mean = np.mean(Y[y_train==label], axis=0)
    means.append(class_mean)
    ax0.scatter(class_mean[0], class_mean[1], marker=m, c='black', s=100)
|
||
# Color the plane by the nearest class mean: a 1-NN classifier over the
# three means partitions the plot into the means' Voronoi regions.
# Fix: the original built the mesh with a hand-rolled double loop that also
# created a dead throwaway variable (`date`) on every inner iteration;
# np.column_stack over the raveled grids produces the identical (row-major)
# list of (x, y) points in one vectorized step.
mesh_x, mesh_y = np.meshgrid(np.linspace(-3,3), np.linspace(-4,3))
mesh = np.column_stack((mesh_x.ravel(), mesh_y.ravel()))
NN = KNeighborsClassifier(n_neighbors=1)
# Labels double as matplotlib colors, matching the class colors above.
NN.fit(means, ['r','g','b'])
predictions = NN.predict(mesh)
ax0.scatter(mesh[:,0], mesh[:,1], color=predictions, alpha=0.3)
plt.show()