-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatapreprocess.py
63 lines (47 loc) · 1.81 KB
/
datapreprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
"""DataPreprocess.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1KQiI7hu7Kr26jqEFJFCgj2fLg_j6P-G_
"""
# Commented out IPython magic to ensure Python compatibility.
import idx2numpy
import matplotlib.pyplot as plt
# %matplotlib inline
import numpy as np
import seaborn as sns
from keras.utils import to_categorical
# ---------------------------------------------------------------------------
# Load the four IDX files (train/test images and labels) from /content and
# derive the model-ready arrays from them.
# Shapes quoted below are the ones recorded by the original author.
# ---------------------------------------------------------------------------

# Training images: raw array, then a float32 copy with a trailing channel axis.
fileX = '/content/train-images.idx3-ubyte'
X_train_img = idx2numpy.convert_from_file(fileX)  # (60000, 28, 28)
x_train = np.reshape(X_train_img, (-1, 28, 28, 1)).astype(np.float32)  # (60000, 28, 28, 1)

# Training labels: raw class ids, then their one-hot encoding.
fileY = "/content/train-labels.idx1-ubyte"
Y_train = idx2numpy.convert_from_file(fileY)
y_train = to_categorical(Y_train.astype(np.float32))  # (60000, 10)

# Test images: same treatment as the training images.
fileXt = "/content/t10k-images.idx3-ubyte"
Xt_test_img = idx2numpy.convert_from_file(fileXt)
x_test = np.reshape(Xt_test_img, (-1, 28, 28, 1)).astype(np.float32)  # (10000, 28, 28, 1)

# Test labels: raw class ids, then their one-hot encoding.
fileYt = "/content/t10k-labels.idx1-ubyte"
Y_test = idx2numpy.convert_from_file(fileYt)
y_test = to_categorical(Y_test.astype(np.float32))  # (10000, 10)
# EDA of the training dataset: count the samples per class, plot the class
# distribution as a bar chart, and print each class's share of the data.

# Per-class sample counts, computed in one vectorized pass over the raw
# (non one-hot) labels. Kept as a (10, 1) float array so that length[i][0]
# still yields the count for class i.
length = np.bincount(Y_train, minlength=10).astype('float64').reshape(-1, 1)

# [class, count] pairs and the equivalent {class: count} mapping.
kj = [[digit, count] for digit, count in enumerate(length[:, 0])]
labels = dict(kj)

plt.figure(figsize=(10, 5))
sns.barplot(x=list(labels.keys()), y=list(labels.values()), palette='Set3')
# The classes are on the x axis and their counts on the y axis, so the axis
# captions must be attached accordingly.
plt.xlabel('Label')
plt.ylabel('Count of Samples/Observations')

# The total number of samples is loop-invariant, so compute it once.
total = length.sum()
for i in range(len(length)):
    per = (length[i][0] / total) * 100
    print(f"% of the total data: {i} ==> {per!s} %")
# Data normalization: divide every pixel value by 255 in both image sets
# (presumably mapping 8-bit intensities into [0, 1] -- the source files are
# ubyte IDX files).
x_train = np.divide(x_train, 255)
x_test = np.divide(x_test, 255)