# data_mining.py
# import test
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import calendar
from sklearn.preprocessing import LabelEncoder
def printModelPerformance(dict_obj):
    # Display the evaluation metrics of a trained classifier in the Streamlit app
    # st.dataframe(pd.DataFrame.from_records([dict_obj]))
    st.write("Accuracy on test set: {:.3f}".format(dict_obj['score']))
    st.write('AUC: %.2f' % dict_obj['auc'])
    st.write('Majority classifier Confusion Matrix:', dict_obj['confusion_majority'])
    st.write('Majority TN = ', dict_obj['confusion_majority'][0][0])
    st.write('Majority FP = ', dict_obj['confusion_majority'][0][1])
    st.write('Majority FN = ', dict_obj['confusion_majority'][1][0])
    st.write('Majority TP = ', dict_obj['confusion_majority'][1][1])
    statistic = {}
    statistic['Precision'] = dict_obj['precision_score']
    statistic['Recall'] = dict_obj['recall_score']
    statistic['F1'] = dict_obj['f1_score']
    statistic['Accuracy'] = dict_obj['accuracy_score']
    statistic_df = pd.Series(statistic.values(), index=statistic.keys())
    statistic_df.plot(kind='bar')
    st.pyplot()
# Initialize the required data
df_main = pd.read_csv("Laundry_Data.csv")
reverse_result = pd.read_csv("reverse_result.csv")
reverse_result = reverse_result.drop('Unnamed: 0', axis=1)
final_result = reverse_result.copy()

from collections import defaultdict
d = defaultdict(LabelEncoder)
# Encode each categorical variable
final_result = final_result.apply(lambda x: d[x.name].fit_transform(x))
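# Note: d now holds one fitted LabelEncoder per column (keyed by column name), so the
# same encoders can transform the user input later in the prediction section and can be
# reversed with inverse_transform if the original labels are needed.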
# Start of the app body
st.title('Laundry Data Mining (Project)')
st.header('Laundry Dataset Overview')
st.dataframe(df_main.head())

st.header('Data Visualization')
st.subheader("Number of null values")
if st.checkbox("After data preprocessing:"):
    reverse_result.isnull().sum().plot(kind='bar', figsize=(10, 5))
    st.pyplot()
    st.subheader("Sample dataset after preprocessing:")
    st.dataframe(reverse_result.head())
else:
    df_main.isnull().sum().plot(kind='bar', figsize=(10, 5))
    st.pyplot()
st.subheader("More visualization on cleaned data")
columnList = []
visualize_df = reverse_result.copy()
visualize_df['count'] = visualize_df.index
for column in visualize_df:
if column=='Date' or column=='Time' or column=='count' or column=='No':
continue
else:
columnList.append(column)
selected_count_column = st.selectbox("Choose the column that you wish to see the distribution",columnList)
visualize_df[['count',selected_count_column]].groupby(selected_count_column).agg('count').plot(kind="bar",figsize=(10,5))
st.pyplot()
st.text("Choose two columns to stack together for better visualization")
stacked_column1 = st.selectbox("Choose column 1",columnList)
stacked_column2 = st.selectbox("Choose column 2",columnList)
if stacked_column1!=stacked_column2:
temp = visualize_df.groupby([stacked_column1, stacked_column2])[stacked_column2].count().unstack(stacked_column2)
temp.plot(kind='barh', stacked=True)
st.pyplot()
else:
st.warning("Column 1 and column 2 cannot be same.")
# Classification
st.header('Classification & Prediction')
st.subheader("The target y value: basket_size")
st.text("The column 'shirt_type' will be dropped as it scores the lowest in the \nBoruta algorithm, which indicates that it has little impact on the target variable.")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

boruta_df = final_result.copy()
y = boruta_df['Basket_Size']
X = boruta_df.drop(['Date', 'Time', 'Basket_Size', 'shirt_type'], axis=1)  # Boruta (run in Jupyter) already showed this feature has no impact
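# For reference: Boruta is a wrapper feature-selection method built around a random
# forest; the ranking mentioned above was computed offline in a notebook, not in this app.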
# Choose whether to oversample the data with SMOTE
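# Note: SMOTE synthesizes new minority-class samples by interpolating between existing
# minority samples and their k nearest neighbours, which balances the class counts
# before the train/test split below.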
if st.checkbox("With SMOTE to oversample the data: "):
from imblearn import over_sampling
smt = over_sampling.SMOTE(sampling_strategy="minority", random_state=10, k_neighbors=5)
X_res, y_res = smt.fit_resample(X, y)
else:
X_res = X
y_res = y
st.text("Bar chart below shows the count of target value y(basket_size) in dataset")
y_res.value_counts().plot(kind='bar',figsize=(10,5),xlabel="Basket_Size (0 is big, 1 is small)",ylabel="count")
st.pyplot()
st.subheader("Decision Tree Classifier")
#Decision Tree Classifier
dt_performance = {}
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.4, random_state=10)
dt=None
if st.checkbox("Tuned the parameters",key="tune_param_1"):
criterion = st.selectbox("Choose criterion: ",['gini', 'entropy'])
max_depth = st.selectbox("Choose max depth: ",[2,4,6,8,10,12])
dt = DecisionTreeClassifier(criterion=criterion,random_state=10,max_depth=max_depth)
else:
dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
prob_DTC = dt.predict_proba(X_test)
prob_DTC = prob_DTC[:, 1]
auc_DT= roc_auc_score(y_test, prob_DTC)
confusion_majority=confusion_matrix(y_test, y_pred)
dt_performance['auc'] = auc_DT
dt_performance['score'] = dt.score(X_test, y_test)
dt_performance['confusion_majority'] = confusion_majority
dt_performance['precision_score'] = precision_score(y_test, y_pred)
dt_performance['recall_score'] = recall_score(y_test, y_pred)
dt_performance['f1_score'] = f1_score(y_test, y_pred)
dt_performance['accuracy_score'] = accuracy_score(y_test, y_pred)
printModelPerformance(dt_performance)
st.subheader("Random Forest Classifier")
#Random forest
rf=None
if st.checkbox("Tuned the parameters",key="tune_param_2"):
max_depth = st.selectbox("Choose max_depth:",[80, 90, 100, 110])
max_features = st.selectbox("Choose max_features: ",[2, 3])
min_samples_leaf = st.selectbox("Choose min_samples_leaf: ",[3, 4, 5])
min_samples_split = st.selectbox("Choose min_samples_split: ",[8, 10, 12])
n_estimators = st.selectbox("Choose n_estimators: ",[100, 200, 300, 1000])
rf = RandomForestClassifier(max_depth=max_depth,max_features=max_features,min_samples_leaf=min_samples_leaf,min_samples_split=min_samples_split,n_estimators=n_estimators)
else:
rf = RandomForestClassifier(random_state=10)
rf_performance = {}
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.4, random_state=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
prob_RFC = rf.predict_proba(X_test)
prob_RFC = prob_RFC[:, 1]
auc_RF= roc_auc_score(y_test, prob_RFC)
confusion_majority=confusion_matrix(y_test, y_pred)
rf_performance['score'] = rf.score(X_test, y_test)
rf_performance['auc'] = auc_RF
rf_performance['confusion_majority'] = confusion_majority
rf_performance['precision_score'] = precision_score(y_test, y_pred)
rf_performance['recall_score'] = recall_score(y_test, y_pred)
rf_performance['f1_score'] = f1_score(y_test, y_pred)
rf_performance['accuracy_score'] = accuracy_score(y_test, y_pred)
printModelPerformance(rf_performance)
st.subheader("Support Vector Machine Classifier")
#SVM
svm_performance = {}
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.4, random_state=10)
svm=None
if st.checkbox("Tuned the parameters",key="tune_param_3"):
kernel = st.selectbox("Choose kernel: ",['rbf','sigmoid','linear'])
C = st.selectbox("Choose C:",[0.001, 0.01, 0.1, 1, 10])
gamma = st.selectbox("Choose gamma: ",[0.001, 0.01, 0.1, 1])
svm = SVC(kernel=kernel,C=C,gamma = gamma,probability=True)
else:
svm = SVC(kernel="linear",gamma="auto", probability=True)
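# Note: probability=True is required so that SVC exposes predict_proba, which is
# needed below to compute the ROC curve and AUC.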
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
prob_SVM = svm.predict_proba(X_test)
prob_SVM = prob_SVM[:, 1]
auc_SVM = roc_auc_score(y_test, prob_SVM)
confusion_majority = confusion_matrix(y_test, y_pred)
svm_performance['auc'] = auc_SVM
svm_performance['score'] = svm.score(X_test, y_test)
svm_performance['confusion_majority'] = confusion_majority
svm_performance['precision_score'] = precision_score(y_test, y_pred)
svm_performance['recall_score'] = recall_score(y_test, y_pred)
svm_performance['f1_score'] = f1_score(y_test, y_pred)
svm_performance['accuracy_score'] = accuracy_score(y_test, y_pred)
printModelPerformance(svm_performance)
st.subheader("ROC Curve to show the performance of each classifier")
# Plot the ROC curve of each classifier
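# Note: curves closer to the top-left corner indicate better classifiers; the dashed
# diagonal drawn below is the random-guess baseline.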
fpr_DT, tpr_DT, thresholds_DT = roc_curve(y_test, prob_DTC)
fpr_RFC, tpr_RFC, thresholds_RFC = roc_curve(y_test, prob_RFC)
fpr_SVM, tpr_SVM, thresholds_SVM = roc_curve(y_test, prob_SVM)
plt.plot(fpr_DT, tpr_DT, color='orange', label='DT')
plt.plot(fpr_RFC, tpr_RFC, color='blue', label='RF')
plt.plot(fpr_SVM, tpr_SVM, color='red', label='SVM')
plt.plot([0, 1], [0, 1], color='green', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
st.pyplot()
# Map display names to the fitted models for the prediction section
model_list = {
    "Support Vector Machine": svm,
    "Decision Tree": dt,
    "Random Forest": rf
}
st.header("Predict the Busket_Size by customers characteristic using model created above.")
modelInput = st.radio("Choose model to predict.",tuple(model_list.keys()))
modelSelected = model_list[modelInput]
inputList = {}
except_list = ['Day','Time_Range','Basket_Size','Time','Date','shirt_type']
inputList['Date'] = st.date_input("Input the date: ")
inputList['Time'] = st.time_input("Input the time: ")
# inputList['Age_Range'] = st.slider("Input the Age_Range: ",1,100)
for column in reverse_result:
if column in except_list:
continue
inputList[column] = st.selectbox("Input the "+str(column) + ": ",reverse_result[column].unique())
if st.button("Predict"):
    inputList['Day'] = calendar.day_name[inputList['Date'].weekday()]
    inputList['Time_Range'] = inputList['Time'].hour
    inputList['Date'] = df_main['Date'].iloc[0]  # placeholder; this column is not used by the model
    inputList['Time'] = 21                       # placeholder; this column is not used by the model
    new_df = pd.DataFrame(inputList, index=[0])
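    # Note: pd.cut below maps the raw hour into the same Time_Range categories used
    # during preprocessing (0-7 midnight, 8-12 morning, 13-19 afternoon, 20-24 night).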
    new_df['Time_Range'] = pd.cut(new_df['Time_Range'], bins=[-1, 7, 12, 19, 24],
                                  labels=['midnight', 'morning', 'afternoon', 'night'])
    # Encode the user input with the same LabelEncoders fitted earlier, then drop unused columns
    transformed_bck = new_df.apply(lambda x: d[x.name].transform(x))
    transformed_bck = transformed_bck.drop(['Time', 'Date'], axis=1)
    predict_result = modelSelected.predict(transformed_bck)
    basket = None
    if predict_result[0] == 0:
        basket = "Big Basket"
    else:
        basket = "Small Basket"
    st.success("Prediction complete. This customer is most likely to bring a " + str(basket))