margin_maximization_lmc.py
#Code for the paper "Margin Maximization as Lossless Maximal Compression"
#by N. Nikolaou, H. Reeve & G. Brown.
#Submitted to MLJ, Special Issue on ECML-PKDD 2020, on Nov 8, 2019.
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import zero_one_loss
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy.io as sio
n_runs = 100 #number of runs; each run uses a different train/test split
n_est = 100 #(maximum) number of boosting rounds
bin_count = 100 #every feature will be discretized into bin_count equal-width bins
score_F_bins = 100 #scores (ensemble model outputs) will be discretized into score_F_bins equal-width bins
#Gradient boosting hyperparameters: loss function ('exponential' recovers AdaBoost), number of boosting rounds, maximum tree depth, learning rate (1 means no shrinkage) and subsample fraction:
params = {'loss': 'exponential', 'n_estimators': n_est, 'max_depth': 6, 'learning_rate': 1, 'subsample': 1}
###############################################################################
#Load one of these datasets (or several, in which case complete the commented-out loop below):
#dataset_list = ["survival", "ionosphere", "congress_orig", "liver", "pima",
#                "parkinsons", "landsatM", "krvskp", "heart_orig", "wdbc_orig",
#                "german_credit", "sonar_orig", "semeion_orig", "spliceM",
#                "waveformM", "spambase", "mushroom", "musk2"]
dataset_name = ["heart_orig"]
#survival, congress_orig, landsatM are already discrete; no further discretization is needed unless binning to fewer than 10 values
#survival, pima, liver, spambase are noisy for bin_count < 100; not used in the experiments shown in the paper (which cover the noiseless case) but included here for future work
##for dat_num in range(len(dataset_list)):
##    data_path = os.path.join(os.getcwd(), 'Datasets', dataset_list[dat_num]+'.mat')
##    ...
data_path = os.path.join(os.getcwd(), 'Datasets', dataset_name[0]+'.mat')
mat_contents = sio.loadmat(data_path)
X_undisc = mat_contents['data']
y = np.asarray([float(i) for i in mat_contents['labels'].ravel()]) #loadmat loads the labels as a list of uints which we turn into an array of floats
y[y!=1] = 0 #binarize multiclass datasets using 0/1 encoding
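#(Added note: the line above maps class 1 to the positive class (1) and every other class to 0,
#i.e. a one-vs-rest binarization; the -1/+1 encoding needed for margins is derived later, inside the main loop.)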
###############################################################################
#OR make an artificial dataset using scikit-learn's make_classification():
#from sklearn.datasets import make_classification
#X_undisc, y = make_classification(n_samples=2000, n_features=20, n_informative=2, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01)#weights=[0.1,0.9]
###############################################################################
#Discretize features:
num_features = X_undisc.shape[1]
X = np.zeros(X_undisc.shape)
for i in range(num_features):
    if np.max(np.absolute(X_undisc[:,i]))==0: #to avoid division by 0 when entire column is 0
        X[:,i] = X_undisc[:,i]
    else:
        X[:,i] = X_undisc[:,i] / np.max(np.absolute(X_undisc[:,i]))
    X_i_binned = np.histogram(X_undisc[:,i], bins=bin_count)
    X[:,i] = np.fmin(np.digitize(X_undisc[:,i], X_i_binned[1]), bin_count)
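#Optional sanity check (not part of the original script; assumes no NaNs in the data): after the loop
#above every feature column should only contain integer bin indices in {1, ..., bin_count}:
#assert np.all((X >= 1) & (X <= bin_count)), "discretization produced out-of-range bin indices"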
#Variables that will store quantities of interest:
margin = np.zeros((n_runs,n_est)) #average margin of model on training set
margin_hist = np.zeros((n_runs,n_est,int(X_undisc.shape[0]*0.5))) #margins of model on all training examples
MI_FY = np.zeros((n_runs,n_est)) #I(F;Y), i.e. mutual information between score produced by model on training dataset (score_F_i_disc) & target label of training examples (y_train)
MI_XF = np.zeros((n_runs,n_est)) #I(X;F), i.e. mutual information between score produced by model on training dataset (score_F_i_disc) & (joint) feature vector of training examples (X_train)
error_tr = np.zeros((n_runs,n_est)) #training error of model
error_te = np.zeros((n_runs,n_est)) #test error of model
entropy_F = np.zeros((n_runs,n_est)) #H(F), i.e. entropy of scores produced by model on training dataset (score_F_i_disc)
MI_XY = np.zeros(n_runs) #I(X;Y), i.e. mutual information between (joint) feature vector of training examples (X_train) & target label of training examples (y_train)
entropy_Y = np.zeros(n_runs) #H(Y), i.e. entropy of target label of training examples (y_train)
entropy_X = np.zeros(n_runs) #H(X), i.e. entropy of (joint) feature vectors in dataset (X_train)
max_score = 0 # will store maximal score assigned by model
min_score = 0 # will store minimal score assigned by model
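#(Added note: max_score and min_score accumulate over all runs and boosting rounds, so the margin
#normalization inside the loop below uses a global score range rather than a per-round one.)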
#On each run perform a train/test split, train ensemble model and store all quantities of interest:
for run in range(n_runs):
    #Perform train/test split:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    y_train_sign = 2*y_train-1 #switch to -1/1 encoding
    #Form a joint r.v. out of all the features (all features have been discretized to bin_count values each):
    X_train_joint = np.zeros(X_train[:,0].shape)
    for i in range(num_features):
        X_train_joint = X_train_joint + (X_train[:,i]-1)*((bin_count)**i)
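    #(Added note, an alternative not from the original code: the base-bin_count positional encoding
    #above grows as bin_count**num_features and can lose precision in float64 when there are many
    #features; an equivalent labelling for the mutual-information estimates is to index unique rows directly:
    #_, X_train_joint = np.unique(X_train, axis=0, return_inverse=True)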
    #Build ensemble:
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)
    #Calculate scores and predictions for each boosting round (i.e. 'staged'):
    score_F_bin_list = np.linspace(0, 1.0, num=score_F_bins+1)[1:]
    #for i, score_F_i in enumerate(clf.staged_predict_proba(X_train)): #a built-in for probability estimates (i.e. in [0,1]); less suitable here
    for i, score_F_i in enumerate(clf.staged_decision_function(X_train)): #a better built-in for raw (i.e. real-valued) scores (the actual F(x) of boosting divided by the sum of alphas -- basically equals normalized_margin(x)/y)
        #Discretize the real-valued scores for estimating information-theoretic quantities:
        score_F_i_binned = np.histogram(score_F_i[:,0], bins=score_F_bin_list)
        score_F_i_disc = np.fmin(np.digitize(score_F_i[:,0], score_F_i_binned[1]), len(score_F_bin_list))
        #Estimate model-dependent information-theoretic quantities (both model and training dataset vary per run):
        MI_FY[run,i] = mutual_info_score(score_F_i_disc, y_train)
        MI_XF[run,i] = mutual_info_score(score_F_i_disc, X_train_joint)
        entropy_F[run,i] = mutual_info_score(score_F_i_disc, score_F_i_disc) #I(F;F) = H(F)
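        #(Added note: the three quantities above are plug-in estimates on the discretized training
        #sample; sklearn's mutual_info_score uses natural logarithms, so all values are in nats.)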
        #To get normalized margins we must normalize the scores (raw, i.e. real-valued, scores could also be used):
        max_score = max(max_score, np.max(score_F_i_disc)) #largest score assigned by the model so far
        min_score = min(min_score, np.min(score_F_i_disc)) #smallest score assigned by the model so far
        score_range = max_score - min_score
        score_F_i_norm = 2*((score_F_i_disc - min_score)/score_range) - 1 #normalize by the range so scores lie within [-1,1]
        #Compute the model's margins on the training set (both model and training dataset vary per run):
        margin[run,i] = np.nanmean(np.divide((score_F_i_norm).T, y_train_sign))
        margin_hist[run,i,:] = np.divide((score_F_i_norm).T, y_train_sign)
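        #(Added note: since y_train_sign takes values in {-1, +1}, dividing by it equals multiplying
        #by it, so each entry above is the usual margin y*f(x) of the discretized, range-normalized
        #score, lying in [-1, 1].)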
    #Estimate model-independent information-theoretic quantities (training dataset varies per run):
    MI_XY[run] = mutual_info_score(X_train_joint, y_train)
    entropy_Y[run] = mutual_info_score(y_train, y_train) #I(Y;Y) = H(Y)
    entropy_X[run] = mutual_info_score(X_train_joint, X_train_joint) #I(X;X) = H(X)
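    #(Added note: I(X;F) <= H(X) and I(F;Y) <= H(Y), so the entropy-normalized coordinates used in
    #the information-plane plots further below always lie in [0, 1].)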
    #Compute the model's per-round predictions and errors on the training and test sets (both model and train/test split vary per run):
    for i, pred_y_train in enumerate(clf.staged_predict(X_train)):
        error_tr[run,i] = zero_one_loss(pred_y_train, y_train)
    for i, pred_y_test in enumerate(clf.staged_predict(X_test)):
        error_te[run,i] = zero_one_loss(pred_y_test, y_test)
#############################
#Compute mean and standard deviation of quantities of interest:
margin_mean = np.nanmean(margin, axis=0)
margin_hist_mean = np.nanmean(margin_hist, axis=0)
MI_XF_mean = np.nanmean(MI_XF, axis=0)
MI_FY_mean = np.nanmean(MI_FY, axis=0)
avg_pairwise_distance_MI = (np.abs(MI_XF_mean-MI_FY_mean) + np.abs(MI_XF_mean-np.tile(np.nanmean(entropy_Y),(n_est,))) + np.abs(MI_FY_mean-np.tile(np.nanmean(entropy_Y),(n_est,))))/3 #average pairwise distance among MI_XF_mean, MI_FY_mean and H(Y) (which equals I(X;Y) in the noiseless case); if zero, the LMC point has been reached
entropy_F_mean = np.nanmean(entropy_F, axis=0)
error_tr_mean = np.nanmean(error_tr, axis=0)
error_te_mean = np.nanmean(error_te, axis=0)
margin_std = np.nanstd(margin, axis=0)
margin_hist_std = np.nanstd(margin_hist, axis=0)
MI_XF_std = np.nanstd(MI_XF, axis=0)
MI_FY_std = np.nanstd(MI_FY, axis=0)
#std_pairwise_distance_MI = ... # not used; define if needed
entropy_F_std = np.nanstd(entropy_F, axis=0)
error_tr_std = np.nanstd(error_tr, axis=0)
error_te_std = np.nanstd(error_te, axis=0)
#Find index (round of boosting) at which each quantity of interest --on average-- reaches its minimum/maximum:
index_MI_XF_mean_max = np.where(MI_XF_mean==np.max(MI_XF_mean, axis=0))[0][0]
index_MI_XF_mean_min = np.where(MI_XF_mean==np.min(MI_XF_mean, axis=0))[0][0]
index_MI_FY_mean_max = np.where(MI_FY_mean==np.max(MI_FY_mean, axis=0))[0][0]
index_error_tr_first_min = np.where(error_tr_mean==np.min(error_tr_mean))[0][0]
index_error_te_first_min = np.where(error_te_mean==np.min(error_te_mean))[0][0]
index_entropy_F_first_max = np.where(entropy_F_mean==np.max(entropy_F_mean))[0][0]
#Round reaching LMC point (index at which MI_FY_mean is first maximized while MI_XF_mean being also minimized):
try:
    index_LMC = np.intersect1d(np.where(MI_FY_mean==np.max(MI_FY_mean, axis=0))[0], np.where(MI_XF_mean==np.min(MI_XF_mean, axis=0))[0])[0]
except IndexError: #if no such point is found, use the point closest to it:
    index_LMC = np.where(avg_pairwise_distance_MI==np.min(avg_pairwise_distance_MI))[0][0] #round at which the average pairwise distance of MI_XF_mean, MI_FY_mean and MI_XY is minimal (closest point to LMC)
index_margin_mean_max = np.where(margin_mean==np.max(margin_mean, axis=0))[0][0] #for some datasets (e.g. Landsat, semeion; cf. the per-run version below) remove the trailing [0]
#############################
#Plot results:
##Code below produces continuous shaded equivalents of errorbars (not used in paper; error plots not included):
#def errorfill(x, y, yerr, color=None, alpha_fill=0.3, ax=None, linestyle=None):
#    ax = ax if ax is not None else plt.gca()
#    if color is None:
#        color = next(ax._get_lines.prop_cycler)['color']
#    if np.isscalar(yerr) or len(yerr) == len(y):
#        ymin = y - yerr
#        ymax = y + yerr
#    elif len(yerr) == 2:
#        ymin, ymax = yerr
#    ax.plot(x, y, color=color, linestyle=linestyle)
#    ax.fill_between(x, ymax, ymin, color=color, alpha=alpha_fill)
#
##Plot the evolution of the ensemble's training and test error and IP quantities (not included in paper):
#plt.figure(1)
#errorfill(np.arange(1, n_est+1, 1), error_tr_mean, error_tr_std, color='blue') #show training error (mean +/- standard deviation) #tr_error_plt, = plt.plot(np.arange(1, n_est+1, 1), error_tr_mean, 'b', linewidth=3) #show training error (mean)
#errorfill(np.arange(1, n_est+1, 1), error_te_mean, error_te_std, color='red') #show test error (mean +/- standard deviation) #te_error_plt, =plt.plot(np.arange(1, n_est+1, 1), error_te_mean, 'r', linewidth=3) #show test error (mean)
#plt.axvline(x=index_error_tr_first_min+1, color = 'k', linestyle='--', linewidth=2) #show round of minimal training error
#plt.axvline(x=index_error_te_first_min+1, color = 'm', linestyle='--', linewidth=2) #show round of minimal test error
#plt.axvline(x=index_margin_mean_max+1, color = 'g', linestyle=':', linewidth=2) #show round of margin maximization
#plt.axvline(x=index_LMC+1, color = 'gray', linestyle='-', linewidth=2) #show round of LMC
##plt.axhline(y=np.min(MI_XF_mean, axis=0)/np.nanmean(entropy_X, axis=0), color = 'k') #show minimal value of I_S(F;X) / H_S(X) (maximal compression)
##plt.axvline(x=index_MI_XF_mean_max+1, color = 'y', linestyle='--', linewidth=2) #show round of minimally compressed model
#plt.legend(labels= ["Training Error", "Test Error", "Min. Training Error Round", "Min. Test Error Round", "Max. Margin Round", "LMC Round"], loc=0)
#plt.gca().set_xlim(left=1)
#plt.title("Evolution of the ensemble's average training & test error")
#plt.ylabel("Classification Error")
#plt.xlabel("Boosting Round")
#plt.show()
#Plot the average trajectory of the ensemble in the entropy-normalized information plane, marking the rounds of interest:
plt.figure(2)
x = MI_XF_mean/np.nanmean(entropy_X, axis=0)
y = MI_FY_mean/np.nanmean(entropy_Y)
plt.xlim(0.98*np.nanmean(MI_XY)/np.nanmean(entropy_X, axis=0), 1.02*x.max())
plt.ylim(0.98*y.min(), 1.02*y.max())
plt.plot(x,y)
plt.plot(np.nanmean(MI_XY)/np.nanmean(entropy_X, axis=0), 1, marker='*', markersize=12, markeredgecolor='gray', color="red") #plot LMC point for noiseless datasets
plt.plot(MI_XF_mean[index_error_tr_first_min]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_error_tr_first_min]/np.nanmean(entropy_Y), marker='o', markersize=6, color="black") #plot point of training error minimization (point of achieving losslessness)
plt.plot(MI_XF_mean[index_error_te_first_min]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_error_te_first_min]/np.nanmean(entropy_Y), marker='s', markerfacecolor='none', markersize=14, color="magenta") #plot point of test error minimization (point of maximal generalization)
#plt.plot(MI_XF_mean[index_MI_XF_mean_max]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_MI_XF_mean_max]/np.nanmean(entropy_Y), marker='p', markersize=8, color="yellow") #plot point of minimally compressed model
#plt.plot(MI_XF_mean[index_entropy_F_first_max]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_entropy_F_first_max]/np.nanmean(entropy_Y), marker='o', markersize=8, color="yellow")#plot point with maximal entropy of F (should be same as above...)
#plt.plot(np.nanmean(MI_XY)/np.nanmean(entropy_X, axis=0), np.nanmean(MI_XY, axis=0)/np.nanmean(entropy_Y, axis=0), marker='d', markersize=12, color="red") #plot LMC point for a general dataset (coincides with the first point above, up to rounding)
#plt.plot(MI_XF_mean[index_LMC]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_LMC]/np.nanmean(entropy_Y), marker='o', markersize=8, color="red") #plot the point on the trajectory closest to the LMC point (coincides with the above, up to rounding)
plt.plot(MI_XF_mean[index_margin_mean_max]/np.nanmean(entropy_X, axis=0), MI_FY_mean[index_margin_mean_max]/np.nanmean(entropy_Y), marker='o', markersize=20, fillstyle = 'none', color="green")#plot point of average margin maximization
plt.title("Average trajectory of the ensemble in the normalized information plane")
plt.ylabel("$I_S(F;Y) / H_S(Y)$")
plt.xlabel("$I_S(F;X) / H_S(X)$")
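#To keep the figure one could optionally save it before showing (not part of the original script), e.g.:
#plt.savefig(dataset_name[0] + '_information_plane.png', dpi=300, bbox_inches='tight')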
plt.show()
##Plot the evolution of the ensemble's average margin and IP quantities (not included in paper):
#plt.figure(3)
#errorfill(np.arange(1, n_est+1, 1), margin_mean, margin_std, color='blue') #show average training margin per round (mean +/- standard deviation) #margin_mean_plt, = plt.plot(np.arange(1, n_est+1, 1), margin_mean, 'b', linewidth=3) #show average training margin per round (mean)
#plt.axvline(x=index_error_tr_first_min+1, color = 'k', linestyle='--', linewidth=2) #show round of minimal training error
#plt.axvline(x=index_error_te_first_min+1, color = 'm', linestyle='--', linewidth=2) #show round of minimal test error
#plt.axvline(x=index_margin_mean_max+1, color = 'green', linestyle='-', linewidth=2) #show round of margin maximization
#plt.axvline(x=index_LMC+1, color = 'red', linestyle=':', linewidth=2) #show round of LMC
#plt.legend(labels= ["Average Margin", "$I_S(F;Y) / H_S(Y)$", "$I_S(F;X) / H_S(X)$", "Max. Training Accuracy Round", "Max. Test Accuracy Round", "Max. Compression"], loc=0)
#plt.gca().set_xlim(left=1)
##plt.set_yscale('log')
#plt.title("Evolution of the ensemble's average margin and normalized $I_S(F;Y)$ & $I_S(F;X)$")
##plt.ylabel("Average margin")
#plt.xlabel("Boosting Round")
#plt.show()
#Plot a few individual trajectories (the first k runs), also marking the position in the information plane of the model attaining minimal training error, minimal test error and maximal average margin in each run:
k = 6
fig2 = plt.figure()
fig2.subplots_adjust(hspace=0.3, wspace=0.3)
fig2.suptitle("Some individual trajectories of the ensemble in the normalized information plane", fontsize=12)
for i in range(k):
    ax = fig2.add_subplot(2, 3, i+1)
    x = MI_XF[i,:]/entropy_X[i]
    y = MI_FY[i,:]/entropy_Y[i]
    plt.xlim(0.9*x.min(), 1.1*x.max())
    plt.ylim(0.9*y.min(), 1.1*y.max())
    plt.plot(x,y)
    plt.plot(MI_XY[i]/entropy_X[i], 1, marker='*', markersize=12, markeredgecolor='gray', color="red") #plot LMC point for noiseless datasets
    #Find the indices of the points of interest for this run:
    index_error_tr_first_min_trajectory = np.where(error_tr[i,:]==np.min(error_tr[i,:]))[0][0]
    index_error_te_first_min_trajectory = np.where(error_te[i,:]==np.min(error_te[i,:]))[0][0]
    index_margin_mean_max_trajectory = np.where(margin[i,:]==np.max(margin[i,:]))[0][0] #for Landsat, semeion remove the trailing [0] here...
    plt.plot(MI_XF[i,index_error_tr_first_min_trajectory]/entropy_X[i], MI_FY[i, index_error_tr_first_min_trajectory]/entropy_Y[i], marker='o', markersize=6, color="black") #plot point of training error minimization (point of achieving losslessness)
    plt.plot(MI_XF[i,index_error_te_first_min_trajectory]/entropy_X[i], MI_FY[i, index_error_te_first_min_trajectory]/entropy_Y[i], marker='s', markerfacecolor='none', markersize=14, color="magenta") #plot point of test error minimization (point of maximal generalization)
    plt.plot(MI_XF[i,index_margin_mean_max_trajectory]/entropy_X[i], MI_FY[i, index_margin_mean_max_trajectory]/entropy_Y[i], marker='o', markersize=20, fillstyle='none', color="green") #plot point of average margin maximization
    plt.title("Run "+str(i+1), fontsize=10)
    plt.ylabel("$I_S(F;Y) / H_S(Y)$")
    plt.xlabel("$I_S(F;X) / H_S(X)$")
plt.show()