-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconsolidate_nn_data_clean.py
239 lines (199 loc) · 8.16 KB
/
consolidate_nn_data_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
"""
Instructions:
- set BASE_DIR at the top
- Place me in the `assignment3` directory of chadcode
- Change the ds#_names and component/cluster counts in __main__
Output:
6 files of summarized data
author: Geoff Von Allen
"""
from collections import namedtuple
import pandas as pd
BASE_DIR = 'output'
AlgoDS = namedtuple('AlgoDS', 'folder param_name dim_val')
benchmark = AlgoDS('benchmark', '', '')
def get_non_cluster_results(algo, ds_name, bench=None):
suffix = 'dim_red.csv'
# Assignment 1 Architecture
ass1 = pd.read_csv(f'{BASE_DIR}/{algo.folder}/{ds_name}_ass1_{suffix}').T
ass1 = ass1.rename(columns={0: f'ass1_{algo.folder}_dr'})
# Sort chadcode NN grid search by rank
dr_results = pd.read_csv(f'{BASE_DIR}/{algo.folder}/{ds_name}_{suffix}')
sorted_ = dr_results.sort_values(by=['rank_test_score'])
# Best overall - any dim reduction or architecture
overall = sorted_.T.iloc[:,0]
overall.name = f'{algo.folder}_overall_best'
# Get best overall using current algo n_components param
overall_n = sorted_[sorted_[algo.param_name] == algo.dim_val]
if overall_n.empty:
# Dim value was never tested
print(f"\tDim param {algo.param_name} with value {algo.dim_val}"\
f" not tested for {algo.folder}")
return ass1
overall_n = overall_n.T.iloc[:,0]
overall_n.name = f'{algo.folder}_overall_best_n={algo.dim_val}'
# Get best using benchmark chadcode architecture
alpha_param = 'param_NN__alpha'
nn_arch = 'param_NN__hidden_layer_sizes'
# filter for algo specific dim and benchmark nn params
# Already sorted so best in first position
filter_ = \
(sorted_[algo.param_name] == algo.dim_val) & \
(sorted_[alpha_param] == bench[alpha_param]) & \
(sorted_[nn_arch] == bench[nn_arch])
best_bench = sorted_[filter_].T.iloc[:,0]
best_bench.name = f'{algo.folder}_best_benchmark'
final = pd.concat([ass1, best_bench, overall_n, overall], axis=1, sort=True)
return final
def get_benchmark_results(algo, ds_name):
# Assignment 1 Architecture
r = pd.read_csv(f'{BASE_DIR}/{algo.folder}/{ds_name}_ass1_nn_bmk.csv').T
r = r.rename(columns={0: f'ass1_benchmark'})
# Best 'New' Architecture on non reduced dataset
new_best = pd.read_csv(f'{BASE_DIR}/{algo.folder}/{ds_name}_nn_bmk.csv')
new_best = new_best.sort_values(by=['rank_test_score'])
new_best = new_best.T.iloc[:,0]
new_best.name = 'new_benchmark_no_dr'
final = pd.concat([r, new_best], axis=1, sort=True)
return final
def get_cluster_results(algo, cluster_type, ds_name, bench=None):
# Sort chadcode NN grid search by rank
c_results = pd.read_csv(f'{BASE_DIR}/{algo.folder}/clustering/{ds_name}_cluster_{cluster_type}.csv')
sorted_ = c_results.sort_values(by=['rank_test_score'])
# Get best overall using any num clusters or architecrures
overall = sorted_.T.iloc[:,0]
overall.name = f'{algo.folder}_overall_best'
# Get best overall with selected number of clusters among all nn arch
overall_k = sorted_[sorted_[algo.param_name] == algo.dim_val]
if overall_k.empty:
# Dim value was never tested
print(f"\tDim param {algo.param_name} with value {algo.dim_val}"\
f" not tested for {algo.folder}")
return overall
overall_k = overall_k.T.iloc[:,0]
overall_k.name = f'{algo.folder}_overall_best_k={algo.dim_val}'
# Get best using benchmark chadcode architecture
alpha_param = 'param_NN__alpha'
nn_arch = 'param_NN__hidden_layer_sizes'
# filter for algo specific dim and benchmark nn params
# Already sorted so best in first position
filter_ = \
(sorted_[algo.param_name] == algo.dim_val) & \
(sorted_[alpha_param] == bench[alpha_param]) & \
(sorted_[nn_arch] == bench[nn_arch])
best_bench = sorted_[filter_].T.iloc[:,0]
best_bench.name = f'{algo.folder}_best_benchmark'
final = pd.concat([best_bench, overall_k, overall], axis=1, sort=True)
return final
def run_non_clustered_nn(ds1, ds2):
print("{:-^50}".format(" Running Non Clustering "))
dss = [ds1, ds2]
for ds in dss:
print(f"Running Dataset: {ds['name']}")
result = get_benchmark_results(benchmark, ds['name'])
bench = result['new_benchmark_no_dr'].T
for algo in ds['algos']:
print(f"\tFetching: {algo.folder}")
r = get_non_cluster_results(algo, ds['name'], bench)
result = pd.concat([result, r], axis=1, sort=True)
result.to_csv(f'non_cluster_results_{ds["name"]}.csv')
def run_clustered_nn(*args):
print("{:-^50}".format(" Running Clustering "))
gmms = [x for x in args if x['cluster'] == 'GMM']
kms = [x for x in args if x['cluster'] == 'kmeans']
for ds in gmms:
print(f"Running Dataset: {ds['name']} for cluster {ds['cluster']}")
result = get_benchmark_results(benchmark, ds['name'])
bench = result['new_benchmark_no_dr'].T
result = result['new_benchmark_no_dr']
for algo in ds['algos']:
print(f"\tFetching: {algo.folder} for {ds['cluster']}")
r = get_cluster_results(algo, ds['cluster'], ds['name'], bench)
result = pd.concat([result, r], axis=1, sort=True)
result.to_csv(f'cluster_results_{ds["name"]}_{ds["cluster"]}.csv')
for ds in kms:
print(f"Running Dataset: {ds['name']} for cluster {ds['cluster']}")
result = get_benchmark_results(benchmark, ds['name'])
bench = result['new_benchmark_no_dr'].T
result = result['new_benchmark_no_dr']
for algo in ds['algos']:
print(f"\tFetching: {algo.folder} for {ds['cluster']}")
r = get_cluster_results(algo, ds['cluster'], ds['name'], bench)
result = pd.concat([result, r], axis=1, sort=True)
result.to_csv(f'cluster_results_{ds["name"]}_{ds["cluster"]}.csv')
if __name__ == '__main__':
# Non Clustering
# NOTE: Change to your ds names that are used in filenames
ds1_name = 'pendigits'
ds2_name = 'statlog_vehicle'
# NOTE: Change last field to your components for each DR
ds1_nc = {
'name': ds1_name,
'algos': [
AlgoDS('ICA', 'param_ica__n_components', 7),
AlgoDS('PCA', 'param_pca__n_components', 3),
AlgoDS('RP', 'param_rp__n_components', 7),
AlgoDS('RF', 'param_filter__n', 2),
]
}
ds2_nc = {
'name': ds2_name,
'algos': [
AlgoDS('ICA', 'param_ica__n_components', 4),
AlgoDS('PCA', 'param_pca__n_components', 2),
AlgoDS('RP', 'param_rp__n_components', 7),
AlgoDS('RF', 'param_filter__n', 1),
]
}
run_non_clustered_nn(ds1_nc, ds2_nc)
# Clustering
param_gmm = 'param_gmm__n_components'
param_km = 'param_km__n_clusters'
gmm = 'GMM'
km = 'kmeans'
# NOTE: Change last field to your selected cluster count
ds1_gmm = {
'name': ds1_name,
'cluster': gmm,
'algos': [
AlgoDS('benchmark', param_gmm, 15),
AlgoDS('ICA', param_gmm, 15),
AlgoDS('PCA', param_gmm, 15),
AlgoDS('RP', param_gmm, 15),
AlgoDS('RF', param_gmm, 15),
]
}
ds2_gmm = {
'name': ds2_name,
'cluster': gmm,
'algos': [
AlgoDS('benchmark', param_gmm, 4),
AlgoDS('ICA', param_gmm, 4),
AlgoDS('PCA', param_gmm, 4),
AlgoDS('RP', param_gmm, 4),
AlgoDS('RF', param_gmm, 4),
]
}
ds1_km = {
'name': ds1_name,
'cluster': km,
'algos': [
AlgoDS('benchmark', param_km, 10),
AlgoDS('ICA', param_km, 10),
AlgoDS('PCA', param_km, 10),
AlgoDS('RP', param_km, 10),
AlgoDS('RF', param_km, 10),
]
}
ds2_km = {
'name': ds2_name,
'cluster': km,
'algos': [
AlgoDS('benchmark', param_km, 4),
AlgoDS('ICA', param_km, 4),
AlgoDS('PCA', param_km, 4),
AlgoDS('RP', param_km, 4),
AlgoDS('RF', param_km, 4),
]
}
run_clustered_nn(ds1_gmm, ds2_gmm, ds1_km, ds2_km)