"""Run dissertation evaluation
Script to compute summary statistics and create plots for the dissertation. Should be run after the
experimental pipeline, as this script requires the pipeline's outputs as inputs.
Usage: python -m run_evaluation_dissertation --help
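Example invocation with the default directories (these match the argparse defaults at the bottom of
this script): python -m run_evaluation_dissertation --data data/datasets/ --results data/results/ --plots data/plots/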
"""
import argparse
import ast
import pathlib
import warnings
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import data_handling
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['figure.max_open_warning'] = 0
DEFAULT_COL_PALETTE = 'YlGnBu'
# Main routine: run the complete evaluation pipeline. To that end, read results from the "results_dir"
# and some dataset information from "data_dir". Save plots to the "plot_dir". Print some statistics
# to the console.
def evaluate(data_dir: pathlib.Path, results_dir: pathlib.Path, plot_dir: pathlib.Path) -> None:
if not results_dir.is_dir():
raise FileNotFoundError('The results directory does not exist.')
if not plot_dir.is_dir():
print('The plot directory does not exist. We create it.')
plot_dir.mkdir(parents=True)
if any(plot_dir.glob('*.pdf')):
print('The plot directory is not empty. Files might be overwritten but not deleted.')
results = data_handling.load_results(directory=results_dir)
# Make feature sets proper lists:
results['selected_idxs'] = results['selected_idxs'].apply(ast.literal_eval)
# Sanity check: correct number of features selected
assert ((results['train_objective'].isna() & (results['selected_idxs'].apply(len) == 0)) |
(results['selected_idxs'].apply(len) == results['k'])).all()
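# (A search run either is infeasible, i.e., has no objective value and selects no features,
# or it selects exactly "k" features.)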
# Rename some values:
results['fs_name'] = results['fs_name'].str.removesuffix('Selector').replace(
{'GreedyWrapper': 'Greedy Wrapper', 'ModelImportance': 'Model Gain', 'MRMR': 'mRMR'})
fs_name_plot_order = ['MI', 'FCBF', 'mRMR', 'Model Gain', 'Greedy Wrapper']
results['search_name'] = results['search_name'].replace(
{'search_sequentially': 'seq.', 'search_simultaneously': 'sim.',
'search_greedy_balancing': 'bal.', 'search_greedy_replacement': 'rep.'})
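# Simultaneous search is split by its objective aggregation (min/sum), so it appears as two
# separate search methods in the analyses below: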
results.loc[results['search_name'] == 'sim.', 'search_name'] = (
'sim. (' + results.loc[results['search_name'] == 'sim.', 'objective_agg'] + ')')
results.drop(columns='objective_agg', inplace=True)
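# Fixed hue orders keep category order (and thus colors) consistent across plots: one order for
# the solver-based searches only, one for all search methods (including the heuristics "bal." and "rep."):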
search_name_hue_order_solver = ['sim. (min)', 'sim. (sum)', 'seq.']
search_name_hue_order_all = ['sim. (sum)', 'sim. (min)', 'bal.', 'seq.', 'rep.']
results['optimization_status'].replace({0: 'Optimal', 1: 'Feasible', 2: 'Infeasible',
6: 'Not solved'}, inplace=True)
status_order = ['Infeasible', 'Not solved', 'Feasible', 'Optimal']
# Define columns for main experimental dimensions (corresponding to independent search runs):
group_cols = ['dataset_name', 'split_idx', 'fs_name', 'search_name', 'k', 'tau_abs',
'num_alternatives']
# Define columns for evaluation metrics:
metric_name_mapping = {'train_objective': '$Q_{\\mathrm{train}}$',
'test_objective': '$Q_{\\mathrm{test}}$',
'decision_tree_train_mcc': '$MCC_{\\mathrm{train}}^{\\mathrm{tree}}$',
'decision_tree_test_mcc': '$MCC_{\\mathrm{test}}^{\\mathrm{tree}}$'}
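# ("mcc" refers to the Matthews correlation coefficient of the decision-tree predictions on the
# training and test data, respectively.)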
# Number the alternatives (however, they only have a natural order in sequential search)
results['n_alternative'] = results.groupby(group_cols).cumcount()
print('\n-------- 6.3 Experimental Design --------')
print('\n------ 6.3.3 Methods ------')
print('\n---- 6.3.3.3 Alternatives (Constraints) ----')
print('\n-- Timeout --')
print('\nHow is the optimization status distributed (for solver-based search)?')
print(results.loc[results['search_name'].isin(search_name_hue_order_solver),
'optimization_status'].value_counts(normalize=True).apply('{:.2%}'.format))
print('\n------ 6.3.4 Datasets ------')
print('\n## Table 6.2: Dataset overview ##\n')
dataset_overview = data_handling.load_dataset_overview(directory=data_dir)
dataset_overview = dataset_overview[['dataset', 'n_instances', 'n_features']]
dataset_overview.rename(columns={'dataset': 'Dataset', 'n_instances': '$m$',
'n_features': '$n$'}, inplace=True)
dataset_overview.sort_values(by='Dataset', key=lambda x: x.str.lower(), inplace=True)
print(dataset_overview.style.format(escape='latex').hide(axis='index').to_latex(hrules=True))
print('\n-------- 6.4 Evaluation --------')
print('\n------ 6.4.1 Feature-Selection Methods ------')
plot_results = results[(results['search_name'] == 'seq.') & (results['tau_abs'] == 1) &
(results['n_alternative'] == 0)]
print('\n-- Prediction performance --')
print('\nHow is the test-set prediction performance distributed for different',
'feature-selection methods (for the original feature sets of sequential search)?')
print(plot_results.groupby('fs_name')['decision_tree_test_mcc'].describe().round(2))
# Figure 6.1a: Test-set prediction performance by feature-set size "k" and feature-selection
# method
plt.figure(figsize=(5, 5))
plt.rcParams['font.size'] = 18
sns.boxplot(x='k', y='decision_tree_test_mcc', hue='fs_name', data=plot_results,
palette=DEFAULT_COL_PALETTE, fliersize=1, hue_order=fs_name_plot_order)
plt.xlabel('Feature-set size $k$')
plt.ylabel(metric_name_mapping['decision_tree_test_mcc'])
plt.yticks(np.arange(start=-0.4, stop=1.1, step=0.2))
plt.legend(title=' ', edgecolor='white', loc='upper left', bbox_to_anchor=(-0.15, -0.1),
columnspacing=1, framealpha=0, handletextpad=0.2, ncols=2)
plt.figtext(x=0.06, y=0.11, s='Selection', rotation='vertical')
plt.tight_layout()
plt.savefig(plot_dir / 'afs-impact-fs-method-k-decision-tree-test-mcc.pdf')
print('\nHow is the iteration count of Greedy Wrapper distributed (for the original feature',
'sets of sequential search)?')
print(plot_results['wrapper_iters'].describe().round(2))
print('\nHow is the iteration count of Greedy Wrapper distributed (for all experimental',
'settings)?')
print(results['wrapper_iters'].describe().round(2))
print('\nHow is the optimization status distributed for different feature-selection methods',
'(for all experimental settings of solver-based search)?')
solver_results = results[results['search_name'].isin(search_name_hue_order_solver)]
print(pd.crosstab(solver_results['optimization_status'], solver_results['fs_name'],
normalize='columns').applymap('{:.2%}'.format))
print('\nHow is the optimization status distributed for different feature-selection methods',
'(for the original feature sets of sequential search)?')
print(pd.crosstab(plot_results['optimization_status'], plot_results['fs_name'],
normalize='columns').applymap('{:.2%}'.format))
print('\n-- Influence of feature-set size "k" --')
# Figure 6.1b: Difference in feature-set quality between feature-set sizes "k" by evaluation
# metric and feature-selection method
plot_metrics = ['train_objective', 'test_objective', 'decision_tree_test_mcc']
plot_results = plot_results[['dataset_name', 'split_idx', 'fs_name', 'k'] + plot_metrics].copy()
plot_results = plot_results.pivot(index=['dataset_name', 'split_idx', 'fs_name'], columns='k',
values=plot_metrics).reset_index()
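# For each metric, compute the per-setting quality difference between k=10 and k=5 (positive
# values mean the larger feature sets score higher):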
for metric in plot_metrics:
plot_results[(metric, 'diff')] = plot_results[(metric, 10)] - plot_results[(metric, 5)]
plot_results.drop(columns=[(metric, 10), (metric, 5)], inplace=True)
plot_results = plot_results.droplevel(level='k', axis='columns')
plot_results = plot_results.melt(id_vars='fs_name', value_vars=plot_metrics, var_name='Metric',
value_name='Difference')
plot_results['Metric'].replace(metric_name_mapping, inplace=True)
plt.figure(figsize=(5, 5))
plt.rcParams['font.size'] = 18
sns.boxplot(x='Metric', y='Difference', hue='fs_name', data=plot_results,
palette=DEFAULT_COL_PALETTE, fliersize=1, hue_order=fs_name_plot_order)
plt.ylabel('Difference $k$=10 vs. $k$=5', y=0.45) # moved a bit downwards to fit on plot
plt.ylim(-0.65, 0.65)
plt.yticks(np.arange(start=-0.6, stop=0.7, step=0.2))
plt.legend(title=' ', edgecolor='white', loc='upper left', bbox_to_anchor=(-0.15, -0.1),
columnspacing=1, framealpha=0, handletextpad=0.2, ncols=2)
plt.figtext(x=0.06, y=0.11, s='Selection', rotation='vertical')
plt.tight_layout()
plt.savefig(plot_dir / 'afs-impact-fs-method-k-metric-diff.pdf')
print('\nWhat is the median feature-set-quality difference per experimental setting between',
'k=10 and k=5 for different feature-selection methods and evaluation metrics (for the',
'original feature sets of sequential search)?')
print(plot_results.groupby(['Metric', 'fs_name']).median().round(2))
print('\n------ 6.4.2 Search Methods for Alternatives ------')
comparison_results = results[(results['k'] == 5) &
results['search_name'].isin(['sim. (min)', 'sim. (sum)', 'bal.'])]
for num_alternatives in results.loc[results['search_name'].str.startswith('sim'),
'num_alternatives'].unique():
# Extract first "num_alternatives + 1" feature sets (solver-based sequential search and
# Greedy Replacement are only run for one value of "num_alternatives", but you can get
# results for smaller "a" by subsetting)
seq_results = results[results['search_name'].isin(['seq.', 'rep.']) & (results['k'] == 5) &
(results['n_alternative'] <= num_alternatives)].copy()
seq_results['num_alternatives'] = num_alternatives
comparison_results = pd.concat([comparison_results, seq_results])
# Filter search settings where any search method has any invalid feature set in its search run
# (when we compare search methods over alternatives, all combinations of these two dimensions
# should have the same number of fully feasible search runs, else the mean/variance of
# feature-set quality in search runs may be biased (in particular, sequential search may return
# fewer than the desired number of alternatives, while simultaneous search is all-or-nothing)
# or the comparison over alternatives may be biased
filter_group_cols = [x for x in group_cols if x not in ['search_name', 'num_alternatives']]
valid_results = comparison_results.groupby(filter_group_cols).filter(
lambda x: x['train_objective'].notna().all())
plot_metrics = ['train_objective', 'test_objective', 'decision_tree_test_mcc']
print('\n-- Variance in feature-set quality --')
print('\nWhat is the median standard deviation of feature-set quality within one search run',
'for different feature-selection methods, search methods, numbers of alternatives, and',
'evaluation metrics (for k=5 and 1-5 alternatives)?')
for metric in plot_metrics:
print(valid_results.groupby(group_cols)[metric].std().reset_index().rename(
columns={'num_alternatives': 'a'}).groupby(['fs_name', 'search_name', 'a'])[
metric].median().reset_index().pivot(
index=['fs_name', 'a'], columns='search_name').round(3))
# Figures 6.2a-6.2c: Standard deviation of feature-set quality in search runs by search
# method and number of alternatives "a" (subfigures: evaluation metrics)
plot_results = valid_results[valid_results['fs_name'] == 'Model Gain']
plot_results = plot_results.groupby(group_cols)[metric].std().reset_index()
plt.figure(figsize=(8, 3))
plt.rcParams['font.size'] = 15
sns.boxplot(x='num_alternatives', y=metric, hue='search_name', data=plot_results,
palette=DEFAULT_COL_PALETTE, fliersize=1,
hue_order=search_name_hue_order_all)
plt.xlabel('Number of alternatives $a$')
plt.ylabel(f'$\\sigma$ of {metric_name_mapping[metric]}')
plt.yticks(np.arange(start=0, stop=0.35, step=0.1))
plt.ylim(-0.05, 0.35)
leg = plt.legend(title='Search', edgecolor='white', framealpha=0, loc='upper left',
bbox_to_anchor=(0, -0.1), columnspacing=1, handletextpad=0.3, ncol=5)
leg.get_title().set_position((-262, -21))
plt.tight_layout()
plt.savefig(plot_dir / f'afs-impact-search-stddev-{metric.replace("_", "-")}.pdf')
print('\n-- Average value of feature-set quality --')
print('\nWhat is the median average value of feature-set quality within one search run for',
'different feature-selection methods, search methods, numbers of alternatives, and',
'evaluation metrics (for k=5 and 1-5 alternatives)?')
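# Per-metric y-axis limits and lowest tick for the following plots (MCC can be negative, so its
# axis extends below 0):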
for metric, ylim, min_tick in zip(
plot_metrics, [(-0.05, 1.05), (-0.05, 1.05), (-0.3, 1.05)], [0, 0, -0.2]):
print(valid_results.groupby(group_cols)[metric].mean().reset_index().rename(
columns={'num_alternatives': 'a'}).groupby(['fs_name', 'search_name', 'a'])[
metric].median().reset_index().pivot(
index=['fs_name', 'a'], columns='search_name').round(3))
# Figures 6.3a-6.3c: Average feature-set quality in search runs by search method and number
# of alternatives "a" (subfigures: evaluation metrics)
plot_results = valid_results[valid_results['fs_name'] == 'Model Gain']
plot_results = plot_results.groupby(group_cols)[metric].mean().reset_index()
plt.figure(figsize=(8, 3))
plt.rcParams['font.size'] = 15
sns.boxplot(x='num_alternatives', y=metric, hue='search_name', data=plot_results,
palette=DEFAULT_COL_PALETTE, fliersize=1,
hue_order=search_name_hue_order_all)
plt.xlabel('Number of alternatives $a$')
plt.ylabel(f'Mean of {metric_name_mapping[metric]}')
plt.ylim(ylim)
plt.yticks(np.arange(start=min_tick, stop=ylim[1], step=0.2))
leg = plt.legend(title='Search', edgecolor='white', framealpha=0, loc='upper left',
bbox_to_anchor=(0, -0.1), columnspacing=1, handletextpad=0.3, ncol=5)
leg.get_title().set_position((-262, -21))
plt.tight_layout()
plt.savefig(plot_dir / f'afs-impact-search-mean-{metric.replace("_", "-")}.pdf')
print('\nHow is the feature-set-quality difference per experimental setting between',
'simultaneous search (sum-aggregation) and sequential search distributed for different',
'feature-selection methods, numbers of alternatives, and evaluation metrics (for k=5',
'and 1-5 alternatives)?')
for metric in ['train_objective', 'test_objective', 'decision_tree_test_mcc']:
plot_results = valid_results.groupby(group_cols)[metric].mean().reset_index(
).pivot(index=[x for x in group_cols if x != 'search_name'], columns='search_name',
values=metric).reset_index()
plot_results['sim - seq'] = plot_results['sim. (sum)'] - plot_results['seq.']
print(f'Metric: {metric}')
print(plot_results.groupby(['fs_name', 'num_alternatives'])['sim - seq'].agg(
['min', 'median', 'mean', 'max']).round(2))
print('\n-- Quality difference of heuristics --')
search_methods = [(('sim. (min)', 'bal.'), 'sim'), (('seq.', 'rep.'), 'seq')]
for search_method_pair, search_method_file_infix in search_methods:
plot_results = valid_results[valid_results['fs_name'].isin(['MI', 'Model Gain'])]
plot_results = plot_results[plot_results['search_name'].isin(search_method_pair)]
plot_results = plot_results.groupby(group_cols)[plot_metrics].mean().reset_index()
plot_results = plot_results.pivot(index=[x for x in group_cols if x != 'search_name'],
columns='search_name', values=plot_metrics).reset_index()
plot_results = plot_results.rename(columns={'num_alternatives': 'a'})
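# Convert the absolute dissimilarity threshold into the relative threshold tau = tau_abs / k
# used in the plots and print-outs below: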
plot_results['tau'] = plot_results['tau_abs'] / plot_results['k']
for metric in plot_metrics:
plot_results[(metric, 'diff')] = (plot_results[(metric, search_method_pair[0])] -
plot_results[(metric, search_method_pair[1])])
plot_results = plot_results.loc[:, (slice(None), ['', 'diff'])] # keep "diff" & non-search
plot_results = plot_results.droplevel(level='search_name', axis='columns')
# Not all feature-selection methods compared; still retain their global order and color:
heu_fs_name_plot_order = [x for x in fs_name_plot_order
if x in plot_results['fs_name'].unique()]
heu_fs_col_palette = [col for col, fs_name in zip(
sns.color_palette(DEFAULT_COL_PALETTE, len(fs_name_plot_order)), fs_name_plot_order
) if fs_name in heu_fs_name_plot_order]
parameters = [('a', 'Number of alternatives $a$', 'num-alternatives'),
('tau', 'Dissimilarity threshold $\\tau$', 'tau')]
for parameter, parameter_label, parameter_file_infix in parameters:
print('\nHow is the difference in feature-set quality per experimental setting',
f'between "{search_method_pair[0]}" search and "{search_method_pair[1]}" search',
f'distributed for different feature-selection methods, values of "{parameter}",',
'and evaluation metrics (for k=5 and 1-5 alternatives)?')
for metric in plot_metrics:
print('Metric:', metric)
print(plot_results.groupby(['fs_name', parameter])[metric].describe().drop(
columns='count').round(3))
# Figures 6.4a-6.4d: Difference in feature-set quality between exact and heuristic
# search by user parameter and feature-selection method (subfigures: user parameters
# and search methods)
metric = 'train_objective'
plt.figure(figsize=(5, 5))
plt.rcParams['font.size'] = 18
sns.boxplot(x=parameter, y=metric, hue='fs_name', data=plot_results,
palette=heu_fs_col_palette, hue_order=heu_fs_name_plot_order, fliersize=1)
plt.xlabel(parameter_label)
plt.ylabel(f'$\\Delta${metric_name_mapping[metric]} ({search_method_pair[0]} ' +
f'vs. {search_method_pair[1]})')
plt.ylim(-0.13, 0.13)
plt.yticks(np.arange(start=-0.12, stop=0.14, step=0.04))
leg = plt.legend(title='Selection', edgecolor='white', loc='upper left',
bbox_to_anchor=(0, -0.1), columnspacing=1, framealpha=0,
handletextpad=0.2, ncols=2)
leg.get_title().set_position((-161, -26))
plt.tight_layout()
plt.savefig(plot_dir / ('afs-impact-search-heuristics-metric-diff-' +
f'{search_method_file_infix}-{parameter_file_infix}.pdf'))
print('\n-- Optimization status --')
# To avoid biasing the analysis regarding the number of alternatives (simultaneous-search results
# duplicate optimization statuses within search runs, and sequential-search results with higher
# "a" always contain results from lower "a" as well), we only extract one status for each
# dataset, cross-validation fold, feature-selection method, search method, "a", and "tau"
plot_results = comparison_results.loc[
comparison_results['num_alternatives'] == comparison_results['n_alternative'],
['fs_name', 'search_name', 'num_alternatives', 'optimization_status']
]
plot_results = plot_results[plot_results['fs_name'] != 'Greedy Wrapper']
print('\nHow is the optimization status distributed for different feature-selection methods',
'(excluding Greedy Wrapper) and search methods (for k=5 and 1-5 alternatives)?')
print(plot_results.groupby(['fs_name', 'search_name'])['optimization_status'].value_counts(
normalize=True).round(4).apply('{:.2%}'.format))
print('\n## Table 6.3: Optimization status by feature-selection method and search method (for',
'k=5 and 1-5 alternatives) ##\n')
print_results = (plot_results.groupby(['fs_name', 'search_name'])[
'optimization_status'].value_counts(normalize=True) * 100).rename('Frequency').reset_index()
print_results = print_results.pivot(index=['fs_name', 'search_name'], values='Frequency',
columns='optimization_status').fillna(0).reset_index()
col_order = [x for x in status_order if x in print_results.columns] # some might not occur
print_results = print_results[print_results.columns[:2].tolist() + col_order] # re-order
print(print_results.style.format('{:.2f}\\%'.format, subset=col_order).hide(
axis='index').to_latex(hrules=True))
print('\nHow is the optimization status distributed for different numbers of alternatives',
'and search methods (for k=5 and excluding Greedy Wrapper)?')
for search_name in search_name_hue_order_all:
print('\nSearch method:', search_name)
print(pd.crosstab(plot_results.loc[plot_results['search_name'] == search_name,
'optimization_status'],
plot_results.loc[plot_results['search_name'] == search_name,
'num_alternatives'],
normalize='columns').applymap('{:.2%}'.format))
print('\n## Table 6.4: Optimization status by number of alternatives (for simultaneous search',
'with sum-aggregation, k=5, and excluding Greedy Wrapper) ##\n')
print_results = plot_results[plot_results['search_name'] == 'sim. (sum)']
print_results = (print_results.groupby('num_alternatives')['optimization_status'].value_counts(
normalize=True) * 100).rename('Frequency').reset_index()
print_results = print_results.pivot(index='num_alternatives', values='Frequency',
columns='optimization_status').fillna(0).reset_index()
col_order = [x for x in status_order if x in print_results.columns] # some might not occur
print_results = print_results[[print_results.columns[0]] + col_order] # re-order
print(print_results.style.format('{:.2f}\\%'.format, subset=col_order).hide(
axis='index').to_latex(hrules=True))
print('\n-- Optimization time --')
# While sequential search has one optimization time per feature set, simultaneous search and
# the two heuristic search methods duplicate the same runtime record for multiple feature sets
# found by one search; for a fair comparison, we extract only one runtime for each of these
# searches and sum the runtimes of sequential search runs
assert ((comparison_results[comparison_results['search_name'] != 'seq.'].groupby(group_cols)[
'optimization_time'].nunique() == 1).all())
plot_results = pd.concat([
comparison_results[comparison_results['search_name'] == 'seq.'].groupby(
group_cols + ['n'])['optimization_time'].sum().reset_index()[
['n', 'fs_name', 'search_name', 'num_alternatives', 'optimization_time']
],
comparison_results[comparison_results['search_name'] != 'seq.'].groupby(
group_cols + ['n']).first().reset_index()[
['n', 'fs_name', 'search_name', 'num_alternatives', 'optimization_time']
]
])
print('\nHow is the optimization time distributed for different feature-selection methods and',
'search methods (for k=5 and 1-5 alternatives)?')
for search_name in search_name_hue_order_all:
print('\nSearch method:', search_name)
print(plot_results[plot_results['search_name'] == search_name].groupby('fs_name')[
'optimization_time'].describe().round(3))
print('\n## Table 6.5: Mean optimization time by feature-selection method and search method',
'(for k=5 and 1-5 alternatives) ##\n')
print_results = plot_results.groupby(['fs_name', 'search_name'])[
'optimization_time'].mean().reset_index()
print_results = print_results.pivot(index='fs_name', columns='search_name')
print(print_results.style.format('{:.2f}~s'.format, na_rep='---').to_latex(hrules=True))
print('\nWhat is the mean optimization time for different feature-selection methods, numbers',
'of alternatives, and search methods (for k=5 and 1-5 alternatives)?')
for search_name in search_name_hue_order_all:
print('\nSearch method:', search_name)
print(plot_results[plot_results['search_name'] == search_name].groupby(
['fs_name', 'num_alternatives'])['optimization_time'].mean().reset_index().pivot(
index='num_alternatives', columns='fs_name').round(3))
print('\n## Table 6.6: Mean optimization time by number of alternatives and feature-selection',
'method (for simultaneous search with sum-aggregation and k=5) ##\n')
print_results = plot_results[plot_results['search_name'] == 'sim. (sum)'].groupby(
['fs_name', 'num_alternatives'])['optimization_time'].mean().reset_index()
print_results = print_results.pivot(index='num_alternatives', columns='fs_name')
print(print_results.style.format('{:.2f}~s'.format).to_latex(hrules=True))
print('\nWhat is the mean optimization time for different feature-selection methods and',
'dataset dimensionalities "n" (for sequential search with k=5 and 1-5 alternatives)?')
print(plot_results[plot_results['search_name'] == 'seq.'].groupby(['fs_name', 'n'])[
'optimization_time'].mean().reset_index().pivot(index='n', columns='fs_name').round(3))
print('\nWhat is the mean optimization time for different feature-selection methods and',
'dataset dimensionalities "n" (for simultaneous search with sum-aggregation, k=5, and',
'1-5 alternatives)?')
print(plot_results[plot_results['search_name'] == 'sim. (sum)'].groupby(['fs_name', 'n'])[
'optimization_time'].mean().reset_index().pivot(index='n', columns='fs_name').round(3))
print('\n------ 6.4.3 User Parameters "a" and "tau" ------')
print('\n-- Feature-set quality / Influence of feature-selection method --')
plot_metrics = ['train_objective', 'test_objective', 'decision_tree_test_mcc']
for fillna in (False, True):
# Here, we use k=10 instead of k=5 to show more distinct values of "tau" (10 instead of 5)
norm_results = results.loc[(results['search_name'] == 'seq.') & (results['k'] == 10),
group_cols + plot_metrics + ['n', 'n_alternative']].copy()
# Shift [-1, 1] metrics to [0, 1] first, since (1) normalizing with a negative max changes
# order, e.g., [-0.5, -0.6, ..., -1] becomes [1, 1.2, ..., 2] (lower numbers get higher and
# maximum can exceed 1) and (2) filling NAs with 0 (which we do for some of the plots to
# account for infeasibility) makes most sense if 0 is the theoretical minimum of the metric
condition = norm_results['fs_name'].isin(('mRMR', 'Greedy Wrapper'))
norm_results.loc[condition, ['train_objective', 'test_objective']] = (
norm_results.loc[condition, ['train_objective', 'test_objective']] + 1) / 2
norm_results['decision_tree_test_mcc'] = (norm_results['decision_tree_test_mcc'] + 1) / 2
if fillna: # replace quality of infeasible feature sets with 0
norm_results[plot_metrics] = norm_results[plot_metrics].fillna(0)
normalization_name = 'max-fillna'
else:
normalization_name = 'max'
norm_results[plot_metrics] = norm_results.groupby(group_cols)[plot_metrics].apply(
lambda x: x / x.max()) # applies function to each column independently
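# E.g., objective values [0.8, 0.6, 0.4] within one search run normalize to [1.0, 0.75, 0.5]
# (each divided by the run's maximum of 0.8).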
print(f'\nWhat is the mean feature-set quality ({normalization_name}-normalized per',
'experimental setting) for different numbers of alternatives, feature-selection',
'methods, and evaluation metrics (for sequential search with k=10)?')
for metric in plot_metrics:
print(norm_results.groupby(['n_alternative', 'fs_name'])[metric].mean().reset_index(
).pivot(index='n_alternative', columns='fs_name').round(2))
print(f'\nWhat is the mean feature-set quality ({normalization_name}-normalized per',
'experimental setting) for different dissimilarity thresholds "tau", feature-selection',
'methods, and evaluation metrics (for sequential search with k=10)?')
for metric in plot_metrics:
print(norm_results.groupby(['tau_abs', 'fs_name'])[metric].mean().reset_index(
).pivot(index='tau_abs', columns='fs_name').round(2))
print(f'\nHow does the feature-set quality ({normalization_name}-normalized per',
'experimental setting) (Spearman-)correlate with dataset dimensionality "n" for',
'each alternative and dissimilarity threshold "tau" (for sequential search with',
'k=10 and Model Gain as feature-selection method)?')
for metric in plot_metrics:
with warnings.catch_warnings():
warnings.filterwarnings(action='ignore',
category=scipy.stats.SpearmanRConstantInputWarning)
print('Metric:', metric)
print(norm_results[norm_results['fs_name'] == 'Model Gain'].groupby(
['n_alternative', 'tau_abs']).apply(lambda x: x[metric].corr(
x['n'], method='spearman')).rename('').reset_index().pivot(
index='tau_abs', columns='n_alternative').round(2))
# Figures 6.5a-6.5f (Model Gain) and 6.7a-6.7d (other feature-selection methods):
# Feature-set quality by number of alternatives and dissimilarity threshold "tau"
# (subfigures: evaluation metrics, normalizations, and feature-selection methods)
if fillna or metric != 'train_objective':
fs_names = ['Model Gain']
else:
fs_names = fs_name_plot_order # all feature-selection methods
for fs_name in fs_names:
plot_results = norm_results[norm_results['fs_name'] == fs_name].groupby(
['n_alternative', 'tau_abs'])[metric].mean().reset_index()
plot_results['tau'] = plot_results['tau_abs'] / 10
plt.figure(figsize=(4, 3))
plt.rcParams['font.size'] = 15
sns.lineplot(x='n_alternative', y=metric, hue='tau', data=plot_results,
palette=DEFAULT_COL_PALETTE, hue_norm=(-0.2, 1), legend=False)
# Use color scale instead of standard line plot legend; start color scaling at
# -0.2, so the color for the actual lowest value (tau=0) is more readable (darker):
cbar = plt.colorbar(ax=plt.gca(), mappable=plt.cm.ScalarMappable(
cmap=DEFAULT_COL_PALETTE, norm=plt.Normalize(-0.2, 1)),
values=plot_results['tau'].unique())
cbar.ax.invert_yaxis() # put low values at top (like most lines are ordered)
cbar.ax.set_title('$\\tau$', y=0, pad=-20, loc='left')
cbar.ax.set_yticks(np.arange(start=0.2, stop=1.1, step=0.2))
plt.xlabel('Alternative number')
plt.ylabel(f'Normalized {metric_name_mapping[metric]}')
plt.xticks(range(0, 11, 1))
plt.yticks(np.arange(start=0, stop=1.1, step=0.2))
plt.ylim(-0.05, 1.05)
plt.tight_layout()
file_name = 'afs-impact-num-alternatives-tau'
file_name = file_name + f'-{metric.replace("_", "-")}-{normalization_name}'
if fs_name != 'Model Gain':
file_name = file_name + '-' + fs_name.lower().replace(' ', '-')
file_name = file_name + '.pdf'
plt.savefig(plot_dir / file_name)
print('\nHow do the evaluation metrics (Spearman-)correlate for different feature-selection',
'methods (for all experimental settings; computed per dataset and cross-validation',
'fold, then averaged over them)?')
print_metrics = ['train_objective', 'test_objective', 'decision_tree_train_mcc',
'decision_tree_test_mcc']
for fs_name in results['fs_name'].unique():
print('Feature-selection method:', fs_name)
print_results = results[results['fs_name'] == fs_name]
print_results = print_results.groupby(['dataset_name', 'split_idx'])[print_metrics].corr(
method='spearman').reset_index().rename(columns={'level_2': 'Metric'})
print_results = print_results.groupby('Metric', sort=False)[print_metrics].mean(
).round(2).reset_index().set_index('Metric')
print_results = print_results.rename(columns=(lambda x: x.replace('decision_', '')),
index=(lambda x: x.replace('decision_', '')))
print(print_results)
print('\n-- Optimization status --')
for k in results['k'].unique():
plot_results = results[(results['fs_name'] == 'Model Gain') & (results['k'] == k) &
(results['search_name'] == 'seq.')]
# Figures 6.6a, 6.6b: Optimization status (fraction optimal = not infeasible) by number of
# alternatives and dissimilarity threshold "tau" (subfigures: feature-set sizes "k")
assert plot_results['optimization_status'].isin(['Infeasible', 'Optimal']).all()
plot_results = plot_results.groupby(['tau_abs', 'n_alternative'])[
'optimization_status'].agg(lambda x: (x == 'Optimal').sum() / len(x)).reset_index()
plot_results['tau'] = plot_results['tau_abs'] / k
plt.figure(figsize=(4, 3))
plt.rcParams['font.size'] = 15
sns.lineplot(x='n_alternative', y='optimization_status', hue='tau', data=plot_results,
palette=DEFAULT_COL_PALETTE, hue_norm=(-0.2, 1), legend=False)
cbar = plt.colorbar(ax=plt.gca(), mappable=plt.cm.ScalarMappable(
cmap=DEFAULT_COL_PALETTE, norm=plt.Normalize(-0.2, 1)),
values=plot_results['tau'].unique())
cbar.ax.invert_yaxis() # put low values at top (like most lines are ordered)
cbar.ax.set_title('$\\tau$', y=0, pad=-20, loc='left')
cbar.ax.set_yticks(np.arange(start=0.2, stop=1.1, step=0.2))
plt.xlabel('Alternative number')
plt.ylabel('Valid feature sets')
plt.xticks(range(0, 11, 1))
plt.yticks(np.arange(start=0, stop=1.1, step=0.2))
plt.gca().yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(xmax=1))
plt.ylim(-0.05, 1.05)
plt.tight_layout()
plt.savefig(plot_dir / f'afs-impact-num-alternatives-tau-optimization-status-k-{k}.pdf')
# Parse some command-line arguments and run the main routine.
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Creates the dissertation\'s plots and prints statistics.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-d', '--data', type=pathlib.Path, default='data/datasets/', dest='data_dir',
help='Directory with prediction datasets in (X, y) form.')
parser.add_argument('-r', '--results', type=pathlib.Path, default='data/results/',
dest='results_dir', help='Directory with experimental results.')
parser.add_argument('-p', '--plots', type=pathlib.Path, default='data/plots/',
dest='plot_dir', help='Output directory for plots.')
print('Evaluation started.')
evaluate(**vars(parser.parse_args()))
print('\nPlots created and saved.')