"""evaluate_results.py

Determine the mean and standard deviation of all result files - the output is a
LaTeX table (or a CSV file suitable for plotting when --out_path is given).
"""
import argparse
from pathlib import Path

import pandas as pd
from scipy.stats import wilcoxon, friedmanchisquare

from libs.constants import BASE_PATH, P_POLLUTIONS, N_TRAIN_ANOMALIES

# Configuration
this_parse = argparse.ArgumentParser(description="Merge the test results of all random seeds")
this_parse.add_argument(
    'random_seeds', nargs='+', help='Random seeds of the experiments'
)
this_parse.add_argument(
    '--p_pollution', nargs='+', help='Pollution factors to evaluate'
)
this_parse.add_argument(
    '--n_anomalies', nargs='+', help='Number of known anomalies during training'
)
this_parse.add_argument(
    "--model_path", default=BASE_PATH / "models", type=Path, help="Path to the results (usually where the models are)"
)
this_parse.add_argument(
    "--metric_name", default=None, type=str, help="Name of the metric, usually 'AUC' or 'AP'; if None, show both"
)
this_parse.add_argument(
    "--p_name", default=None, type=str, help="Significance test name, either 'wilcoxon', 'friedman' or None"
)
this_parse.add_argument(
    "--exp_name", default=None, type=str, help="Output results for the experiments besides the main results"
)
# Note: argparse's type=bool treats any non-empty string (even "False") as True,
# so the boolean switches use store_true/BooleanOptionalAction (Python 3.9+) instead.
this_parse.add_argument(
    "--show_stddev", action="store_true", help="Show the standard deviation next to the mean"
)
this_parse.add_argument(
    "--mean_only", default=0, type=int, help="If 1: show only the mean value; if 2: show everything but the mean value"
)
this_parse.add_argument(
    "--show_pollution", action="store_true", help="Show the pollution as a separate column/row"
)
this_parse.add_argument(
    "--transpose", default=True, action=argparse.BooleanOptionalAction, help="If True, the data sets are on the rows and the baselines on the columns"
)
this_parse.add_argument(
    "--is_transfer", action="store_true", help="Evaluate the transfer tests"
)
this_parse.add_argument(
    "--is_ablation", action="store_true", help="Evaluate the ablation study"
)
this_parse.add_argument(
    "--out_path", default=None, type=Path, help="Path to the output CSV; if None, use stdout instead"
)
this_args = this_parse.parse_args()
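# Example invocation (hypothetical seeds and paths - adjust to your setup):
#   python evaluate_results.py 42 43 44 --metric_name AUC --p_name wilcoxon \
#       --out_path results/main_table.csv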
AD_NAME = "GAA-3"
P_POLLUTION = [str(cur_poll) for cur_poll in P_POLLUTIONS] if not this_args.p_pollution else this_args.p_pollution
N_ANOMALIES = [cur_n for cur_n in N_TRAIN_ANOMALIES] if not this_args.n_anomalies else this_args.n_anomalies
BASELINE_METHODS = [AD_NAME, "AE", "GradCon", "DeepSAD", "DevNet", "A3"]
if this_args.is_ablation:
    # The ablation study compares the GAA variants among themselves
    BASELINE_METHODS = ["GAA-1", "GAA-2", "GAA-3", "GAA-4"]
if not this_args.exp_name:
    if this_args.is_transfer:
        # Transfer semi-supervised: only a subset of the test anomaly classes is seen during training
        NAME_TO_ID = {
            "MNIST_{SEED}_y_norm:0,1,2,3,y_anom_test:4,5,6,7,8,9,y_anom_train:4,5,6,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "MN.",
            "FMNIST_{SEED}_y_norm:0,1,2,3,y_anom_test:4,5,6,7,8,9,y_anom_train:4,5,6,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "FMN.",
            "CovType_{SEED}_y_norm:1,2,3,y_anom_test:4,5,6,7,y_anom_train:4,5,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "CT",
            "URL_{SEED}_y_norm:benign,y_anom_test:Defacement,malware,phishing,spam,y_anom_train:Defacement,malware,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "URL",
            "Darknet_{SEED}_y_norm:Non-Tor,NonVPN,y_anom_test:Tor,VPN,y_anom_train:Tor,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "DN",
            "KDD_{SEED}_y_norm:normal,y_anom_test:DoS,Probe,R2L,U2R,y_anom_train:DoS,Probe,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "KDD",
            "IDS_{SEED}_y_norm:Benign,y_anom_test:Bot,BruteForce,Infiltration,WebAttacks,y_anom_train:Bot,BruteForce,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "IDS",
        }
    else:
        # Basic semi-supervised: all test anomaly classes are also seen during training
        NAME_TO_ID = {
            "MNIST_{SEED}_y_norm:0,1,2,3,y_anom_test:4,5,6,7,8,9,y_anom_train:4,5,6,7,8,9,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "MN.",
            "FMNIST_{SEED}_y_norm:0,1,2,3,y_anom_test:4,5,6,7,8,9,y_anom_train:4,5,6,7,8,9,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "FMN.",
            "CovType_{SEED}_y_norm:1,2,3,y_anom_test:4,5,6,7,y_anom_train:4,5,6,7,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "CT",
            "KDD_{SEED}_y_norm:normal,y_anom_test:DoS,Probe,R2L,U2R,y_anom_train:DoS,Probe,R2L,U2R,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "KDD",
            "URL_{SEED}_y_norm:benign,y_anom_test:Defacement,malware,phishing,spam,y_anom_train:Defacement,malware,phishing,spam,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "URL",
            "Darknet_{SEED}_y_norm:Non-Tor,NonVPN,y_anom_test:Tor,VPN,y_anom_train:Tor,VPN,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "DN",
            "DoH_{SEED}_y_norm:Benign,y_anom_test:Malicious,y_anom_train:Malicious,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "DoH",
            "IDS_{SEED}_y_norm:Benign,y_anom_test:Bot,BruteForce,Infiltration,WebAttacks,y_anom_train:Bot,BruteForce,Infiltration,WebAttacks,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test": "IDS",
        }
        if P_POLLUTION == ["0.0"]:
            # These data sets do not contain enough samples for the pollution experiments
            NAME_TO_ID["CreditCard_{SEED}_y_norm:0,y_anom_test:1,y_anom_train:1,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "CC"
            NAME_TO_ID["Mammography_{SEED}_y_norm:0,y_anom_test:1,y_anom_train:1,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "Mam."
else:
    # Single data set experiments (used for the plots)
    NAME_TO_ID = {}
    if this_args.exp_name == "doh":
        NAME_TO_ID["DoH_{SEED}_y_norm:Benign,y_anom_test:Malicious,y_anom_train:Malicious,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "DoH"
    elif this_args.exp_name == "darknet":
        NAME_TO_ID["Darknet_{SEED}_y_norm:Non-Tor,NonVPN,y_anom_test:Tor,VPN,y_anom_train:Tor,VPN,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "DN"
    elif this_args.exp_name == "mnist":
        NAME_TO_ID["MNIST_{SEED}_y_norm:0,1,2,3,y_anom_test:4,5,6,7,8,9,y_anom_train:4,5,6,7,8,9,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "MN."
    elif this_args.exp_name == "kdd":
        NAME_TO_ID["KDD_{SEED}_y_norm:normal,y_anom_test:DoS,Probe,R2L,U2R,y_anom_train:DoS,Probe,R2L,U2R,p_pollution:{POLL},n_train_anomalies:{NANO}_split:test"] = "KDD"

# If the name dict is empty, something has gone wrong
assert NAME_TO_ID, f"Experiment {this_args.exp_name} is not known"
def get_path(basepath: Path, p_pollution: float, random_seed: int, n_anomalies: int, file_name: str, file_suffix: str = ".metric.csv") -> Path:
    """Build the path to a single experiment's metric file from its name template."""
    out_path = basepath
    # There are subfolders based on the random seed
    out_path /= f"{random_seed}"
    # The file name contains the random seed (a bad design decision, btw)
    parsed_name = file_name.replace("{SEED}", str(random_seed))
    # ... and also the pollution, with the decimal point stripped (honestly, this makes things harder)
    parsed_name = parsed_name.replace("{POLL}", str(p_pollution).replace(".", ""))
    parsed_name = parsed_name.replace("{NANO}", str(n_anomalies))
    out_path /= parsed_name
    out_path = out_path.with_suffix(file_suffix)
    return out_path
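# For illustration (hypothetical values): with p_pollution=0.05, random_seed=42 and
# n_anomalies=100, the template suffix "p_pollution:{POLL},n_train_anomalies:{NANO}"
# becomes "p_pollution:005,n_train_anomalies:100", and the file is expected under
# <model_path>/42/<parsed name>.metric.csv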
if __name__ == '__main__':
    # In the end, we want a DF with all results indexed by the contamination and the experiment IDs
    df_tot = pd.DataFrame(
        columns=pd.MultiIndex.from_product(
            [N_ANOMALIES, P_POLLUTION, list(NAME_TO_ID.values())],
            names=["Num.", "Cont.", "Exp."]
        )
    )
    # We need the single experiments for the Friedman test
    df_out_per_rep = pd.DataFrame(
        columns=pd.MultiIndex.from_product(
            [N_ANOMALIES, P_POLLUTION, list(NAME_TO_ID.values()), ["AUC", "AP"]],
            names=["Num.", "Cont.", "Exp.", "Metric"]
        ),
        index=pd.MultiIndex.from_product(
            [BASELINE_METHODS, this_args.random_seeds],
            names=["Baseline", "Seed"]
        )
    )
    df_out = df_tot.copy()
    df_out.index = pd.MultiIndex(levels=2 * [[]], codes=2 * [[]], names=["Method", "Metric"])
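    # Layout sketch (hypothetical entries): once filled, df_tot has one row per
    # (method, metric, mean/std) triple and one column per (n_anomalies, pollution,
    # data set) triple, e.g.
    #   df_tot.loc[("GAA-3", "AUC", "mean"), (100, "0.0", "MN.")] -> 0.97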
    # Go through all metric files
    results_by_method = {}
    for cur_n_anomalies in N_ANOMALIES:
        for cur_pollution in P_POLLUTION:
            for cur_name, cur_id in NAME_TO_ID.items():
                # We open all metric files given their random seed
                all_metrics = []
                for cur_seed in this_args.random_seeds:
                    cur_path = get_path(
                        basepath=this_args.model_path,
                        p_pollution=cur_pollution,
                        n_anomalies=cur_n_anomalies,
                        random_seed=cur_seed,
                        file_name=cur_name
                    )
                    # List all files that are missing
                    try:
                        in_df = pd.read_csv(cur_path, index_col=0)
                        all_metrics.append(in_df)
                        df_out_per_rep.loc[(BASELINE_METHODS, cur_seed), (cur_n_anomalies, cur_pollution, cur_id, ["AUC", "AP"])] = in_df.loc[BASELINE_METHODS, :].values
                    except FileNotFoundError:
                        print(f"Cannot find {cur_path}. Please check the path.")
                        continue
                # Skip this experiment entirely if no seed produced a result file
                if not all_metrics:
                    continue
                # Once opened, we merge them
                pd_concat = pd.concat(all_metrics)
                concat_by_method = pd_concat.groupby(pd_concat.index)
                # We need the groupby later for the Friedman test
                results_by_method[cur_id] = [cur_df for cur_df in concat_by_method]
                # We want everything in one series, which will become a column in the final DF
                this_mean = concat_by_method.mean().stack()
                this_std = concat_by_method.std().stack()
                # Add a new level to the MultiIndex to mark the mean and stddev
                this_mean = pd.DataFrame(this_mean)
                this_mean.loc[:, "type"] = "mean"
                this_mean = this_mean.set_index("type", append=True)
                this_std = pd.DataFrame(this_std)
                this_std.loc[:, "type"] = "std"
                this_std = this_std.set_index("type", append=True)
                # Add to the overall DF
                merged_metric = pd.concat([this_mean, this_std])
                df_tot[(cur_n_anomalies, cur_pollution, cur_id)] = merged_metric[0]
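    # For reference, each *.metric.csv read above is assumed to contain one row per
    # method and the columns "AUC" and "AP", e.g. (hypothetical numbers):
    #   ,AUC,AP
    #   GAA-3,0.97,0.95
    #   AE,0.91,0.88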
    all_baselines = df_tot.index.unique(0)
    # By default, show both metrics
    if this_args.metric_name is None:
        df_tot = df_tot.reindex(["AUC", "AP"], axis=0, level=1)
    else:
        df_tot = df_tot.loc[(all_baselines, [this_args.metric_name]), :]
    # Reorder the baselines to the order given in BASELINE_METHODS
    df_tot = df_tot.reindex(BASELINE_METHODS, axis=0, level=0)
    # Keep an unrounded copy for the significance tests and averages, then round for display
    df_not_rounded = df_tot.copy()
    df_tot = df_tot.round(decimals=2)
    # Decision: let's build the LaTeX code here instead of using pgfplotstable & Co
    for cur_idx, cur_df in df_tot.groupby(level=[0, 1]):
        # Merge to "mean \pm stddev"
        this_latex = cur_df.iloc[0, :].map("{:,.2f}".format)
        if this_args.show_stddev:
            this_latex += " \\scriptscriptstyle \\pm " + cur_df.iloc[1, :].map("{:,.2f}".format)
        # Add the math environment
        this_latex = "$" + this_latex + "$"
        # The highest score should be black, the rest gray
        max_per_column = df_tot.loc[(slice(None), cur_idx[1], "mean"), :].max(axis=0)
        is_max = cur_df.loc[cur_idx + ("mean",), :] == max_per_column
        this_latex.loc[is_max] = "\\color{black}" + this_latex.loc[is_max]
        # this_latex.loc[is_max] = "\\textbf{" + this_latex.loc[is_max] + "}"
        df_out.loc[cur_idx, :] = this_latex
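    # A winning cell thus renders as, e.g., "\color{black}$0.97 \scriptscriptstyle \pm 0.01$"
    # (the leading zeros are stripped to ".97" just before printing below)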
    # Add the p-values
    if this_args.p_name == "wilcoxon":
        for cur_idx, cur_df in df_not_rounded.groupby(axis=0, level=[0, 1]):
            # Don't compare GAA to itself
            if cur_idx[0] == AD_NAME:
                continue
            # The outer loop grouped by baseline & metric; now group by contamination level
            for cur_idx_2, cur_df_2 in cur_df.groupby(axis=1, level=[0, 1]):
                # Compare the distribution of GAA to the baseline
                dist_GAA = df_not_rounded.loc[(AD_NAME, cur_idx[1], "mean"), cur_idx_2 + (slice(None),)]
                dist_baseline = cur_df_2.loc[cur_idx + ("mean",), :]
                # Calculate the p-value
                _, p_val = wilcoxon(x=dist_GAA, y=dist_baseline)
                # Prepare for LaTeX
                p_val = round(p_val, ndigits=2)
                p_val = f"${p_val:,.2f}$"
                # Add to the output DF
                df_out.loc[cur_idx, cur_idx_2 + ("p-val",)] = p_val
        # Mark GAA's p-value by "-"
        df_out.loc[(AD_NAME, slice(None)), (slice(None), slice(None), "p-val")] = "-"
    elif this_args.p_name == "friedman":
        # We need the results of one method for all data sets
        # 0) Loop over pollution and metric
        for cur_id_0, cur_df_0 in df_out_per_rep.groupby(axis=1, level=[0, 2]):
            # 1) Loop over method
            for cur_id_1, cur_df_1 in cur_df_0.groupby(axis=0, level=0):
                list_of_results = []
                # 2) Loop over seed
                for cur_id_2, cur_df_2 in cur_df_1.groupby(axis=0, level=1):
                    list_of_results.append(cur_df_2.values.reshape((-1,)))
                # Calculate the Friedman statistic
                friedman_score = friedmanchisquare(*list_of_results)
                # Prepare for LaTeX
                p_val = round(friedman_score.pvalue, ndigits=2)
                p_val = f"${p_val:,.2f}$"
                # Add to the output DF
                df_out.loc[(cur_id_1, cur_id_0[1]), (cur_id_0[0], "p-val")] = p_val
    elif this_args.p_name is None:
        # Don't add anything
        pass
    else:
        raise NotImplementedError("Unknown significance test.")
    # Add the average over all data sets (one "mean" column per (Num., Cont.) pair)
    for cur_idx, cur_df in df_not_rounded.groupby(axis=1, level=[0, 1]):
        df_avg = cur_df.mean(axis=1)
        # We're only interested in the mean of the means
        df_avg = df_avg.loc[(slice(None), slice(None), "mean")]
        df_avg_rounded = df_avg.round(decimals=2)
        # Add the math environment
        this_latex = "$" + df_avg_rounded.map("{:,.2f}".format) + "$"
        # Highlight the maximum
        all_max = []
        for max_idx, max_df in df_avg_rounded.groupby(axis=0, level=1):
            all_max.append(
                max_df.loc[(slice(None), [max_idx])] == max_df.max()
            )
        is_max = pd.concat(all_max)
        this_latex.loc[is_max] = "\\color{black}" + this_latex.loc[is_max]
        df_out[cur_idx + ("mean",)] = this_latex
        df_not_rounded[cur_idx + ("mean",)] = df_avg
    if this_args.mean_only == 1:
        # For some experiments, we're just interested in the mean - nothing else
        df_out = df_out.loc[:, (slice(None), slice(None), "mean")]
        df_not_rounded = df_not_rounded.loc[:, (slice(None), slice(None), "mean")]
    elif this_args.mean_only == 2:
        # For others, we're interested in everything but the mean
        df_out = df_out.drop("mean", axis=1, level=2)
        df_not_rounded = df_not_rounded.drop("mean", axis=1, level=2)
    # Save if desired
    if this_args.out_path is not None:
        df_csv = df_not_rounded.copy()
        # Combine the MultiIndex for easier indexing afterwards
        df_csv.index = df_csv.index.to_series().apply(lambda x: f"{x[0]}-{x[1]}-{x[2]}")
        df_csv = df_csv.transpose()
        # Based on the current experiment, name the first axis differently (makes life in TikZ easier)
        if this_args.n_anomalies is None:
            df_csv.index = df_csv.index.to_series().apply(lambda x: int(x[0]))
            df_csv.index = df_csv.index.set_names("NAnomalies")
        elif this_args.p_pollution is None:
            # Express the pollution factor in percent
            df_csv.index = df_csv.index.to_series().apply(lambda x: float(x[1]) * 100)
            df_csv.index = df_csv.index.set_names("Pollution")
        else:
            df_csv.index = df_csv.index.to_series().apply(lambda x: f"{x[0]}-{x[1]}-{x[2]}")
            df_csv.index = df_csv.index.set_names("Exp.")
        # Save
        df_csv = df_csv.sort_index()
        if this_args.p_pollution is None:
            # Add the percent symbol
            df_csv.index = df_csv.index.to_series().apply(lambda x: f"{x}\\%")
        df_csv.to_csv(this_args.out_path)
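    # The resulting CSV looks roughly like this (hypothetical excerpt for a
    # pollution sweep; columns are "<method>-<metric>-mean"):
    #   Pollution,GAA-3-AUC-mean,AE-AUC-mean
    #   5.0\%,0.97,0.91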
    # Convert to TeX
    df_out = df_out.sort_index(axis=1, level=0)
    if this_args.transpose:
        df_out = df_out.transpose()
    latex = df_out.to_latex(
        multicolumn_format="c", column_format=">{\\color{gray}}c " * (df_out.index.nlevels + len(df_out.columns)), escape=False
    )
    # Get back the backslashes and math environments; strip leading zeros (e.g. 0.97 -> .97)
    latex = latex.replace("\\textbackslash ", "\\").replace("\\$", "$").replace("0.", ".")
    if this_args.out_path is None:
        print(latex)