main.py
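"""Command-line entry point for SomAnomaly anomaly detection.

Trains a self-organizing map (SOM) on a normal dataset, then flags anomalous
windows in a streaming dataset and writes the predicted labels to the output
file. Comma-separated paths switch to multi-file block detection.

Example invocation (file names are illustrative only):

    python main.py normal.csv online.csv output.csv -w 60 -j 60 --standardize
"""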
import argparse
import time
import numpy as np
import pandas as pd
import re
from somanomaly.detector import SomDetect
from sklearn.metrics import classification_report
def load_data(path):
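    """Read a dataset, dispatching on the file extension.

    Supports .csv, .parquet, .feather, .xlsx, and .json; .csv and .xlsx are
    read without a header row.
    """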
if re.search(r'\.csv$', path, re.IGNORECASE):
return pd.read_csv(path, header = None)
elif re.search(r'\.parquet$', path, re.IGNORECASE):
return pd.read_parquet(path)
elif re.search(r'\.feather$', path, re.IGNORECASE):
return pd.read_feather(path)
elif re.search(r'\.xlsx$', path, re.IGNORECASE):
return pd.read_excel(path, header = None)
elif re.search(r'\.json$', path, re.IGNORECASE):
return pd.read_json(path)
    # elif re.search(r'\.pkl$', path, re.IGNORECASE):
    #     with open(path, 'rb') as file:
    #         return pickle.load(file)
else:
raise ValueError("Unsupported file format")
def save_data(df, path):
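    """Write a data frame, dispatching on the file extension.

    Mirrors load_data: .csv and .xlsx are written without index or header,
    and .json uses the 'records' orientation.
    """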
if re.search(r'\.csv$', path, re.IGNORECASE):
df.to_csv(path, index = False, header = False)
elif re.search(r'\.parquet$', path, re.IGNORECASE):
df.to_parquet(path, index = False)
elif re.search(r'\.feather$', path, re.IGNORECASE):
df.reset_index(drop = True).to_feather(path)
elif re.search(r'\.xlsx$', path, re.IGNORECASE):
df.to_excel(path, index = False, header = False)
elif re.search(r'\.json$', path, re.IGNORECASE):
df.to_json(path, orient = 'records')
else:
raise ValueError("Unsupported file format")
def main():
parser = argparse.ArgumentParser()
# positional arguments
parser.add_argument(
"normal",
type = str,
help = "Normal dataset file"
)
parser.add_argument(
"online",
type = str,
help = "Streaming dataset file"
)
parser.add_argument(
"output",
type = str,
help = "Output"
)
parser.add_argument(
"-c", "--column",
type = str,
help = "Column index to read - start,end"
)
    parser.add_argument(
        "-e", "--eval",
        type = str,
        help = "True label dataset file"
    )
parser.add_argument(
"--log",
help = "Log transform",
action = "store_true"
)
parser.add_argument(
"--logstat",
help = "Log2 stat",
action = "store_true"
)
# SOM training
parser.add_argument(
"--standardize",
help = "Standardize both data sets",
action = "store_true"
)
parser.add_argument(
"-w", "--window",
type = int,
default = 30,
help = "Window size (Default = 30)"
)
parser.add_argument(
"-j", "--jump",
type = int,
default = 30,
help = "Shift size (Default = 30)"
)
parser.add_argument(
"-x", "--xgrid",
type = int,
default = None,
help = "Number of x-grid (Default = sqrt(N * sqrt(N)))"
)
parser.add_argument(
"-y", "--ygrid",
type = int,
default = None,
help = "Number of y-grid (Default = sqrt(N * sqrt(N)))"
)
parser.add_argument(
"-p", "--prototype",
type = str,
default = "hexagonal",
help = "Topology of SOM output space - hexagonal (default) or rectangular"
)
parser.add_argument(
"-n", "--neighborhood",
type = str,
default = "gaussian",
help = "Neighborhood function - gaussian (default), triangular, or bubble"
)
parser.add_argument(
"-m", "--metric",
type = str,
default = "frobenius",
help = "Distance function - frobenius (default), nuclear, mahalanobis, or eros"
)
parser.add_argument(
"-d", "--decay",
type = str,
default = "linear",
help = "Decaying function - linear (default) or exponential"
)
parser.add_argument(
"-s", "--seed",
type = int,
help = "Random seed (Default = system time)"
)
parser.add_argument(
"-i", "--iter",
type = int,
default = 50,
help = "Epoch number (Default = 50)"
)
parser.add_argument(
"-a", "--alpha",
type = float,
default = .1,
help = "Initial learning rate (Default = 0.1)"
)
parser.add_argument(
"-r", "--radius",
type = float,
help = "Initial radius of BMU neighborhood (Default = 2/3 quantile of every distance between nodes)"
)
# parser.add_argument(
# "--subset",
# type = int,
# help = "Subset codebook matrix set among epochs (Default = epoch number)"
# )
# Anomaly detection
parser.add_argument(
"-l", "--label",
type = str,
default = "1,0",
help = "Anomaly and normal labels, e.g. 1,0 (default)"
)
parser.add_argument(
"-u", "--threshold",
type = str,
default = "cltlind",
help = "Threshold method - cltlind (default), clt, anova, ztest, mean, quantile, radius, inv_som, kmeans, hclust"
)
# clt and cltlind
    parser.add_argument(
        "-b", "--bootstrap",
        type = int,
        default = 1,
        help = "Number of bootstrap samples (Default = 1, bootstrap not performed)"
    )
parser.add_argument(
"-o", "--overfit",
help = "Use only mapped codebook",
action = "store_true"
)
    parser.add_argument(
        "--find",
        type = float,
        help = "When using mapped codebook, also use its neighboring nodes within this radius"
    )
parser.add_argument(
"-q", "--multiple",
type = str,
default = "gai",
help = "Multiple testing method - gai (default), invest, lord, or bh"
)
# Plot
parser.add_argument(
"-1", "--error",
help = "Plot reconstruction error for each epoch",
action = "store_true"
)
parser.add_argument(
"-2", "--heat",
help = "Plot heatmap of SOM",
action = "store_true"
)
parser.add_argument(
"-3", "--pred",
help = "Plot heatmap of projection onto normal SOM",
action = "store_true"
)
# assign arguments
args = parser.parse_args()
normal_file = args.normal
online_file = args.online
normal_list = None
online_list = None
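    # comma-separated paths switch to block detection over multiple files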
if str(args.normal).strip().find(",") != -1:
normal_list = str(args.normal).strip().split(",")
if str(args.online).strip().find(",") != -1:
online_list = str(args.online).strip().split(",")
output_list = None
dstat_file = None
if str(args.output).strip().find(",") != -1:
output_list = str(args.output).strip().split(",")
output_file = output_list[0]
dstat_file = output_list[1]
else:
output_file = args.output
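    # -c/--column: "start,end" selects one column range; longer lists are
    # paired up into (start, end) ranges for block detection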
cols = None
col_list = None
if args.column is not None:
if str(args.column).count(",") == 1:
cols = str(args.column).strip().split(",")
cols = range(int(cols[0]), int(cols[1]))
elif str(args.column).count(",") > 1:
col_tmp = str(args.column).strip().split(",")
col_list = np.array(col_tmp).astype(int).reshape((-1, 2))
    print_eval = args.eval is not None
    if print_eval:
        true_file = args.eval
        target_names = ["anomaly", "normal"]
test_log = args.log
stat_log = args.logstat
standard = args.standardize
window_size = args.window
jump_size = args.jump
xdim = args.xgrid
ydim = args.ygrid
topo = args.prototype
neighbor = args.neighborhood
dist = args.metric
decay = args.decay
seed = args.seed
epoch = args.iter
# subset_net = epoch
init_rate = args.alpha
init_radius = args.radius
# if args.subset is not None:
# subset_net = args.subset
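    # -l/--label: integer pair "anomaly,normal", e.g. the default "1,0"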
label = str(args.label).strip().split(",")
label = [int(label[0]), int(label[1])]
threshold_list = None
ztest_opt = .9
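    # -u/--threshold: "method" or "method,level"; level defaults to 0.9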
if str(args.threshold).strip().find(",") != -1:
threshold_list = str(args.threshold).strip().split(",")
threshold = threshold_list[0]
ztest_opt = float(threshold_list[1])
else:
threshold = args.threshold
boot = args.bootstrap
proj = args.overfit
neighbor_node = args.find
multiple_list = None
eta = None
rho = None
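    # -q/--multiple: "method", "method,eta", "method+rho", or "method,eta+rho"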
if str(args.multiple).strip().find(",") != -1:
multiple_list = str(args.multiple).strip().split(",")
multiple_test = multiple_list[0]
if str(multiple_list[1]).strip().find("+") != -1:
multiple_opt = str(multiple_list[1]).strip().split("+")
eta = float(multiple_opt[0])
rho = float(multiple_opt[1])
else:
eta = float(multiple_list[1])
elif str(args.multiple).strip().find("+") != -1:
multiple_list = str(args.multiple).strip().split("+")
multiple_test = multiple_list[0]
rho = float(multiple_list[1])
else:
multiple_test = args.multiple
print_error = args.error
print_heat = args.heat
print_projection = args.pred
# somanomaly
start_time = time.time()
if normal_list is None:
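        # single-file mode: fit the SOM on the normal set, then flag anomalies
        # in the online stream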
som_anomaly = SomDetect(normal_file, online_file, cols, standard,
window_size, jump_size, test_log,
xdim, ydim, topo, neighbor, dist, decay, seed)
som_anomaly.learn_normal(epoch = epoch, init_rate = init_rate, init_radius = init_radius)
som_anomaly.detect_anomaly(label = label, threshold = threshold,
level = ztest_opt, clt_test = multiple_test, mfdr = eta, power = rho, log_stat = stat_log,
bootstrap = boot, clt_map = proj, neighbor = neighbor_node)
som_anomaly.label_anomaly()
anomaly_df = pd.DataFrame({".pred": som_anomaly.anomaly})
# anomaly_df.to_csv(output_file, index = False, header = False)
save_data(anomaly_df, output_file)
if dstat_file is not None:
dstat_df = pd.DataFrame({".som": som_anomaly.dstat})
# dstat_df.to_csv(dstat_file, index = False, header = False)
save_data(dstat_df, dstat_file)
window_df = pd.DataFrame({".pred": som_anomaly.window_anomaly})
# window_df.to_csv(dstat_file.replace(".csv", "_pred.csv"), index = False, header = False)
save_data(window_df, re.sub(r'(\.\w+)$', r'_pred\1', dstat_file))
else:
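        # multi-file mode: delegate training and detection to SomDetect.detect_block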
anomaly_pred = SomDetect.detect_block(
normal_list, online_list, col_list,
standard, window_size, jump_size, test_log,
xdim, ydim, topo, neighbor, dist, decay, seed,
epoch, init_rate, init_radius, label, ztest_opt, multiple_test, eta, rho, stat_log
)
anomaly_df = pd.DataFrame({".pred": anomaly_pred})
# anomaly_df.to_csv(output_file, index = False, header = False)
save_data(anomaly_df, output_file)
print("")
print("process for %.2f seconds================================================\n" %(time.time() - start_time))
# files
print("Files-------------------------------------")
if normal_list is None:
print("Normal data: ", normal_file)
print("Streaming data: ", online_file)
print("Anomaly detection: ", output_file)
if dstat_file is not None:
print("SomAnomly statistic: ", dstat_file)
print("Window prediction: ", dstat_file.replace(".csv", "_pred.csv"))
# print parameter
print("SOM parameters----------------------------")
if som_anomaly.standard:
print("Standardized!")
print("Initial learning rate: ", som_anomaly.som_grid.initial_learn)
print("Initial radius: ", som_anomaly.som_grid.initial_r)
print("[Window, jump]: ", [som_anomaly.win_size, som_anomaly.jump])
print("SOM grid: ", som_anomaly.som_grid.net_dim)
print("Topology: ", som_anomaly.som_grid.topo)
print("Neighborhood function: ", som_anomaly.som_grid.neighbor_func)
print("Decay function: ", som_anomaly.som_grid.decay_func)
print("Distance function: ", som_anomaly.som_grid.dist_func)
print("Epoch number: ", som_anomaly.som_grid.epoch)
else:
print("Normal data: ", normal_list)
print("Streaming data: ", online_list)
print("Anomaly detection: ", output_file)
# print parameter
print("SOM parameters----------------------------")
print("Initial learning rate: ", init_rate)
print("Initial radius: ", init_radius)
print("[Window, jump]: ", [window_size, jump_size])
print("SOM grid: ", np.array([xdim, ydim]))
print("Topology: ", topo)
print("Neighborhood function: ", neighbor)
print("Decay function: ", decay)
print("Distance function: ", dist)
print("Epoch number: ", epoch)
# if epoch > subset_net:
# print("Subset weight matrix of: ", subset_net)
print("------------------------------------------")
if threshold_list is not None:
print("Anomaly detection by %s of %.3f" %(threshold, ztest_opt))
else:
print("Anomaly detection by ", threshold)
    if threshold in ("clt", "cltlind"):
if multiple_list is not None:
if eta is not None:
if rho is not None:
print("with multiple testing %s of %.3f and %.3f" % (multiple_test, eta, rho))
else:
print("with multiple testing %s of %.3f" %(multiple_test, eta))
else:
print("with multiple testing %s" % multiple_test)
print("==========================================")
# evaluation
if normal_list is None:
if print_eval:
            true_anomaly = load_data(true_file).to_numpy().ravel()  # flatten to 1-D label vector
print(
classification_report(
true_anomaly, som_anomaly.anomaly,
labels = label, target_names = target_names
)
)
# plot
if print_error or print_heat or print_projection:
plot_start = time.time()
if print_error:
som_anomaly.som_grid.plot_error()
if print_heat:
som_anomaly.som_grid.plot_heatmap(som_anomaly.som_tr.window_data)
if print_projection:
som_anomaly.plot_heatmap()
print("Plotting time: %.2f seconds" % (time.time() - plot_start))
else:
if print_eval:
            true_anomaly = load_data(true_file).to_numpy().ravel()  # flatten to 1-D label vector
print(
classification_report(
true_anomaly, anomaly_pred,
labels=label, target_names=target_names
)
)
if __name__ == '__main__':
np.set_printoptions(precision = 3)
main()