forked from shubhomoydas/ad_examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathforest_description.py
executable file
·553 lines (464 loc) · 24.6 KB
/
forest_description.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
import os
import numpy as np
import logging
from ..common.expressions import get_feature_meta_default, convert_feature_ranges_to_rules, \
get_max_len_in_rules, convert_conjunctive_rules_to_feature_ranges
from ..bayesian_ruleset.bayesian_ruleset import BayesianRuleset
from .aad_globals import *
from .aad_support import *
def get_most_anomalous_subspace_indexes(model, n_top=30):
wd = np.multiply(model.w, model.d)
ordered_wd_idxs = np.argsort(-wd)[0:n_top] # sort in reverse order
# logger.debug("ordered_wd:\n%s" % str(wd[ordered_wd_idxs]))
return ordered_wd_idxs
def get_region_indexes_for_instances(x, model=None, n_top=-1):
region_idxs = np.array(model.get_region_ids(x))
# logger.debug("#region_idxs: %d" % len(region_idxs))
# logger.debug("region_idxs:\n%s" % str(list(region_idxs)))
if n_top < 0:
n_top = len(region_idxs)
wd = np.multiply(model.w[region_idxs], model.d[region_idxs])
ordered_wd_idxs = np.argsort(-wd)[0:min(n_top, len(wd))] # sort in reverse order
# logger.debug("ordered_wd_idxs:\n%s" % str(list(ordered_wd_idxs)))
return region_idxs[ordered_wd_idxs]
def get_region_volumes(model, region_indexes, feature_ranges):
volumes = np.zeros(len(region_indexes), dtype=np.float32)
d = feature_ranges.shape[0] # number of features
for i, ridx in enumerate(region_indexes):
region = model.all_regions[ridx].region
# logger.debug(str(region))
region_ranges = np.zeros(d, dtype=np.float32)
for j in range(feature_ranges.shape[0]):
rmin = feature_ranges[j][0] if np.isinf(region[j][0]) else region[j][0]
rmax = feature_ranges[j][1] if np.isinf(region[j][1]) else region[j][1]
# logger.debug("%d: %f, %f" % (j, rmin, rmax))
if rmax == rmin:
# logger.debug("%d: %f, %f" % (j, rmin, rmax))
# If the range of a variable is a single value, we just ignore it.
region_ranges[j] = 1.0
else:
region_ranges[j] = rmax - rmin
volumes[i] = np.prod(region_ranges)
# logger.debug("volumes:\n%s" % str(volumes))
return volumes
def get_instances_for_description(x=None, labels=None, metrics=None, instance_indexes=None):
""" Returns indexes of instances for which we need descriptions
The instances are selected as follows:
- If the instance indexes are directly passed, then select those
- If instance indexes are not passed, then the queried indexes which are
labeled as true anomalies will be selected.
- If there are no queried instances, then indexes of all true anomalies will
be selected.
:param x: np.ndarray
The instances in *original feature* space
:param labels: np.array(int)
True labels
:param metrics: MetricsStructure
:param instance_indexes: np.array(int)
Indexes of instances whose region memberships need to be checked
:return:
"""
if instance_indexes is not None:
return instance_indexes
elif metrics is not None and metrics.queried is not None:
queried = np.array(metrics.queried)
eval_indexes = np.where(labels[queried] == 1)[0]
instance_indexes = queried[eval_indexes]
else:
instance_indexes = np.where(labels == 1)[0]
# logger.debug("instance_indexes: %d\n%s" % (len(instance_indexes), str(list(instance_indexes))))
return instance_indexes
def get_regions_for_description(x, instance_indexes=None, model=None, region_score_only=False, n_top=-1):
""" Get the set of candidate regions for describing the instances
Ensures that atmost n_top most anomalous regions that an instance belongs to
will be present in the output list of regions
:param x: np.ndarray
The instances in *original feature* space
:param instance_indexes: np.array(int)
Indexes of instances whose region memberships need to be checked
:param model: Aad
AAD model
:param region_score_only: bool
If False, score regions by the multiplying region anomaly scores with corresponding weights
If True, score regions by only their anomaly scores
:param n_top: int
Number of top ranked regions (by score) per data instance to use
:return: np.array(int)
"""
# instance_region_idxs = np.array(model.get_region_ids(x[instance_indexes, :]))
# logger.debug(instance_region_idxs)
if region_score_only:
nwd = -model.d
else:
nwd = -np.multiply(model.w, model.d)
regions = set()
if n_top < 0:
n_top = len(nwd)
for i, inst in enumerate(instance_indexes):
inst_regs = np.array(model.get_region_ids(x[[inst], :]))
idxs = np.argsort(nwd[inst_regs])
ordered_inst_regs = inst_regs[idxs]
ordered_inst_regs = ordered_inst_regs[0:min(len(ordered_inst_regs), n_top)]
regions.update(ordered_inst_regs)
regions = list(regions)
# logger.debug("selected regions: %d\n%s" % (len(regions), str(regions)))
return np.array(regions, dtype=int)
def get_region_memberships(x, model=None,
instance_indexes=None, region_indexes=None):
""" Returns which regions the required instances belong to.
:param x: np.ndarray
The instances in *original feature* space
:param labels: np.array(int)
True labels
:param model: Aad
AAD model
:param metrics: MetricsStructure
:param instance_indexes: np.array(int)
Indexes of instances whose region memberships need to be checked
:param region_indexes: np.array(int)
Indexes of the candidate regions within which to contain the instances
:return: np.array(int), np.ndarray(int)
The first value is the list of instances which belong to any of the regions passed.
The second is a matrix of binary values, one row per instance. The columns correspond
to the regions. '1' indicates that an instance belongs to the corresponding region,
'0' otherwise.
"""
if instance_indexes is None or len(instance_indexes) == 0:
return None, None
nregions = len(region_indexes)
member_insts = list()
region_membership_indicators = list()
for i in instance_indexes:
inds = np.zeros(nregions, dtype=int)
for j, ridx in enumerate(region_indexes):
inds[j] = is_in_region(x[i, :], model.all_regions[ridx].region)
if np.sum(inds) > 0:
member_insts.append(i)
region_membership_indicators.append(np.reshape(inds, newshape=(1, nregions)))
else:
# logger.debug("No region selected for instance %d" % i)
pass
member_insts = np.array(member_insts, dtype=int)
# logger.debug("#region_indexes: %d, #instance_indexes: %d, #region_membership_indicators: %d" %
# (len(region_indexes), len(instance_indexes), len(region_membership_indicators)))
if len(region_membership_indicators) > 0:
region_membership_indicators = np.vstack(region_membership_indicators)
else:
region_membership_indicators = None
return member_insts, region_membership_indicators
def get_compact_regions(x, model=None, instance_indexes=None, region_indexes=None, volumes=None, p=1):
""" Returns the most compact set of regions among region_indexes that contain the required instances
:param x: np.ndarray
The instances in *original feature* space
:param model: Aad
AAD model
:param region_indexes: np.array(int)
Indexes of the candidate regions within which to contain the instances
:param volumes: np.array(float)
The volumes of the regions whose indexes are provided in region_indexes
:param p: int
Determines how much to penalize the size of the regions (based on their volumes).
If this is large, then bigger regions will be strongly discouraged from getting selected.
:return:
"""
import cvxopt
from cvxopt import glpk
member_insts, member_inds = get_region_memberships(x, model=model,
instance_indexes=instance_indexes,
region_indexes=region_indexes)
# logger.debug("anom indexes in selected regions (%d):\n%s" % (len(member_anoms), str(list(member_anoms))))
# logger.debug("member_inds (%s):\n%s" % (str(member_inds.shape), str(member_inds)))
nvars = member_inds.shape[1]
glpk.options['msg_lev'] = 'GLP_MSG_OFF'
c = cvxopt.matrix([float(v**p) for v in volumes], tc='d') # minimize total volume**p
# below states that each anomaly should be included in atleast one region
G = cvxopt.matrix(-member_inds, tc='d')
h = cvxopt.matrix([-1] * member_inds.shape[0], tc='d')
bin_vars = [i for i in range(nvars)]
(status, soln) = cvxopt.glpk.ilp(c, G, h, B=set(bin_vars))
# logger.debug("ILP status: %s" % status)
if soln is not None:
soln = np.reshape(np.array(soln), newshape=(nvars,))
# logger.debug("ILP solution:\n%s" % str(soln))
idxs = np.where(soln == 1)[0]
if False:
logger.debug("\nregion_indexes: %d\n%s\nmember_insts: %d\n%s" %
(len(idxs), str(list(region_indexes[idxs])),
len(member_insts), str(list(member_insts))))
return region_indexes[idxs]
else:
return None
class InstancesDescriber(object):
def __init__(self, x, y, model, opts, sample_negative=False):
"""
:param x: np.ndarray
The instance matrix with ALL instances
:param y: np.array
:param model: Aad
:param opts: AadOpts
:param sample_negative: bool
"""
self.x = x
self.y = y
self.model = model
self.opts = opts
self.sample_negative = sample_negative
self.meta = None
def sample_instances(self, exclude, n):
s = np.ones(self.x.shape[0], dtype=np.int32)
s[exclude] = 0
s = np.where(s == 1)[0]
np.random.shuffle(s)
return s[:n]
def convert_regions_to_rules(self, regions, region_indexes=None):
if self.meta is None:
raise ValueError("must set metadata before calling this function")
rules, str_rules = convert_feature_ranges_to_rules(regions, self.meta)
if region_indexes is not None:
for rule, index in zip(rules, region_indexes):
rule.id = index
return rules, str_rules
def get_top_regions(self, instance_indexes):
""" Gets the regions having highest anomaly scores
:param instance_indexes: np.array
:return: tuple, list(map)
tuple: (region indexes, #instances among instance_indexes that fall in the region)
list(map): list of region extents where each region extent is a
map {feature index: feature range}
"""
region_indexes = get_regions_for_description(self.x, instance_indexes=instance_indexes,
model=self.model,
n_top=self.opts.describe_n_top
)
regions = [self.model.all_regions[ridx].region for ridx in region_indexes]
return region_indexes, regions
def describe(self, instance_indexes):
""" Generates descriptions for positive instances among those passed as input
:param instance_indexes: indexes of instances
:param sample_negative: Sample random instances and mark them as negative
Might help in avoiding false positives.
Number of sampled instances will be len(instance_indexes)
:return: Rules
"""
pass
class CompactDescriber(InstancesDescriber):
""" Generates compact descriptions for instances
This is different from the method get_compact_regions() in that it
reduces false positives by excluding negative examples while always
including positive examples.
"""
def __init__(self, x, y, model, opts, sample_negative=False):
InstancesDescriber.__init__(self, x, y, model, opts, sample_negative)
self.prec_threshold = 0.4
self.neg_penalty = 1.0
self.meta = get_feature_meta_default(x, y)
# will be used to compute volumes
self.feature_ranges = get_sample_feature_ranges(self.x)
def get_complexity(self, regions):
""" Gets the complexity of rules derived from feature ranges that define the regions
Compute the finite values in defining the regions. These finite values
become part of the rule. Fewer such values, smaller the rule in length.
E.g.: let a region be:
{0: (-inf, 2), 1: (3, 5), 2: (-inf, inf)}
This region will become a rule of length 3 and complexity = 2^(3-1) = 4:
feature0 <= 2 & feature1 > 3 & feature1 <= 5
:param regions: list of dict
:return: np.array
"""
complexity = np.zeros(len(regions), dtype=np.float32)
# logger.debug("regions:\n%s" % str(regions))
for i, region in enumerate(regions):
c = 0
for key, range in region.items():
if np.isfinite(range[0]):
c += 1
if np.isfinite(range[1]):
c += 1
complexity[i] = c
# We take (complexity-1) because we assume a rule of length 1 has no complexity
complexity = np.power(2, np.minimum(complexity-1, 9))
return complexity
def describe(self, instance_indexes):
""" Generates descriptions for positive instances among those passed as input
:param instance_indexes: indexes of instances
:param sample_negative: Sample random instances and mark them as negative
Might help in avoiding false positives.
Number of sampled instances will be len(instance_indexes)
:return: Rules
"""
# separate positive and negative instances
positive_indexes = instance_indexes[np.where(self.y[instance_indexes] == 1)[0]]
negative_indexes = instance_indexes[np.where(self.y[instance_indexes] == 0)[0]]
if self.sample_negative:
n_neg_samples = len(instance_indexes)
neg_samples = self.sample_instances(exclude=instance_indexes, n=n_neg_samples)
negative_indexes = np.append(negative_indexes, neg_samples)
# get most anomalous regions that cover the positives
region_indexes, _ = self.get_top_regions(positive_indexes)
volumes = get_region_volumes(self.model, region_indexes, self.feature_ranges)
total_vol = np.sum(volumes)
# logger.debug("Total vol: %f" % total_vol)
if total_vol > 0:
volumes = volumes / np.sum(volumes)
complexities = self.get_complexity([self.model.all_regions[ridx].region for ridx in region_indexes])
# logger.debug("complexities: %s" % str(list(complexities)))
# solve the set-covering problem where every positive instance is
# covered by *some* region.
ordered_compact_idxs, n_pos, n_neg, vol = self.find_compact(positive_indexes=positive_indexes,
negative_indexes=negative_indexes,
region_indexes=region_indexes,
volumes=volumes, complexities=complexities)
# filter out less precise regions which probably arose due to noise
prec = n_pos / (n_pos + n_neg)
selected = np.where(prec >= self.prec_threshold)[0]
if len(selected) == 0:
selected = np.where(prec == max(prec))[0]
if len(selected) < len(prec):
# logger.debug("prec: %s" % str(list(prec[selected])))
ordered_compact_idxs, n_pos, n_neg, vol = \
ordered_compact_idxs[selected], n_pos[selected], n_neg[selected], vol[selected]
if ordered_compact_idxs is not None:
feature_ranges = [self.model.all_regions[ridx].region for ridx in ordered_compact_idxs]
rules, str_rules = self.convert_regions_to_rules(feature_ranges, region_indexes=ordered_compact_idxs)
# logger.debug("logging compact descriptions: (%d/%d)" % (len(feature_ranges), len(rules)))
else:
# logger.debug("No description found")
feature_ranges = rules = None
return ordered_compact_idxs, feature_ranges, rules
def find_compact(self, positive_indexes, negative_indexes, region_indexes, volumes, complexities):
""" Returns the most compact set of region_indexes that contain positive examples
Tries to find regions that include all positive instances while
minimizing the presence of negative instances.
:param positive_indexes: np.array
Indexes of positive instances
:param negative_indexes: np.array
Indexes of negative instances
:param region_indexes: np.array(int)
Indexes of the candidate regions within which to contain the instances
:param volumes: np.array(float)
The volumes of the regions whose indexes are provided in region_indexes
:return: np.array
Selected regions
"""
import cvxopt
from cvxopt import glpk
m_positives, m_positive_inds = get_region_memberships(self.x, model=self.model,
instance_indexes=positive_indexes,
region_indexes=region_indexes)
n_positives = np.asarray(np.sum(m_positive_inds, axis=0), dtype=np.float32)
if negative_indexes is not None and len(negative_indexes) > 0:
m_negatives, m_negative_inds = get_region_memberships(self.x, model=self.model,
instance_indexes=negative_indexes,
region_indexes=region_indexes)
if len(m_negatives) > 0:
n_negatives = np.asarray(np.sum(m_negative_inds, axis=0), dtype=np.float32)
else:
n_negatives = np.zeros(len(region_indexes), dtype=np.float32)
else:
n_negatives = np.zeros(len(region_indexes), dtype=np.float32)
# logger.debug("anom indexes in selected regions (%d):\n%s" % (len(m_positives), str(list(m_positives))))
# logger.debug("m_positive_inds (%s):\n%s" % (str(m_positive_inds.shape), str(m_positive_inds)))
# logger.debug("m_negative_inds (%s):\n%s" % (str(m_negative_inds.shape), str(m_negative_inds)))
nvars = m_positive_inds.shape[1]
glpk.options['msg_lev'] = 'GLP_MSG_OFF'
# minimize total volume and the number of negative examples, i.e.:
# (volume**p + n_negatives)
# logger.debug("volumes:\n%s" % str(list(volumes)))
# median_volume = np.median(volumes)
# logger.debug("median volume: %f" % median_volume)
vol_neg = ((volumes ** self.opts.describe_volume_p)
+ self.neg_penalty * np.multiply(n_negatives, volumes)
+ complexities)
# logger.debug("vol_neg:\n%s" % str(list(vol_neg)))
c = cvxopt.matrix([float(v) for v in vol_neg], tc='d')
# below states that each anomaly should be included in atleast one region
G = cvxopt.matrix(-m_positive_inds, tc='d')
h = cvxopt.matrix([-1] * m_positive_inds.shape[0], tc='d')
bin_vars = [i for i in range(nvars)]
(status, soln) = cvxopt.glpk.ilp(c, G, h, B=set(bin_vars))
# logger.debug("ILP status: %s" % status)
if soln is not None:
soln = np.reshape(np.array(soln), newshape=(nvars,))
# logger.debug("ILP solution:\n%s" % str(soln))
idxs = np.where(soln == 1)[0]
if False:
logger.debug("\nregion_indexes: %d\n%s\nm_positives: %d\n%s" %
(len(idxs), str(list(region_indexes[idxs])),
len(m_positives), str(list(m_positives))))
return region_indexes[idxs], n_positives[idxs], n_negatives[idxs], volumes[idxs]
else:
return None
class MinimumVolumeCoverDescriber(CompactDescriber):
def __init__(self, x, y, model, opts):
CompactDescriber.__init__(self, x, y, model, opts)
def describe(self, instance_indexes):
""" Returns descriptions that cover all input instances
Finds all subspaces which together contain all input instances
and have the minimum volume. No effort is made to exclude
false positives.
"""
instance_indexes = np.array(instance_indexes)
# get most anomalous regions that cover the positives
region_indexes = get_regions_for_description(self.x, instance_indexes=instance_indexes,
model=self.model,
n_top=self.opts.describe_n_top
)
volumes = get_region_volumes(self.model, region_indexes, self.feature_ranges)
# get the smallest set of smallest regions that together cover all instances
compact_region_idxs = get_compact_regions(self.x, model=self.model,
instance_indexes=instance_indexes,
region_indexes=region_indexes,
volumes=volumes, p=self.opts.describe_volume_p)
regions = [self.model.all_regions[ridx].region for ridx in compact_region_idxs]
rules, str_rules = self.convert_regions_to_rules(regions, region_indexes=compact_region_idxs)
return compact_region_idxs, regions, rules
class BayesianRulesetsDescriber(InstancesDescriber):
def __init__(self, x, y, model=None, opts=None, meta=None, candidate_rules=None):
InstancesDescriber.__init__(self, x, y, model, opts, sample_negative=True)
self.meta = meta
if self.meta is None:
self.meta = get_feature_meta_default(x, y)
self.candidate_rules = candidate_rules
def describe(self, instance_indexes):
instance_indexes = np.array(instance_indexes, dtype=np.int32)
if self.sample_negative:
frac_more = 1.0
else:
frac_more = 0.0
if self.sample_negative and frac_more > 0:
# add additional unlabeled instances as negative examples
n_neg_samples = int(frac_more * len(instance_indexes))
neg_samples = self.sample_instances(exclude=instance_indexes, n=n_neg_samples)
selected_indexes = np.append(instance_indexes, neg_samples)
x_br = self.x[selected_indexes]
y_br = np.zeros(len(selected_indexes), dtype=self.y.dtype) # all negative by default
y_br[0:len(instance_indexes)] = self.y[instance_indexes] # known labels
else:
x_br = self.x[instance_indexes]
y_br = self.y[instance_indexes]
if self.candidate_rules is not None:
# If candidate rules were provided, use them
rules = self.candidate_rules
else:
# If candidate rule were not provided, then get the top anomalous
# regions which cover all positive instances and create candidate
# rules from them.
# separate positive and negative instances
positive_indexes = instance_indexes[np.where(self.y[instance_indexes] == 1)[0]]
# get most anomalous regions that cover the positives
region_indexes, regions = self.get_top_regions(positive_indexes)
rules, str_rules = self.convert_regions_to_rules(regions, region_indexes=region_indexes)
br = BayesianRuleset(meta=self.meta, opts=None, max_iter=200,
maxlen=get_max_len_in_rules(rules),
n_min_support_stop=int(0.1 * len(y_br)))
br.fit(x_br, y_br, rules)
bayesian_rules = [br.rules[idx] for idx in br.predicted_rules]
if self.candidate_rules is not None:
bayesian_region_idxs = np.array([br.rules[idx].id if br.rules[idx].id is not None else -1
for idx in br.predicted_rules],
dtype=np.int32)
feature_ranges = convert_conjunctive_rules_to_feature_ranges(bayesian_rules, self.meta)
else:
bayesian_region_idxs = np.array([br.rules[idx].id for idx in br.predicted_rules], dtype=np.int32)
feature_ranges = [self.model.all_regions[ridx].region for ridx in bayesian_region_idxs]
return bayesian_region_idxs, feature_ranges, bayesian_rules