Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEATURE: add isolation forest algorithm #1876

Merged
merged 1 commit into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/narumiruna/go-iforest v0.2.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,6 @@ github.com/muesli/kmeans v0.3.0/go.mod h1:eNyybq0tX9/iBEP6EMU4Y7dpmGK0uEhODdZpnG
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/narumiruna/go-iforest v0.2.2 h1:48GGRVLSlgtV3vGr+eedXODn5RT3WvYroqpMNEoQvkk=
github.com/narumiruna/go-iforest v0.2.2/go.mod h1:2pumoiqKf0Lr+KvLECMC8uNrbRkxtSvUwMJC/6AW7DM=
github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
Expand Down
186 changes: 186 additions & 0 deletions pkg/ensemble/iforest/forest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package iforest

import (
"math"
"math/rand"
"sync"
)

const (
// defaultNumTrees is the forest size applied when Options.NumTrees is left at zero.
defaultNumTrees = 100
// defaultSampleSize is the per-tree row count applied when Options.SampleSize is left at zero.
defaultSampleSize = 256
// defaultScoreThreshold is the anomaly-score cutoff applied when Options.Threshold is left at zero.
defaultScoreThreshold = 0.6
// defaultDetectionType is the detection mode applied when Options.DetectionType is left empty.
defaultDetectionType = DetectionTypeThreshold
// NOTE(review): offset is not referenced by any code visible in this file;
// presumably it is consumed elsewhere in the package — confirm before changing it.
offset = 0.5
)

// DetectionType names the strategy Predict uses to derive the score cutoff
// that separates outliers from normal samples.
type DetectionType string

const (
// DetectionTypeThreshold makes Predict use Options.Threshold directly as the cutoff.
DetectionTypeThreshold DetectionType = "threshold"
// DetectionTypeProportion makes Predict use Quantile(scores, 1-Options.Proportion)
// of the batch being predicted as the cutoff.
DetectionTypeProportion DetectionType = "proportion"
)

// Options holds the tunable parameters of an IsolationForest.
// Fields left at their zero value (except Proportion) are replaced by
// SetDefaultValues.
type Options struct {
// DetectionType selects how Predict derives its cutoff.
// An empty value defaults to DetectionTypeThreshold.
DetectionType DetectionType `json:"detectionType"`

// Threshold is the anomaly-score cutoff used when DetectionType is
// DetectionTypeThreshold; samples scoring at or above it are labelled outliers.
// Zero defaults to 0.6.
Threshold float64 `json:"threshold"`

// Proportion is the expected share of outliers in the dataset. It is only
// read when DetectionType is DetectionTypeProportion and has no default.
Proportion float64 `json:"proportion"`

// NumTrees is the number of trees to build in the forest. Zero defaults to 100.
NumTrees int `json:"numTrees"`

// SampleSize is the number of rows handed to each isolation tree.
// Zero defaults to 256.
SampleSize int `json:"sampleSize"`

// MaxDepth is the maximum depth of each isolation tree.
// Zero defaults to ceil(log2(SampleSize)).
MaxDepth int `json:"maxDepth"`
}

// SetDefaultValues fills in every option that was left unset.
//
// DetectionType falls back to DetectionTypeThreshold when empty, and Threshold
// falls back to defaultScoreThreshold when zero. NumTrees, SampleSize and
// MaxDepth are treated as unset when they are zero or negative, because a
// negative count or depth is never meaningful and would otherwise make Fit
// panic (negative WaitGroup counter / slice length, or the size check in
// SampleRows) or produce single-node trees.
//
// MaxDepth is derived from the (already defaulted) SampleSize as
// ceil(log2(SampleSize)). Proportion has no default and is left untouched.
func (o *Options) SetDefaultValues() {
	if o.DetectionType == "" {
		o.DetectionType = defaultDetectionType
	}

	if o.Threshold == 0 {
		o.Threshold = defaultScoreThreshold
	}

	if o.NumTrees <= 0 {
		o.NumTrees = defaultNumTrees
	}

	if o.SampleSize <= 0 {
		o.SampleSize = defaultSampleSize
	}

	// Must run after SampleSize has been defaulted: the depth limit depends on it.
	if o.MaxDepth <= 0 {
		o.MaxDepth = int(math.Ceil(math.Log2(float64(o.SampleSize))))
	}
}

// IsolationForest orchestrates anomaly detection using isolation trees.
// It embeds *Options, so configuration fields such as NumTrees or Threshold
// are accessible directly on the forest value.
type IsolationForest struct {
// Options is the configuration used by Fit, Score and Predict.
*Options

// Trees holds the root node of each isolation tree; it is (re)populated by Fit.
Trees []*TreeNode
}

// New returns an unfitted IsolationForest whose options are all set to the
// package defaults.
func New() *IsolationForest {
	forest := &IsolationForest{Options: new(Options)}
	// SetDefaultValues is promoted from the embedded *Options.
	forest.SetDefaultValues()
	return forest
}

// NewWithOptions returns an unfitted IsolationForest configured from options.
// The argument is taken by value, so the caller's struct is not modified;
// any field left unset is filled in by SetDefaultValues on the private copy.
func NewWithOptions(options Options) *IsolationForest {
	cfg := &options
	cfg.SetDefaultValues()

	forest := new(IsolationForest)
	forest.Options = cfg
	return forest
}

// Fit builds NumTrees isolation trees from samples and stores them in f.Trees,
// replacing any previously fitted trees.
//
// For every tree a subset of rows is drawn with SampleRows on the calling
// goroutine; the tree itself is then grown in its own goroutine. Fit returns
// only after all trees have been built.
func (f *IsolationForest) Fit(samples [][]float64) {
	var pending sync.WaitGroup
	pending.Add(f.NumTrees)

	f.Trees = make([]*TreeNode, f.NumTrees)
	for slot := range f.Trees {
		rows := SampleRows(samples, f.SampleSize)
		// Each goroutine writes to its own slot, so no further locking is needed.
		go func(slot int, rows [][]float64) {
			defer pending.Done()
			f.Trees[slot] = f.BuildTree(rows, 0)
		}(slot, rows)
	}

	pending.Wait()
}

// BuildTree grows an isolation tree over samples, starting at the given depth,
// and returns its root node.
//
// An empty input yields a zero-valued node. Recursion stops with a leaf that
// records the number of remaining samples once depth reaches MaxDepth or at
// most one sample is left. Otherwise a feature is chosen uniformly at random,
// a split value is drawn uniformly between that feature's minimum and maximum,
// and rows whose feature value is strictly below the split go to the left
// subtree while all others go to the right.
func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode {
	count := len(samples)
	switch {
	case count == 0:
		return &TreeNode{}
	case depth >= f.MaxDepth, count <= 1:
		return &TreeNode{Size: count}
	}

	// The feature count is taken from the first row.
	feature := rand.Intn(len(samples[0]))
	lo, hi := MinMax(Column(samples, feature))
	cut := rand.Float64()*(hi-lo) + lo

	var below, rest [][]float64
	for _, row := range samples {
		if row[feature] < cut {
			below = append(below, row)
			continue
		}
		rest = append(rest, row)
	}

	node := &TreeNode{SplitIndex: feature, SplitValue: cut}
	// Left is built before Right, matching the order random numbers are consumed.
	node.Left = f.BuildTree(below, depth+1)
	node.Right = f.BuildTree(rest, depth+1)
	return node
}

// Score computes an anomaly score for every sample and returns the scores in
// input order.
//
// For each sample the path lengths reported by pathLength are summed over all
// trees, averaged, divided by averagePathLength(SampleSize) and mapped through
// 2^(-x). Predict treats scores at or above its cutoff as outliers.
//
// Fit must have been called first: with no trees the average is 0/0 and is
// therefore undefined.
func (f *IsolationForest) Score(samples [][]float64) []float64 {
	// Both normalisers are identical for every sample, so compute them once
	// instead of once per sample.
	numTrees := float64(len(f.Trees))
	expected := averagePathLength(float64(f.SampleSize))

	scores := make([]float64, len(samples))
	for i, sample := range samples {
		total := 0.0
		for _, tree := range f.Trees {
			total += pathLength(sample, tree, 0)
		}
		// Same operation order as before (divide by tree count, then by the
		// expected path length) so results stay bit-identical.
		scores[i] = math.Pow(2.0, -total/numTrees/expected)
	}
	return scores
}

// Predict labels each sample as an outlier (1) or normal (0) and returns the
// labels in input order.
//
// Samples are scored with Score and compared against a cutoff chosen by
// DetectionType:
//   - DetectionTypeThreshold: the cutoff is Threshold.
//   - DetectionTypeProportion: the cutoff is Quantile(scores, 1-Proportion)
//     over the scores of this batch.
//
// A sample is an outlier when its score is greater than or equal to the
// cutoff. Predict panics if DetectionType is neither of the values above.
func (f *IsolationForest) Predict(samples [][]float64) []int {
	scores := f.Score(samples)

	var threshold float64
	switch f.DetectionType {
	case DetectionTypeThreshold:
		threshold = f.Threshold
	case DetectionTypeProportion:
		// Reuse the scores computed above rather than scoring every sample a
		// second time. Quantile receives a private copy so that, should it
		// reorder its input, the score-to-sample alignment below is unaffected.
		ranked := make([]float64, len(scores))
		copy(ranked, scores)
		threshold = Quantile(ranked, 1-f.Proportion)
	default:
		panic("Invalid detection type")
	}

	// Entries start at 0 (normal); only outliers need to be written.
	predictions := make([]int, len(scores))
	for i, score := range scores {
		if score >= threshold {
			predictions[i] = 1
		}
	}

	return predictions
}

// FeatureImportance sums, feature by feature, the importance values that each
// tree reports for sample and returns one total per feature of the sample.
func (f *IsolationForest) FeatureImportance(sample []float64) []int {
	totals := make([]int, len(sample))
	for t := 0; t < len(f.Trees); t++ {
		perTree := f.Trees[t].FeatureImportance(sample)
		for feature, contribution := range perTree {
			totals[feature] += contribution
		}
	}
	return totals
}
34 changes: 34 additions & 0 deletions pkg/ensemble/iforest/forest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package iforest

import (
"testing"
)

// TestIsolationForest fits a default forest on a tiny dataset containing one
// obvious outlier and checks that Predict flags exactly that row.
func TestIsolationForest(t *testing.T) {
	tests := []struct {
		features    [][]float64
		predictions []int
	}{
		{
			features: [][]float64{
				{0, 0, 0},
				{0, 0, 0},
				{0, 0, 0},
				{1, 1, 1},
			},
			predictions: []int{0, 0, 0, 1},
		},
	}

	for _, tt := range tests {
		forest := New()
		forest.Fit(tt.features)

		preds := forest.Predict(tt.features)
		// Guard the indexing below: a length mismatch should be reported as a
		// test failure, not as an index-out-of-range panic.
		if len(preds) != len(tt.predictions) {
			t.Errorf("expected %d predictions, got %d", len(tt.predictions), len(preds))
			continue
		}
		for i, pred := range preds {
			if pred != tt.predictions[i] {
				t.Errorf("sample %d: expected %v, got %v", i, tt.predictions[i], pred)
			}
		}
	}
}
47 changes: 47 additions & 0 deletions pkg/ensemble/iforest/matrix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package iforest

import (
"math"
"math/rand"
)

// SampleRows draws size distinct rows from matrix uniformly at random.
//
// When matrix has size rows or fewer it is returned as is (same slice, original
// order). The returned rows share their backing arrays with matrix. SampleRows
// panics if size is not positive.
func SampleRows(matrix [][]float64, size int) [][]float64 {
	if size <= 0 {
		panic("size must be greater than 0")
	}

	total := len(matrix)
	if total <= size {
		return matrix
	}

	// The first size entries of a random permutation are distinct row indices.
	picked := make([][]float64, 0, size)
	for _, rowIdx := range rand.Perm(total)[:size] {
		picked = append(picked, matrix[rowIdx])
	}
	return picked
}

// Column extracts the values at columnIndex from every row of matrix and
// returns them, in row order, as a newly allocated slice.
// It panics if any row is too short to contain columnIndex.
func Column(matrix [][]float64, columnIndex int) []float64 {
	values := make([]float64, 0, len(matrix))
	for r := 0; r < len(matrix); r++ {
		values = append(values, matrix[r][columnIndex])
	}
	return values
}

// MinMax scans floats once and returns its smallest and largest value, in that
// order. For an empty slice it returns (+Inf, -Inf). NaN entries never compare
// as smaller or larger and are therefore skipped.
func MinMax(floats []float64) (float64, float64) {
	lo := math.Inf(1)
	hi := math.Inf(-1)
	for idx := range floats {
		current := floats[idx]
		// Two independent checks: the first element must update both bounds.
		if current < lo {
			lo = current
		}
		if current > hi {
			hi = current
		}
	}
	return lo, hi
}
Loading
Loading