diff --git a/go.mod b/go.mod index c286fb079..05c860737 100644 --- a/go.mod +++ b/go.mod @@ -125,7 +125,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/narumiruna/go-iforest v0.2.2 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.1 // indirect diff --git a/go.sum b/go.sum index 51afd5fff..c7356ef8b 100644 --- a/go.sum +++ b/go.sum @@ -446,8 +446,6 @@ github.com/muesli/kmeans v0.3.0/go.mod h1:eNyybq0tX9/iBEP6EMU4Y7dpmGK0uEhODdZpnG github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= -github.com/narumiruna/go-iforest v0.2.2 h1:48GGRVLSlgtV3vGr+eedXODn5RT3WvYroqpMNEoQvkk= -github.com/narumiruna/go-iforest v0.2.2/go.mod h1:2pumoiqKf0Lr+KvLECMC8uNrbRkxtSvUwMJC/6AW7DM= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= diff --git a/pkg/ensemble/iforest/forest.go b/pkg/ensemble/iforest/forest.go new file mode 100644 index 000000000..d0add3174 --- /dev/null +++ b/pkg/ensemble/iforest/forest.go @@ -0,0 +1,186 @@ +package iforest + +import ( + "math" + "math/rand" + "sync" +) + +const ( + defaultNumTrees = 100 + defaultSampleSize = 256 + defaultScoreThreshold = 0.6 + defaultDetectionType = DetectionTypeThreshold + offset = 0.5 +) + +type DetectionType string + +const ( + DetectionTypeThreshold DetectionType = "threshold" + DetectionTypeProportion DetectionType = "proportion" +) + +type Options struct { + // The method used for anomaly detection + DetectionType DetectionType `json:"detectionType"` + + // The anomaly score threshold + Threshold float64 `json:"threshold"` + + // The proportion of outliers in the dataset + Proportion float64 `json:"proportion"` + + // The number of trees to build in the forest + NumTrees int `json:"numTrees"` + + // The sample size for each isolation tree + SampleSize int `json:"sampleSize"` + + // The maximum depth of each isolation tree + MaxDepth int `json:"maxDepth"` +} + +// SetDefaultValues applies default settings to unspecified fields +func (o *Options) SetDefaultValues() { + if o.DetectionType == "" { + o.DetectionType = defaultDetectionType + } + + if o.Threshold == 0 { + o.Threshold = defaultScoreThreshold + } + + if o.NumTrees == 0 { + o.NumTrees = defaultNumTrees + } + + if o.SampleSize == 0 { + o.SampleSize = defaultSampleSize + } + + if o.MaxDepth == 0 { + o.MaxDepth = int(math.Ceil(math.Log2(float64(o.SampleSize)))) + } +} + +// IsolationForest orchestrates anomaly detection using isolation trees +type IsolationForest struct { + *Options + + Trees []*TreeNode +} + +// New creates an IsolationForest with default options. +func New() *IsolationForest { + options := &Options{} + options.SetDefaultValues() + return &IsolationForest{Options: options} +} + +// NewWithOptions creates an IsolationForest with the specified options. +func NewWithOptions(options Options) *IsolationForest { + options.SetDefaultValues() + return &IsolationForest{Options: &options} +} + +// Fit constructs isolation trees from a given dataset +func (f *IsolationForest) Fit(samples [][]float64) { + wg := sync.WaitGroup{} + wg.Add(f.NumTrees) + + f.Trees = make([]*TreeNode, f.NumTrees) + for i := 0; i < f.NumTrees; i++ { + sampled := SampleRows(samples, f.SampleSize) + go func(index int) { + defer wg.Done() + tree := f.BuildTree(sampled, 0) + f.Trees[index] = tree + }(i) + } + wg.Wait() +} + +// BuildTree recursively partitions samples to isolate outliers +func (f *IsolationForest) BuildTree(samples [][]float64, depth int) *TreeNode { + numSamples := len(samples) + if numSamples == 0 { + return &TreeNode{} + } + numFeatures := len(samples[0]) + if depth >= f.MaxDepth || numSamples <= 1 { + return &TreeNode{Size: numSamples} + } + + splitIndex := rand.Intn(numFeatures) + column := Column(samples, splitIndex) + minValue, maxValue := MinMax(column) + splitValue := rand.Float64()*(maxValue-minValue) + minValue + + leftSamples := make([][]float64, 0) + rightSamples := make([][]float64, 0) + for _, sample := range samples { + if sample[splitIndex] < splitValue { + leftSamples = append(leftSamples, sample) + } else { + rightSamples = append(rightSamples, sample) + } + } + + return &TreeNode{ + Left: f.BuildTree(leftSamples, depth+1), + Right: f.BuildTree(rightSamples, depth+1), + SplitIndex: splitIndex, + SplitValue: splitValue, + } +} + +// Score computes anomaly scores for each sample +func (f *IsolationForest) Score(samples [][]float64) []float64 { + scores := make([]float64, len(samples)) + for i, sample := range samples { + score := 0.0 + for _, tree := range f.Trees { + score += pathLength(sample, tree, 0) + } + scores[i] = math.Pow(2.0, -score/float64(len(f.Trees))/averagePathLength(float64(f.SampleSize))) + } + return scores +} + +// Predict labels samples as outliers (1) or normal (0) based on the detection type +func (f *IsolationForest) Predict(samples [][]float64) []int { + predictions := make([]int, len(samples)) + scores := f.Score(samples) + + var threshold float64 + switch f.DetectionType { + case DetectionTypeThreshold: + threshold = f.Threshold + case DetectionTypeProportion: + threshold = Quantile(f.Score(samples), 1-f.Proportion) + default: + panic("Invalid detection type") + } + + for i, score := range scores { + if score >= threshold { + predictions[i] = 1 + } else { + predictions[i] = 0 + } + } + + return predictions +} + +// FeatureImportance computes an importance score for each feature +func (f *IsolationForest) FeatureImportance(sample []float64) []int { + importance := make([]int, len(sample)) + for _, tree := range f.Trees { + for i, value := range tree.FeatureImportance(sample) { + importance[i] += value + } + } + return importance +} diff --git a/pkg/ensemble/iforest/forest_test.go b/pkg/ensemble/iforest/forest_test.go new file mode 100644 index 000000000..2213cb9c9 --- /dev/null +++ b/pkg/ensemble/iforest/forest_test.go @@ -0,0 +1,34 @@ +package iforest + +import ( + "testing" +) + +func TestIsolationForest(t *testing.T) { + tests := []struct { + features [][]float64 + predictions []int + }{ + { + [][]float64{ + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {1, 1, 1}, + }, + []int{0, 0, 0, 1}, + }, + } + + for _, tt := range tests { + forest := New() + forest.Fit(tt.features) + + preds := forest.Predict(tt.features) + for i, pred := range preds { + if pred != tt.predictions[i] { + t.Errorf("expected %v, got %v", tt.predictions[i], pred) + } + } + } +} diff --git a/pkg/ensemble/iforest/matrix.go b/pkg/ensemble/iforest/matrix.go new file mode 100644 index 000000000..89a77b3ef --- /dev/null +++ b/pkg/ensemble/iforest/matrix.go @@ -0,0 +1,47 @@ +package iforest + +import ( + "math" + "math/rand" +) + +// SampleRows randomly selects 'size' rows from the matrix. +func SampleRows(matrix [][]float64, size int) [][]float64 { + if size <= 0 { + panic("size must be greater than 0") + } + + if len(matrix) <= size { + return matrix + } + + perm := rand.Perm(len(matrix)) + sampled := make([][]float64, size) + for i := 0; i < size; i++ { + sampled[i] = matrix[perm[i]] + } + return sampled +} + +// Column returns a slice containing the specified column from the matrix. +func Column(matrix [][]float64, columnIndex int) []float64 { + column := make([]float64, len(matrix)) + for i, row := range matrix { + column[i] = row[columnIndex] + } + return column +} + +// MinMax returns the minimum and maximum values from a slice of float64. +func MinMax(floats []float64) (float64, float64) { + min, max := math.Inf(1), math.Inf(-1) + for _, v := range floats { + if v < min { + min = v + } + if v > max { + max = v + } + } + return min, max +} diff --git a/pkg/ensemble/iforest/matrix_test.go b/pkg/ensemble/iforest/matrix_test.go new file mode 100644 index 000000000..19928de69 --- /dev/null +++ b/pkg/ensemble/iforest/matrix_test.go @@ -0,0 +1,137 @@ +package iforest + +import ( + "reflect" + "testing" +) + +func TestSampleRows(t *testing.T) { + tests := []struct { + name string + matrix [][]float64 + size int + }{ + { + name: "Sample size less than matrix length", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + {5.0, 6.0}, + }, + size: 2, + }, + { + name: "Sample size equal to matrix length", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + {5.0, 6.0}, + }, + size: 3, + }, + { + name: "Sample size greater than matrix length", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + }, + size: 3, + }, + { + name: "Sample size zero", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + }, + size: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + defer func() { + if r := recover(); r != nil { + if tt.size > 0 { + t.Errorf("SampleRows() panicked with size %d", tt.size) + } + } + }() + + got := SampleRows(tt.matrix, tt.size) + if tt.size > 0 && len(tt.matrix) > tt.size { + if len(got) != tt.size { + t.Errorf("SampleRows() = %v, want length %d", got, tt.size) + } + } else { + if !reflect.DeepEqual(got, tt.matrix) { + t.Errorf("SampleRows() = %v, want %v", got, tt.matrix) + } + } + }) + } +} + +func TestColumn(t *testing.T) { + tests := []struct { + name string + matrix [][]float64 + columnIndex int + want []float64 + }{ + { + name: "Valid column index", + matrix: [][]float64{ + {1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + }, + columnIndex: 1, + want: []float64{2.0, 5.0, 8.0}, + }, + { + name: "First column", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + }, + columnIndex: 0, + want: []float64{1.0, 3.0}, + }, + { + name: "Last column", + matrix: [][]float64{ + {1.0, 2.0}, + {3.0, 4.0}, + }, + columnIndex: 1, + want: []float64{2.0, 4.0}, + }, + { + name: "Single row matrix", + matrix: [][]float64{ + {1.0, 2.0, 3.0}, + }, + columnIndex: 2, + want: []float64{3.0}, + }, + { + name: "Single column matrix", + matrix: [][]float64{ + {1.0}, + {2.0}, + {3.0}, + }, + columnIndex: 0, + want: []float64{1.0, 2.0, 3.0}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := Column(tt.matrix, tt.columnIndex) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("Column() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/ensemble/iforest/path.go b/pkg/ensemble/iforest/path.go new file mode 100644 index 000000000..0fc761b0e --- /dev/null +++ b/pkg/ensemble/iforest/path.go @@ -0,0 +1,36 @@ +package iforest + +import "math" + +const eulerGamma = 0.5772156649 + +// harmonicNumber returns the harmonic number for x using a known constant. +func harmonicNumber(x float64) float64 { + return math.Log(x) + eulerGamma +} + +// averagePathLength calculates expected path length for a given number of samples. +func averagePathLength(x float64) float64 { + if x > 2 { + return 2.0*harmonicNumber(x-1) - 2.0*(x-1)/x + } else if x == 2 { + return 1.0 + } else { + return 0.0 + } +} + +// pathLength traverses the given tree node and returns the path length for the vector. +func pathLength(vector []float64, node *TreeNode, currentPathLength int) float64 { + if node.IsLeaf() { + return float64(currentPathLength) + averagePathLength(float64(node.Size)) + } + + splitAttribute := node.SplitIndex + splitValue := node.SplitValue + if vector[splitAttribute] < splitValue { + return pathLength(vector, node.Left, currentPathLength+1) + } else { + return pathLength(vector, node.Right, currentPathLength+1) + } +} diff --git a/pkg/ensemble/iforest/quantile.go b/pkg/ensemble/iforest/quantile.go new file mode 100644 index 000000000..37811cdf2 --- /dev/null +++ b/pkg/ensemble/iforest/quantile.go @@ -0,0 +1,33 @@ +package iforest + +import ( + "fmt" + "math" + "sort" +) + +// Quantile computes the q-th quantile (0 <= q <= 1) of a slice of float64 values. +func Quantile(numbers []float64, q float64) float64 { + if len(numbers) == 0 { + panic("numbers must not be empty") + } + if q < 0 || q > 1 { + panic(fmt.Sprintf("q must be in [0, 1], got %v", q)) + } + + sortedNumbers := make([]float64, len(numbers)) + copy(sortedNumbers, numbers) + sort.Float64s(sortedNumbers) + + n := float64(len(sortedNumbers)) + pos := q * (n - 1) + lowerIndex := int(math.Floor(pos)) + upperIndex := int(math.Ceil(pos)) + if lowerIndex == upperIndex { + return sortedNumbers[lowerIndex] + } + + // linear interpolation + fraction := pos - float64(lowerIndex) + return sortedNumbers[lowerIndex] + fraction*(sortedNumbers[upperIndex]-sortedNumbers[lowerIndex]) +} diff --git a/pkg/ensemble/iforest/quantile_test.go b/pkg/ensemble/iforest/quantile_test.go new file mode 100644 index 000000000..06c73da8d --- /dev/null +++ b/pkg/ensemble/iforest/quantile_test.go @@ -0,0 +1,48 @@ +package iforest + +import ( + "testing" +) + +func TestQuantile(t *testing.T) { + tests := []struct { + numbers []float64 + q float64 + expected float64 + }{ + {[]float64{1, 2}, 0.5, 1.5}, + {[]float64{1, 2, 3, 4, 5}, 0.5, 3}, + {[]float64{1, 2, 3, 4, 5}, 1.0, 5}, + {[]float64{1, 2, 3, 4, 5}, 0.0, 1}, + {[]float64{1, 3, 3, 6, 7, 8, 9}, 0.25, 3}, + {[]float64{1, 3, 3, 6, 7, 8, 9}, 0.75, 7.5}, + {[]float64{1, 2, 3, 4, 5, 6, 8, 9}, 0.5, 4.5}, + } + + for _, tt := range tests { + actual := Quantile(tt.numbers, tt.q) + if actual != tt.expected { + t.Errorf("Quantile(%v, %v) == %v, expected %v", tt.numbers, tt.q, actual, tt.expected) + } + } +} + +func TestQuantilePanics(t *testing.T) { + tests := []struct { + numbers []float64 + q float64 + }{ + {[]float64{}, 0.5}, + {[]float64{1, 2, 3}, -0.1}, + {[]float64{1, 2, 3}, 1.1}, + } + + for _, tt := range tests { + defer func() { + if r := recover(); r == nil { + t.Errorf("Quantile(%v, %v) did not panic", tt.numbers, tt.q) + } + }() + Quantile(tt.numbers, tt.q) + } +} diff --git a/pkg/ensemble/iforest/tree.go b/pkg/ensemble/iforest/tree.go new file mode 100644 index 000000000..4230f9dc0 --- /dev/null +++ b/pkg/ensemble/iforest/tree.go @@ -0,0 +1,39 @@ +package iforest + +type TreeNode struct { + Left *TreeNode + Right *TreeNode + Size int + SplitIndex int + SplitValue float64 +} + +// IsLeaf checks if the node is a leaf (no children). +func (t *TreeNode) IsLeaf() bool { + return t.Left == nil && t.Right == nil +} + +// traceSplitIndices traces the indices of the splits along the path of this tree node. +func (t *TreeNode) traceSplitIndices(sample []float64, indices []int) []int { + if t.IsLeaf() { + return indices + } + + if sample[t.SplitIndex] < t.SplitValue { + indices = append(indices, t.SplitIndex) + return t.Left.traceSplitIndices(sample, indices) + } else { + indices = append(indices, t.SplitIndex) + return t.Right.traceSplitIndices(sample, indices) + } +} + +// FeatureImportance calculates how many times each feature index is used during splits for the given sample. +func (t *TreeNode) FeatureImportance(sample []float64) []int { + indices := t.traceSplitIndices(sample, []int{}) + importance := make([]int, len(sample)) + for _, index := range indices { + importance[index]++ + } + return importance +} diff --git a/pkg/strategy/sentinel/strategy.go b/pkg/strategy/sentinel/strategy.go index e7233a19e..a6698e87f 100644 --- a/pkg/strategy/sentinel/strategy.go +++ b/pkg/strategy/sentinel/strategy.go @@ -7,9 +7,9 @@ import ( "github.com/c9s/bbgo/pkg/bbgo" "github.com/c9s/bbgo/pkg/datatype/floats" + "github.com/c9s/bbgo/pkg/ensemble/iforest" "github.com/c9s/bbgo/pkg/exchange/batch" "github.com/c9s/bbgo/pkg/types" - "github.com/narumiruna/go-iforest/pkg/iforest" log "github.com/sirupsen/logrus" "golang.org/x/time/rate" )