-
Notifications
You must be signed in to change notification settings - Fork 29
/
levenshtein.go
101 lines (88 loc) · 2.52 KB
/
levenshtein.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// Package levenshtein is a Go implementation to calculate Levenshtein Distance.
//
// Implementation taken from
// https://gist.github.com/andrei-m/982927#gistcomment-1931258
package levenshtein
import "unicode/utf8"
// minLengthThreshold is the length of the string beyond which
// an allocation will be made. Strings smaller than this will be
// zero alloc.
const minLengthThreshold = 32
// ComputeDistance computes the levenshtein distance between the two
// strings passed as an argument. The return value is the levenshtein distance
//
// Works on runes (Unicode code points) but does not normalize
// the input strings. See https://blog.golang.org/normalization
// and the golang.org/x/text/unicode/norm package.
func ComputeDistance(a, b string) int {
if len(a) == 0 {
return utf8.RuneCountInString(b)
}
if len(b) == 0 {
return utf8.RuneCountInString(a)
}
if a == b {
return 0
}
// We need to convert to []rune if the strings are non-ASCII.
// This could be avoided by using utf8.RuneCountInString
// and then doing some juggling with rune indices,
// but leads to far more bounds checks. It is a reasonable trade-off.
s1 := []rune(a)
s2 := []rune(b)
// swap to save some memory O(min(a,b)) instead of O(a)
if len(s1) > len(s2) {
s1, s2 = s2, s1
}
// remove trailing identical runes.
for i := 0; i < len(s1); i++ {
if s1[len(s1)-1-i] != s2[len(s2)-1-i] {
s1 = s1[:len(s1)-i]
s2 = s2[:len(s2)-i]
break
}
}
// Remove leading identical runes.
for i := 0; i < len(s1); i++ {
if s1[i] != s2[i] {
s1 = s1[i:]
s2 = s2[i:]
break
}
}
lenS1 := len(s1)
lenS2 := len(s2)
// Init the row.
var x []uint16
if lenS1+1 > minLengthThreshold {
x = make([]uint16, lenS1+1)
} else {
// We make a small optimization here for small strings.
// Because a slice of constant length is effectively an array,
// it does not allocate. So we can re-slice it to the right length
// as long as it is below a desired threshold.
x = make([]uint16, minLengthThreshold)
x = x[:lenS1+1]
}
// we start from 1 because index 0 is already 0.
for i := 1; i < len(x); i++ {
x[i] = uint16(i)
}
// make a dummy bounds check to prevent the 2 bounds check down below.
// The one inside the loop is particularly costly.
_ = x[lenS1]
// fill in the rest
for i := 1; i <= lenS2; i++ {
prev := uint16(i)
for j := 1; j <= lenS1; j++ {
current := x[j-1] // match
if s2[i-1] != s1[j-1] {
current = min(x[j-1]+1, prev+1, x[j]+1)
}
x[j-1] = prev
prev = current
}
x[lenS1] = prev
}
return int(x[lenS1])
}