Skip to content

Commit

Permalink
golang: switch back to standard Go Regexp
Browse files Browse the repository at this point in the history
See #348 (comment)

Also, it turned out to be faster if regexps are checked individually,
not as one large |-concatenation of regexps. One regexp check consumes
66 microseconds on Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz.
  • Loading branch information
starius committed Apr 4, 2024
1 parent f08c3de commit a32a149
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 105 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ Each `pattern` is a regular expression. It should work out-of-the-box with your favorite
* Python: `if re.search(entry['pattern'], ua): ...`
* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`),
functions `IsCrawler` and `MatchingCrawlers`. To achieve the best performance possible in functions
`IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev`
and pass tag: `-tags re2_cgo`.
functions `IsCrawler` and `MatchingCrawlers`.

Example of Go program:

Expand Down
7 changes: 0 additions & 7 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
module github.com/monperrus/crawler-user-agents

go 1.19

require github.com/wasilibs/go-re2 v1.5.1

require (
github.com/magefile/mage v1.14.0 // indirect
github.com/tetratelabs/wazero v1.7.0 // indirect
)
7 changes: 0 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +0,0 @@
github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo=
github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=
github.com/tetratelabs/wazero v1.7.0 h1:jg5qPydno59wqjpGrHph81lbtHzTrWzwwtD4cD88+hQ=
github.com/tetratelabs/wazero v1.7.0/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y=
github.com/wasilibs/go-re2 v1.5.1 h1:a+Gb1mx6Q7MmU4d+3BCnnN28U2/cnADmY1oRRanQi10=
github.com/wasilibs/go-re2 v1.5.1/go.mod h1:UqqxQ1O99boQUm1r61H/IYGiGQOS/P88K7hU5nLNkEg=
github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ=
103 changes: 15 additions & 88 deletions validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ import (
_ "embed"
"encoding/json"
"fmt"
"strings"
"regexp"
"time"

regexp "github.com/wasilibs/go-re2"
)

//go:embed crawler-user-agents.json
Expand Down Expand Up @@ -82,102 +80,31 @@ var Crawlers = func() []Crawler {
return crawlers
}()

func joinRes(begin, end int) string {
regexps := make([]string, 0, len(Crawlers))
for _, crawler := range Crawlers[begin:end] {
regexps = append(regexps, "("+crawler.Pattern+")")
var regexps = func() []*regexp.Regexp {
regexps := make([]*regexp.Regexp, len(Crawlers))
for i, crawler := range Crawlers {
regexps[i] = regexp.MustCompile(crawler.Pattern)
}
return strings.Join(regexps, "|")
}

var allRegexps = joinRes(0, len(Crawlers))

var allRegexpsRe = regexp.MustCompile(allRegexps)
return regexps
}()

// Returns if User Agent string matches any of crawler patterns.
func IsCrawler(userAgent string) bool {
return allRegexpsRe.MatchString(userAgent)
}

// With RE2 it is fast to check the text against a large regexp.
// To find matching regexps faster, built a binary tree of regexps.

type regexpNode struct {
re *regexp.Regexp
left *regexpNode
right *regexpNode
index int
}

var regexpsTree = func() *regexpNode {
nodes := make([]*regexpNode, len(Crawlers))
starts := make([]int, len(Crawlers)+1)
for i, crawler := range Crawlers {
nodes[i] = &regexpNode{
re: regexp.MustCompile(crawler.Pattern),
index: i,
for _, re := range regexps {
if re.MatchString(userAgent) {
return true
}
starts[i] = i
}
starts[len(Crawlers)] = len(Crawlers) // To get end of interval.

for len(nodes) > 1 {
// Join into pairs.
nodes2 := make([]*regexpNode, (len(nodes)+1)/2)
starts2 := make([]int, 0, len(nodes2)+1)
for i := 0; i < len(nodes)/2; i++ {
leftIndex := 2 * i
rightIndex := 2*i + 1
nodes2[i] = &regexpNode{
left: nodes[leftIndex],
right: nodes[rightIndex],
}
if len(nodes2) != 1 {
// Skip regexp for root node, it is not used.
joinedRe := joinRes(starts[leftIndex], starts[rightIndex+1])
nodes2[i].re = regexp.MustCompile(joinedRe)
}
starts2 = append(starts2, starts[leftIndex])
}
if len(nodes)%2 == 1 {
nodes2[len(nodes2)-1] = nodes[len(nodes)-1]
starts2 = append(starts2, starts[len(starts)-2])
}
starts2 = append(starts2, starts[len(starts)-1])

nodes = nodes2
starts = starts2
}

root := nodes[0]

if root.left == nil {
panic("the algoriths does not work with just one regexp")
}

return root
}()
return false
}

// Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
indices := []int{}

var visit func(node *regexpNode)
visit = func(node *regexpNode) {
if node.left != nil {
if node.left.re.MatchString(userAgent) {
visit(node.left)
}
if node.right.re.MatchString(userAgent) {
visit(node.right)
}
} else {
// Leaf.
indices = append(indices, node.index)
for i, re := range regexps {
if re.MatchString(userAgent) {
indices = append(indices, i)
}
}

visit(regexpsTree)

return indices
}

0 comments on commit a32a149

Please sign in to comment.