diff --git a/README.md b/README.md index 2d991ce..49e7834 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,7 @@ Each `pattern` is a regular expression. It should work out-of-the-box wih your f * Python: `if re.search(entry['pattern'], ua): ...` * Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents), it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`), - functions `IsCrawler` and `MatchingCrawlers`. To achieve the best performance possible in functions - `IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev` - and pass tag: `-tags re2_cgo`. + functions `IsCrawler` and `MatchingCrawlers`. Example of Go program: diff --git a/go.mod b/go.mod index cfc2c1c..54db180 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,3 @@ module github.com/monperrus/crawler-user-agents go 1.19 - -require github.com/wasilibs/go-re2 v1.5.1 - -require ( - github.com/magefile/mage v1.14.0 // indirect - github.com/tetratelabs/wazero v1.7.0 // indirect -) diff --git a/go.sum b/go.sum index cbc2578..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +0,0 @@ -github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= -github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= -github.com/tetratelabs/wazero v1.7.0 h1:jg5qPydno59wqjpGrHph81lbtHzTrWzwwtD4cD88+hQ= -github.com/tetratelabs/wazero v1.7.0/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= -github.com/wasilibs/go-re2 v1.5.1 h1:a+Gb1mx6Q7MmU4d+3BCnnN28U2/cnADmY1oRRanQi10= -github.com/wasilibs/go-re2 v1.5.1/go.mod h1:UqqxQ1O99boQUm1r61H/IYGiGQOS/P88K7hU5nLNkEg= -github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ= diff --git a/validate.go b/validate.go index 3df52ec..41ab9d1 100644 --- a/validate.go +++ b/validate.go @@ -4,10 +4,8 @@ import ( _ "embed" "encoding/json" "fmt" - "strings" + "regexp" "time" - - regexp "github.com/wasilibs/go-re2" ) //go:embed crawler-user-agents.json @@ -82,102 +80,31 @@ var Crawlers = func() []Crawler { return crawlers }() -func joinRes(begin, end int) string { - regexps := make([]string, 0, len(Crawlers)) - for _, crawler := range Crawlers[begin:end] { - regexps = append(regexps, "("+crawler.Pattern+")") +var regexps = func() []*regexp.Regexp { + regexps := make([]*regexp.Regexp, len(Crawlers)) + for i, crawler := range Crawlers { + regexps[i] = regexp.MustCompile(crawler.Pattern) } - return strings.Join(regexps, "|") -} - -var allRegexps = joinRes(0, len(Crawlers)) - -var allRegexpsRe = regexp.MustCompile(allRegexps) + return regexps +}() // Returns if User Agent string matches any of crawler patterns. func IsCrawler(userAgent string) bool { - return allRegexpsRe.MatchString(userAgent) -} - -// With RE2 it is fast to check the text against a large regexp. -// To find matching regexps faster, built a binary tree of regexps. - -type regexpNode struct { - re *regexp.Regexp - left *regexpNode - right *regexpNode - index int -} - -var regexpsTree = func() *regexpNode { - nodes := make([]*regexpNode, len(Crawlers)) - starts := make([]int, len(Crawlers)+1) - for i, crawler := range Crawlers { - nodes[i] = ®expNode{ - re: regexp.MustCompile(crawler.Pattern), - index: i, + for _, re := range regexps { + if re.MatchString(userAgent) { + return true } - starts[i] = i } - starts[len(Crawlers)] = len(Crawlers) // To get end of interval. - - for len(nodes) > 1 { - // Join into pairs. - nodes2 := make([]*regexpNode, (len(nodes)+1)/2) - starts2 := make([]int, 0, len(nodes2)+1) - for i := 0; i < len(nodes)/2; i++ { - leftIndex := 2 * i - rightIndex := 2*i + 1 - nodes2[i] = ®expNode{ - left: nodes[leftIndex], - right: nodes[rightIndex], - } - if len(nodes2) != 1 { - // Skip regexp for root node, it is not used. - joinedRe := joinRes(starts[leftIndex], starts[rightIndex+1]) - nodes2[i].re = regexp.MustCompile(joinedRe) - } - starts2 = append(starts2, starts[leftIndex]) - } - if len(nodes)%2 == 1 { - nodes2[len(nodes2)-1] = nodes[len(nodes)-1] - starts2 = append(starts2, starts[len(starts)-2]) - } - starts2 = append(starts2, starts[len(starts)-1]) - - nodes = nodes2 - starts = starts2 - } - - root := nodes[0] - - if root.left == nil { - panic("the algoriths does not work with just one regexp") - } - - return root -}() + return false +} // Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. func MatchingCrawlers(userAgent string) []int { indices := []int{} - - var visit func(node *regexpNode) - visit = func(node *regexpNode) { - if node.left != nil { - if node.left.re.MatchString(userAgent) { - visit(node.left) - } - if node.right.re.MatchString(userAgent) { - visit(node.right) - } - } else { - // Leaf. - indices = append(indices, node.index) + for i, re := range regexps { + if re.MatchString(userAgent) { + indices = append(indices, i) } } - - visit(regexpsTree) - return indices }