Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Aho-Corasick #66

Merged
merged 1 commit into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions bench-out/bench-out.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Name,Mean,MeanLB,MeanUB,Stddev,StddevLB,StddevUB
AhoCorasick/build from few long/100,8.513422680432697e-6,8.488458633778344e-6,8.546705080177561e-6,9.969128415851875e-8,6.874945026166872e-8,1.2551484185948497e-7
AhoCorasick/build from few long/10000,1.7882787405552921e-3,1.7827405684590716e-3,1.800476608986399e-3,2.6489400757498618e-5,1.527119101355923e-5,4.8771893418822536e-5
AhoCorasick/build from few long/1000000,0.23589482944379495,0.23200599944383513,0.24093268611088406,5.659296396731304e-3,2.7851112029909936e-3,8.843550322711289e-3
AhoCorasick/build from few long/500000,0.2870396793284454,0.2765157213341444,0.29758385999011805,1.3635218294303319e-2,1.0899630511983244e-2,1.5095991361441054e-2
AhoCorasick/match few long/100,2.2989241446890707e-6,2.2842848889416615e-6,2.3288720778934305e-6,6.554756497943676e-8,4.269301973156141e-8,1.0536142485745498e-7
AhoCorasick/match few long/10000,3.507593566002027e-4,3.503681577904525e-4,3.5142937242386426e-4,1.6338270733078287e-6,1.092881336848686e-6,2.3304923525208636e-6
AhoCorasick/match few long/1000000,3.656548186761784e-2,3.652561409766825e-2,3.6622430453914626e-2,9.672715275543123e-5,7.193501134881048e-5,1.2520862590684369e-4
AhoCorasick/match few long/500000,1.6762486934380014e-2,1.67489177556701e-2,1.6781974996993375e-2,3.836560959812748e-5,2.6144824792401538e-5,5.4936047377959083e-5
AhoCorasick/build from many short/100,7.620756091721171e-6,7.600920298876813e-6,7.653999891156354e-6,8.583816941679067e-8,6.086177964688925e-8,1.1094562635143705e-7
AhoCorasick/build from many short/10000,2.3220817337715476e-3,2.3162026496528912e-3,2.331857585301445e-3,2.41756256291307e-5,1.6287139995232412e-5,4.143502420584936e-5
AhoCorasick/build from many short/1000000,0.5978469729166894,0.5650704708333858,0.630623474999993,4.111337847925013e-2,2.18953305723231e-2,5.379875863123568e-2
AhoCorasick/build from many short/500000,0.28803835700508595,0.27732268700230633,0.3016158820060082,1.5092218078034075e-2,9.070650038611277e-3,1.903580993386635e-2
AhoCorasick/match many short/100,1.4163937802165651e-6,1.4148630918882928e-6,1.4183771713672137e-6,5.718962793971818e-9,4.258549520434209e-9,7.600752211313617e-9
AhoCorasick/match many short/10000,5.07770151244229e-4,5.070275720290223e-4,5.087876294835994e-4,3.054487854355818e-6,1.992049199816893e-6,5.195600275955076e-6
AhoCorasick/match many short/1000000,0.1395871239583336,0.1380405541666697,0.14196939375000284,2.908507943265563e-3,4.963682171872912e-4,3.706065436758448e-3
AhoCorasick/match many short/500000,5.7279700557087754e-2,5.6926181796010715e-2,5.7917360827559605e-2,8.894385491499692e-4,5.265095924127337e-4,1.3826368162908529e-3
Array/Arr/listArray @UArr @X/100,9.069885103472647e-7,9.035551382906603e-7,9.112235943143424e-7,1.2415186423055452e-8,1.1004677274345127e-8,1.4144533541032416e-8
Array/Arr/listArray @UArr @X/10000,8.69223926122827e-5,8.683319070226885e-5,8.703410027438146e-5,3.2818784665200795e-7,2.7162401018303904e-7,3.940282006642692e-7
Array/Arr/listArray @UArr @X/1000000,1.192827563639611e-2,1.1582270679330178e-2,1.244559316429011e-2,1.1138309892690973e-3,6.627342910107843e-4,1.7758030775786929e-3
Expand Down
10 changes: 5 additions & 5 deletions bench-out/bench-out.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ For details about any benchmark see its source file.

AhoCorasick
┌───────────────────────┬──────────┬──────────┬──────────┐
│ Name │ 100 │ 10000 │ 1000000
│ Name │ 100 │ 10000 │ 500000
╞═══════════════════════╪══════════╪══════════╪══════════╡
│ build from few long │ 8.513 μs │ 1.788 ms │ 235.9 ms │
│ build from few long │ 8.513 μs │ 1.788 ms │ 287.0 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ match few long │ 2.299 μs │ 350.8 μs │ 36.57 ms │
│ match few long │ 2.299 μs │ 350.8 μs │ 16.76 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ build from many short │ 7.621 μs │ 2.322 ms │ 597.8 ms │
│ build from many short │ 7.621 μs │ 2.322 ms │ 288.0 ms │
├───────────────────────┼──────────┼──────────┼──────────┤
│ match many short │ 1.416 μs │ 507.8 μs │ 139.6 ms │
│ match many short │ 1.416 μs │ 507.8 μs │ 57.28 ms │
└───────────────────────┴──────────┴──────────┴──────────┘

Array
Expand Down
4 changes: 2 additions & 2 deletions bench/AhoCorasickBench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ benchmark = bgroup "AhoCorasick"

-- Build the Aho-Corasick automaton from n/20 a-z strings of length 20.
, bgroup "build from many short" $ map (benchBuild genMany) sizes

-- Match an Aho-Corasick automaton built from n/20 a-z strings of length 20 on a string of
-- length n.
, bgroup "match many short" $ map (benchMatch genMany) sizes
]

sizes :: [Int]
sizes = [100, 10000, 1000000]
sizes = [100, 10000, 500000]

benchBuild :: (Int -> RandStd [(C.ByteString, Int)]) -> Int -> Benchmark
benchBuild genps n = sizedBench n gen $ nf (fromTrieAC . fromListTAC) where
Expand Down
78 changes: 42 additions & 36 deletions src/AhoCorasick.hs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
{-# LANGUAGE BangPatterns #-}

{-|
Aho-Corasick algorithm

The Aho-Corasick algorithm builds an automaton from a set of pattern strings, and then uses it to
find positions in a search string where each of the pattern strings occur.

This implementation only works on ByteStrings, to keep things fast. If required it can be adapted
to work on Strings, or even more generally (Ord a, Foldable f) => f a.
to work on other sequence types.

A TrieAC a can be constructed from pattern strings with associated values a, which can then be
turned into an ACRoot a. An ACRoot a can then be run on a search string to find matches.
Expand All @@ -18,18 +20,22 @@ Sources:
* Stanford CS166 Aho-Corasick lecture slides
https://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/04/Slides04.pdf

Let k be the alphabet size. Let the complexity of IntMap operations be f(n), where n is the size of
the map. f(n) is O(min(n, word size)), see IntMap documentation for details.
Implementation notes:
* We have to be lazy in the (Maybe (ACNode a)) and the [a] in fromTrieAC because we build the tree
depth-first and strictly (due to IntMap.Strict). If we could build it breadth-first, then we
could be strict in these, but I don't see an easy way to do that.

For complexities below, k is the alphabet range (max 256).

emptyTAC
An empty trie.

insertTAC
Inserts a string with an associated value into a trie. O(n * f(k)) where n is the length of the
Inserts a string with an associated value into a trie. O(n log k) where n is the length of the
string.

fromListTAC
Builds a trie from a list of strings and associated values. O(n * f(k)) where n is total length of
Builds a trie from a list of strings and associated values. O(n log k) where n is total length of
the strings.

fromTrieAC
Expand All @@ -41,7 +47,7 @@ Returns a list of length (m + 1) where m is the length of the search string. Thi
list of pattern matches for every position in the string, including before the first character. A
match at a position is present as the associated value of the pattern string found to be ending at
that position.
O(m * f(k) + z), where m is the length of the string and z is the total number of matches.
O(m log k + z), where m is the length of the string and z is the total number of matches.
-}

module AhoCorasick
Expand All @@ -54,61 +60,61 @@ module AhoCorasick
, matchAC
) where

import Control.Applicative
import Control.DeepSeq
import Data.List
import Data.Maybe
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString as B
import qualified Data.IntMap.Strict as IM

data ACRoot a = ACRoot (IM.IntMap (ACNode a)) [a]
data ACNode a = ACNode (IM.IntMap (ACNode a)) [a] (ACLink a)
data ACLink a = RootL | NodeL !(ACNode a)
data ACRoot a = ACRoot !(IM.IntMap (ACNode a)) [a]
data ACNode a = ACNode !(IM.IntMap (ACNode a)) (Maybe (ACNode a)) [a]

fromTrieAC :: TrieAC a -> ACRoot a
fromTrieAC (TrieAC tm tvs) = ACRoot rmp tvs where
fromTrieAC (TrieAC tm routs) = ACRoot rmp routs where
rmp = IM.map go1 tm
go1 (TrieAC m vs) = ACNode (IM.mapWithKey (go RootL) m) (vs ++ tvs) RootL
go psuf c (TrieAC m vs) = ACNode (IM.mapWithKey (go suf) m) outs suf where
go1 (TrieAC m vs) = ACNode (IM.mapWithKey (go Nothing) m) Nothing (vs ++ routs)
go psuf !c (TrieAC m vs) = ACNode (IM.mapWithKey (go suf) m) suf outs where
suf = getSuf psuf
getSuf RootL = maybe RootL NodeL (IM.lookup c rmp)
getSuf (NodeL (ACNode mp' _ suf')) = maybe (getSuf suf') NodeL (IM.lookup c mp')
outs = vs ++ case suf of
RootL -> tvs
NodeL (ACNode _ outs' _) -> outs'

matchAC :: ACRoot a -> C.ByteString -> [[a]]
matchAC (ACRoot rmp routs) = (routs:) . go1 where
go1 = go rmp $ const ((routs:) . go1)
go2 (ACNode mp _ suf) = go mp $ const . case suf of
RootL -> go1
NodeL x -> go2 x
go mp miss s = case C.uncons s of
getSuf Nothing = IM.lookup c rmp
getSuf (Just (ACNode mp' suf' _)) = IM.lookup c mp' <|> getSuf suf'
outs = vs ++ maybe routs (\(ACNode _ _ outs') -> outs') suf

matchAC :: ACRoot a -> B.ByteString -> [[a]]
matchAC (ACRoot rmp routs) !s0 = routs : gor s0 where
gor s = case B.uncons s of
Nothing -> []
Just (c,s') -> case IM.lookup (fromEnum c) rmp of
Nothing -> routs : gor s'
Just (ACNode mp suf outs) -> outs : go mp suf s'
go mp suf s = case B.uncons s of
Nothing -> []
Just (c, s') -> case IM.lookup (fromEnum c) mp of
Nothing -> miss s s'
Just x@(ACNode _ outs _) -> outs : go2 x s'
Nothing -> maybe gor (\(ACNode mp' suf' _) -> go mp' suf') suf s
Just (ACNode mp' suf' outs) -> outs : go mp' suf' s'

data TrieAC a = TrieAC (IM.IntMap (TrieAC a)) [a] deriving Show
data TrieAC a = TrieAC !(IM.IntMap (TrieAC a)) ![a] deriving Show

-- The empty trie: an empty child map and an empty list of associated values.
emptyTAC :: TrieAC a
emptyTAC = TrieAC IM.empty []

insertTAC :: C.ByteString -> a -> TrieAC a -> TrieAC a
insertTAC :: B.ByteString -> a -> TrieAC a -> TrieAC a
insertTAC s v = go s where
go cs (TrieAC m vs) = case C.uncons cs of
go cs (TrieAC m vs) = case B.uncons cs of
Nothing -> TrieAC m (v:vs)
Just (c, cs') -> TrieAC m' vs where
m' = IM.alter (Just . go cs' . fromMaybe emptyTAC) (fromEnum c) m
m' = IM.alter ((Just $!) . go cs' . fromMaybe emptyTAC) (fromIntegral c) m

fromListTAC :: [(C.ByteString, a)] -> TrieAC a
fromListTAC :: [(B.ByteString, a)] -> TrieAC a
fromListTAC = foldl' (\t (s, v) -> insertTAC s v t) emptyTAC

--------------------------------------------------------------------------------
-- For tests

-- outs of nodes share structure, so rnf is O(n^2)
instance NFData a => NFData (ACNode a) where
  -- ACNode fields are, in order: child map, suffix link, outputs. The pattern
  -- must bind them in that order; `seq` accepts any type, so swapping the two
  -- names still type-checks but forces the outputs list instead of the link.
  rnf (ACNode mp suf _outs) = suf `seq` rnf mp
  -- outs of nodes share structure, so it is not forced
  -- the suf link is forced only to WHNF, otherwise it would be reevaluating various parts of the tree

instance NFData a => NFData (ACRoot a) where
rnf (ACRoot mp outs) = rnf outs `seq` rnf mp
rnf (ACRoot mp _outs) = rnf mp
Loading