From cf4e25b339aed88f39f2d7cf8eca39a5d871b01d Mon Sep 17 00:00:00 2001 From: meooow25 Date: Sun, 24 Nov 2024 09:15:43 +0530 Subject: [PATCH] Compare with pcre-heavy and pcre2 --- bench/Compare.hs | 229 ++++++++++++++++++++++++++++++++++++++++------ bench/README.md | 164 +++++++++++++++++++++------------ bench/bench.cabal | 2 + 3 files changed, 304 insertions(+), 91 deletions(-) diff --git a/bench/Compare.hs b/bench/Compare.hs index ba1b1a4..afea987 100644 --- a/bench/Compare.hs +++ b/bench/Compare.hs @@ -3,13 +3,14 @@ {-# LANGUAGE OverloadedStrings #-} module Compare (benches) where -import Control.Applicative -import Control.DeepSeq -import Control.Monad -import Data.Char -import Data.Array +import Control.Applicative (Alternative(..), optional) +import Control.DeepSeq (NFData(..)) +import Control.Monad (replicateM_) +import Data.Char (digitToInt, chr) +import Data.Array ((!)) import qualified Data.Foldable as F -import Data.Maybe +import qualified Data.List.NonEmpty as NE +import Data.Maybe (fromJust) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC @@ -17,9 +18,10 @@ import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as TEnc import GHC.Generics (Generic) +import System.Mem (performMinorGC) import Test.Tasty (testGroup) -import Test.Tasty.Bench +import Test.Tasty.Bench hiding (nf) import Test.Tasty.HUnit (testCase, (@?=)) -- parser-regex @@ -45,6 +47,12 @@ import qualified Text.RE.TDFA.Text as TDFAReplace -- regex-with-pcre import qualified Text.RE.PCRE.ByteString as PCREReplace +-- pcre-heavy +import qualified Text.Regex.PCRE.Heavy as Heavy + +-- pcre2 +import qualified Text.Regex.Pcre2 as Pcre2 + benches :: Benchmark benches = bgroup "compare" [ env englishText $ \ ~(t,b,s) -> @@ -54,12 +62,16 @@ benches = bgroup "compare" , bench "regex-applicative S" $ nf english1RA s , bench "regex-tdfa T" $ nf english1TDFA t , bench "regex-pcre-builtin BS" $ nf english1PCRE b + , bench "pcre-heavy T" $ nf english1PCREHeavy t + , bench "pcre2 T" $ nf english1Pcre2 t , testGroup "tests" [ testCase "check count" $ length (english1PR t) @?= 900 , testCase "S == L" $ map T.pack (english1PRS s) @?= english1PR t , testCase "regex-applicative ==" $ map T.pack (english1RA s) @?= english1PR t , testCase "regex-tdfa ==" $ english1TDFA t @?= english1PR t , testCase "regex-pcre-builtin ==" $ map TEnc.decodeUtf8 (english1PCRE b) @?= english1PR t + , testCase "pcre-heavy ==" $ english1PCREHeavy t @?= english1PR t + , testCase "pcre2 ==" $ english1Pcre2 t @?= english1PR t ] ] , env englishText $ \ ~(t,b,s) -> @@ -69,13 +81,20 @@ benches = bgroup "compare" , bench "regex-applicative S" $ nf english2RA s , bench "regex-tdfa T" $ nf english2TDFA t , bench "regex-pcre-builtin BS" $ nf english2PCRE b + , bench "pcre-heavy T" $ nf english2PCREHeavy t + , bench "pcre2 T" $ nf english2Pcre2 t , testGroup "tests" [ testCase "check count" $ length (english2PR t) @?= 365 , testCase "S == T" $ map T.pack (english2PRS s) @?= english2PR t , testCase "regex-applicative ==" $ map T.pack (english2RA s) @?= english2PR t , testCase "regex-tdfa ==" $ english2TDFA t @?= english2PR t + + -- Cannot compare results for regex-pcre-builtin and pcre-heavy. Their + -- counts don't match because they match bytes and not Chars. , testCase "regex-pcre-builtin ==" $ length (english2PCRE b) @?= 354 - -- pcre count doesn't match because it matches bytes and not Chars + , testCase "pcre-heavy ==" $ length (english2PCREHeavy t) @?= 354 + + , testCase "pcre2 ==" $ english2Pcre2 t @?= english2PR t ] ] , env englishText $ \ ~(t,b,s) -> @@ -85,11 +104,15 @@ benches = bgroup "compare" , bench "regex-applicative S" $ nf englishReplaceRA s , bench "regex-tdfa T" $ nf englishReplaceTDFA t , bench "regex-pcre-builtin BS" $ nf englishReplacePCRE b + , bench "pcre-heavy T" $ nf englishReplacePCREHeavy t + , bench "pcre2 T" $ nf englishReplacePcre2 t , testGroup "tests" [ testCase "S == T" $ T.pack (englishReplacePRS s) @?= englishReplacePR t , testCase "regex-applicative ==" $ T.pack (englishReplaceRA s) @?= englishReplacePR t , testCase "regex-tdfa ==" $ englishReplaceTDFA t @?= englishReplacePR t , testCase "regex-pcre-builtin ==" $ englishReplacePCRE b @?= TEnc.encodeUtf8 (englishReplacePR t) + , testCase "pcre-heavy ==" $ englishReplacePCREHeavy t @?= englishReplacePR t + , testCase "pcre2 ==" $ englishReplacePcre2 t @?= englishReplacePR t ] ] , env caseFoldingTxt $ \ ~(t,b,s) -> @@ -99,12 +122,16 @@ benches = bgroup "compare" , bench "regex-applicative S" $ nf caseFoldingRA s , bench "regex-tdfa T" $ nf caseFoldingTDFA t , bench "regex-pcre-builtin BS" $ nf caseFoldingPCRE b + , bench "pcre-heavy T" $ nf caseFoldingPCREHeavy t + , bench "pcre2 T" $ nf caseFoldingPcre2 t , testGroup "tests" [ testCase "check count" $ length (caseFoldingPR t) @?= 1563 , testCase "S == T" $ caseFoldingPRS s @?= caseFoldingPR t , testCase "regex-applicative ==" $ caseFoldingRA s @?= caseFoldingPR t , testCase "regex-tdfa ==" $ caseFoldingTDFA t @?= caseFoldingPR t , testCase "regex-pcre-builtin ==" $ caseFoldingPCRE b @?= caseFoldingPR t + , testCase "pcre-heavy ==" $ caseFoldingPCREHeavy t @?= caseFoldingPR t + , testCase "pcre2 ==" $ caseFoldingPcre2 t @?= caseFoldingPcre2 t ] ] , env htmlText $ \ ~(t,b,s) -> @@ -114,30 +141,49 @@ benches = bgroup "compare" , bench "regex-applicative S" $ nf uriRA s , bench "regex-tdfa T" $ nf uriTDFA t , bench "regex-pcre-builtin BS" $ nf uriPCRE b + , bench "pcre-heavy T" $ nf uriPCREHeavy t + -- , bench "pcre2 T" $ nf uriPcre2 t , testGroup "tests" [ testCase "check count" $ length (uriPR t) @?= 4277 , testCase "S == T" $ map uriS2T (uriPRS s) @?= uriPR t , testCase "regex-applicative ==" $ map uriS2T (uriRA s) @?= uriPR t , testCase "regex-tdfa ==" $ uriTDFA t @?= uriPR t , testCase "regex-pcre-builtin ==" $ uriPCRE b @?= map uriT2BS (uriPR t) + + -- Only check length. Comparing results fails because we cannot + -- distinguish between optional no capture and empty capture using + -- pcre-heavy. + , testCase "pcre-heavy ==" $ length (uriPCREHeavy t) @?= 4277 + + -- Exception: pcre2: UTF-8 error: isolated byte with 0x80 bit set + -- , testCase "pcre2 ==" $ uriPcre2 t @?= uriPR t ] ] , bgroup "Exponential backtracking" - [ bench "parser-regex T" $ whnf expPR expText - , bench "parser-regex S" $ whnf expPRS expString - , bench "regex-applicative S" $ whnf expRA expString - , bench "regex-tdfa T" $ whnf expTDFA expText - , bench "regex-pcre-builtin BS" $ whnf expPCRE expBS + [ bench "parser-regex T" $ nf expPR expText + , bench "parser-regex S" $ nf expPRS expString + , bench "regex-applicative S" $ nf expRA expString + , bench "regex-tdfa T" $ nf expTDFA expText + , bench "regex-pcre-builtin BS" $ nf expPCRE expBS + , bench "pcre-heavy T" $ nf expPCREHeavy expText + , bench "pcre2 T" $ nf expPcre2 expText , testGroup "tests" [ testCase "parser-regex T True" $ expPR expText @?= True , testCase "parser-regex S True" $ expPRS expString @?= True , testCase "regex-applicative True" $ expRA expString @?= True , testCase "regex-tdfa True" $ expTDFA expText @?= True , testCase "regex-pcre-builtin True" $ expPCRE expBS @?= True + , testCase "pcre-heavy True" $ expPCREHeavy expText @?= True + , testCase "pcre2 True" $ expPcre2 expText @?= True ] ] ] +-- Need to perform GC to get correct memory stats +-- See https://github.com/Bodigrim/tasty-bench/issues/62 +nf :: NFData b => (a -> b) -> a -> Benchmarkable +nf f = whnfAppIO $ \x -> case rnf (f x) of () -> performMinorGC + ------------------- -- English text 1 ------------------- @@ -182,6 +228,17 @@ english1PCRE = map (fst . (! 0)) . RBase.matchAllText re re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt ("Tom|Sawyer|Huckleberry|Finn" :: ByteString) +-- pcre-heavy +english1PCREHeavy :: Text -> [Text] +english1PCREHeavy = map fst . Heavy.scan re + where + re = either error id $ + Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") [] + +-- pcre2 +english1Pcre2 :: Text -> [Text] +english1Pcre2 = Pcre2.match "Tom|Sawyer|Huckleberry|Finn" + ------------------- -- English text 2 ------------------- @@ -235,6 +292,17 @@ english2PCRE = map (fst . (! 0)) . RBase.matchAllText re re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt (TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”") +-- pcre-heavy +english2PCREHeavy :: Text -> [Text] +english2PCREHeavy = map fst . Heavy.scan re + where + re = either error id $ + Heavy.compileM (TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”") [] + +-- pcre2 +english2Pcre2 :: Text -> [Text] +english2Pcre2 = Pcre2.match "“[^?!.]{0,30}[?!.]”" + -------------------- -- English replace -------------------- @@ -262,7 +330,7 @@ englishReplaceRA = RA.replace $ <|> "Tom" <$ RA.string "Huckleberry" <|> "Sawyer" <$ RA.string "Finn" --- regex +-- regex-tdfa englishReplaceTDFA :: Text -> Text englishReplaceTDFA t = Replace.replaceAllCaptures Replace.TOP repl $ t TDFAReplace.*=~ re @@ -275,6 +343,7 @@ englishReplaceTDFA t = "Finn" -> Just "Sawyer" _ -> error "impossible" +-- regex-pcre-builtin englishReplacePCRE :: ByteString -> ByteString englishReplacePCRE t = Replace.replaceAllCaptures Replace.TOP repl $ t PCREReplace.*=~ re @@ -287,6 +356,26 @@ englishReplacePCRE t = "Finn" -> Just "Sawyer" _ -> error "impossible" +-- pcre-heavy +englishReplacePCREHeavy :: Text -> Text +englishReplacePCREHeavy = Heavy.gsub re repl + where + re = either error id $ + Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") [] + repl :: Text -> Text + repl cap = case cap of + "Tom" -> "Huckleberry" + "Sawyer" -> "Finn" + "Huckleberry" -> "Tom" + "Finn" -> "Sawyer" + _ -> error "impossible" + +-- pcre2 +englishReplacePcre2 :: Text -> Text +englishReplacePcre2 = Pcre2.gsub pat "${*MARK}" + where + pat = "(*MARK:Huckleberry)Tom|(*MARK:Finn)Sawyer|(*MARK:Tom)Huckleberry|(*MARK:Sawyer)Finn" + -------------------- -- CaseFolding.txt -------------------- @@ -346,7 +435,8 @@ caseFoldingRA = fromJust . RA.match re -- regex-tdfa caseFoldingTDFA :: Text -> [CaseFold] -caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re +caseFoldingTDFA = + map (matchTextToCaseFold textHexToChar T.words) . RBase.matchAllText re where re :: TDFA.Regex re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt $ @@ -356,11 +446,11 @@ caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|" , "(([0-9A-F]*); T; ([0-9A-F]*))" ] - toc = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0 -- regex-pcre-builtin caseFoldingPCRE :: ByteString -> [CaseFold] -caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re +caseFoldingPCRE = + map (matchTextToCaseFold bcHexToChar BC.words) . RBase.matchAllText re where re :: PCREBS.Regex re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt $ @@ -370,26 +460,68 @@ caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|" , "(([0-9A-F]*); T; ([0-9A-F]*))" ] - toc = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0 + +-- pcre-heavy +caseFoldingPCREHeavy :: Text -> [CaseFold] +caseFoldingPCREHeavy = map (listToCaseFold . extend13 . snd) . Heavy.scan re + where + re = either error id $ + Heavy.compileM + (B.concat + [ "(([0-9A-F]*); C; ([0-9A-F]*))|" + , "(([0-9A-F]*); S; ([0-9A-F]*))|" + , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|" + , "(([0-9A-F]*); T; ([0-9A-F]*))" + ]) [] + + -- The list does not extend beyond the last capture :< + extend13 = take 13 . (++ repeat T.empty) + +-- pcre2 +caseFoldingPcre2 :: Text -> [CaseFold] +caseFoldingPcre2 = map (listToCaseFold . NE.tail) . Pcre2.captures re + where + re = T.concat + [ "(([0-9A-F]*); C; ([0-9A-F]*))|" + , "(([0-9A-F]*); S; ([0-9A-F]*))|" + , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|" + , "(([0-9A-F]*); T; ([0-9A-F]*))" + ] + -- Note: regex with only submatches is incapable of parsing the nested -- space separated codes in the F case. -- So the string is captured and parsed after the regex delivers its results. -toCaseFold +bcHexToChar :: ByteString -> Char +bcHexToChar = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0 + +textHexToChar :: Text -> Char +textHexToChar = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0 + +matchTextToCaseFold :: (t -> Char) -- hex to Char -> (t -> [t]) -- words -> RBase.MatchText t -> CaseFold -toCaseFold toc ws m - | Just _ <- idxMay 1 = Common (toc (idx 2)) (toc (idx 3)) - | Just _ <- idxMay 4 = Simple (toc (idx 5)) (toc (idx 6)) - | Just _ <- idxMay 7 = Full (toc (idx 8)) (tocs (idx 9)) - | otherwise = Turkic (toc (idx 12)) (toc (idx 13)) +matchTextToCaseFold toc ws m + | Just _ <- idxMay 1 = Common (toc (idx 2)) (toc (idx 3)) + | Just _ <- idxMay 4 = Simple (toc (idx 5)) (toc (idx 6)) + | Just _ <- idxMay 7 = Full (toc (idx 8)) (map toc (ws (idx 9))) + | Just _ <- idxMay 11 = Turkic (toc (idx 12)) (toc (idx 13)) + | otherwise = error "impossible" where idx i = fst (m ! i) idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t - tocs = map toc . ws + +listToCaseFold :: [Text] -> CaseFold +listToCaseFold [x1,x2,x3,x4,x5,x6,x7,x8,x9,_x10,x11,x12,x13] + | not (T.null x1) = Common (textHexToChar x2) (textHexToChar x3) + | not (T.null x4) = Simple (textHexToChar x5) (textHexToChar x6) + | not (T.null x7) = Full (textHexToChar x8) (map textHexToChar (T.words x9)) + | not (T.null x11) = Turkic (textHexToChar x12) (textHexToChar x13) + | otherwise = error "impossible" +listToCaseFold x = error $ show $ length x -------- -- URI @@ -459,7 +591,7 @@ uriRA = fromJust . RA.match re -- regex-tdfa uriTDFA :: Text -> [URI Text] -uriTDFA = map toURI . RBase.matchAllText re +uriTDFA = map matchTextToURI . RBase.matchAllText re where re :: TDFA.Regex re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt @@ -467,18 +599,43 @@ uriTDFA = map toURI . RBase.matchAllText re -- regex-pcre-builtin uriPCRE :: ByteString -> [URI ByteString] -uriPCRE = map toURI . RBase.matchAllText re +uriPCRE = map matchTextToURI . RBase.matchAllText re where re :: PCREBS.Regex re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt ("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString) -toURI :: RBase.MatchText t -> URI t -toURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9) +matchTextToURI :: RBase.MatchText t -> URI t +matchTextToURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9) where idx i = fst (m ! i) idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t +-- pcre-heavy +uriPCREHeavy :: Text -> [URI Text] +uriPCREHeavy = map (listToURI . extend9 . snd) . Heavy.scan re + where + re = either error id $ + Heavy.compileM + ("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString) [] + + -- The list does not extend beyond the last capture :< + extend9 = take 9 . (++ repeat T.empty) + +-- pcre2 +uriPcre2 :: Text -> [URI Text] +uriPcre2 = map (listToURI . NE.tail) . Pcre2.captures re + where + re = "href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" + +listToURI :: [Text] -> URI Text +listToURI [_x1,x2,_x3,x4,x5,_x6,x7,_x8,x9] = + URI (notEmptyT x2) (notEmptyT x4) x5 (notEmptyT x7) (notEmptyT x9) + where + notEmptyT "" = Nothing + notEmptyT t = Just t +listToURI _ = error "impossible" + ----------------------------- -- Exponential backtracking ----------------------------- @@ -534,6 +691,19 @@ expPCRE = RBase.matchTest re re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt (BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"]) +-- pcre-heavy +expPCREHeavy :: Text -> Bool +expPCREHeavy = (Heavy.=~ re) + where + re = either error id $ + Heavy.compileM + (BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"]) [] + +-- pcre2 +expPcre2 :: Text -> Bool +expPcre2 = Pcre2.matches + ("^" <> T.replicate expN "a?" <> T.replicate expN "a" <> "$") + --------------- -- File utils --------------- @@ -570,4 +740,3 @@ replicateAppendMRA n0 re re1 = go n0 where go 0 = re1 go n = liftA2 (:) re (go (n-1)) - diff --git a/bench/README.md b/bench/README.md index f131440..67d4f65 100644 --- a/bench/README.md +++ b/bench/README.md @@ -6,92 +6,134 @@ A comparison of some Haskell regex libraries: * [`regex-applicative`](https://hackage.haskell.org/package/regex-applicative) * [`regex-tdfa`](https://hackage.haskell.org/package/regex-tdfa) * [`regex-pcre-builtin`](https://hackage.haskell.org/package/regex-pcre-builtin) - -| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` | -| --- | --- | --- | --- | --- | -| Regex construction | Combinators | Combinators | Pattern | Pattern | -| Unicode aware | Yes | Yes | Yes | No[1] | -| Parsing features | Yes | Yes | Submatch only | Submatch only | -| Extensions | No | No | No | Yes (lookahead, backreferences, etc.) | -| Text matching speed
(`English text 1,2`) | Baseline | Slower | Faster | Very fast | -| Text replace speed
(`English replace all`) | Baseline | Slower | Slower[2] | Comparable[3] | -| Parsing speed
(`CaseFolding.txt`,`URI`) | Baseline | Slower | Comparable | Very fast | -| Regex compilation complexity | $O(m)$ | Undocumented, $O(m^2)$ judging by source code | Undocumented | Undocumented | -| Parsing complexity | $O(mn \log m)$ | Documented "roughly linear complexity", $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed[4] | Undocumented, $O(2^n)$ seen experimentally | +* [`pcre-heavy`](https://hackage.haskell.org/package/pcre-heavy) +* [`pcre2`](https://hackage.haskell.org/package/pcre2) + +| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` | `pcre-heavy` | `pcre2` | +| --- | --- | --- | --- | --- | --- | --- | +| Regex construction | Combinators | Combinators | Pattern | Pattern | Pattern | Pattern | +| Unicode aware | Yes | Yes | Yes | No[1] | No | Yes | +| Parsing features | Yes | Yes | Submatch only | Submatch only | Submatch only | Submatch only | +| Extensions | No | No | No | Yes[2] | Yes[2] | Yes[2] | +| Text matching speed
(`English text 1,2`) | Baseline | Very slow | Fast | Very fast | Very fast | Slow | +| Text replace speed
(`English replace all`) | Baseline | Slow | Slow[3] | Comparable[4] | Very fast | Fast | +| Parsing speed
(`CaseFolding.txt`,`URI`) | Baseline | Slow | Comparable | Very fast | Very fast | ⚠ UTF-8 error | +| Regex compilation complexity | $O(m)$ | $O(m^2)$ judging by source code | Unclear | Unclear | Unclear | Unclear | +| Parsing complexity | $O(mn \log m)$ | $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed[4] | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally | 1 [`regex-pcre-builtin#3`](https://github.com/audreyt/regex-pcre-builtin/issues/3) -2 Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex) -3 Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre) -4 I do not know if this is accurate, since $O(n)$ is only possible by spending - $O(2^m)$ on compilation, which libraries usually consider too great a cost. - `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/) - library, which claims $O(m^2 n)$ time. This could be true of `regex-tdfa` also. +2 Supports lookahead, backreferences, etc. +3 Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex) +4 Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre) +5 This is unlikely to be true, since $O(n)$ is only possible by + spending $O(2^m)$ on compilation, which libraries usually consider too great + a cost. `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/) + library, which claims $O(m^2 n)$ time. This may be true of `regex-tdfa` also. + +Classifications (time): <0.25x = Very fast, >0.25x and <0.5x = Fast, >0.5x +and <2x = Comparable, >2x and <4x = Slow, time >4x = Very slow ## Benchmarks Benchmarks of regex libraries on some simple use cases. See `Compare.hs` for details. -Performed using GHC 9.8.1. +Performed using GHC 9.10.1. The suffixes indicate the sequence used for the benchmarks, `T` for `Text`, `S` for `String`, `BS` for `ByteString`. #### English text 1 -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 55.6 ms ± 2.8 ms | 379 MB | 91 KB | 27 MB | -| parser-regex S | 72.2 ms ± 2.9 ms | 366 MB | 90 KB | 27 MB | -| regex-applicative S | 371 ms ± 27 ms | 1.6 GB | 286 MB | 158 MB | -| regex-tdfa T | 38.6 ms ± 1.5 ms | 110 MB | 63 KB | 27 MB | -| regex-pcre-builtin BS | 13.4 ms ± 698 μs | 406 KB | 8.4 KB | 27 MB | +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 59.3 ms │ 380 MB │ 79 KB │ 27 MB │ +│ parser-regex S │ 83.2 ms │ 368 MB │ 83 KB │ 27 MB │ +│ regex-applicative S │ 385 ms │ 1.6 GB │ 286 MB │ 157 MB │ +│ regex-tdfa T │ 38.2 ms │ 111 MB │ 11 KB │ 27 MB │ +│ regex-pcre-builtin BS │ 13.4 ms │ 708 KB │ 598 B │ 27 MB │ +│ pcre-heavy T │ 13.5 ms │ 1.3 MB │ 620 B │ 27 MB │ +│ pcre2 T │ 228 ms │ 1.9 MB │ 2.5 KB │ 27 MB │ +└────────────────────────────────────────────────────────────┘ +``` #### English text 2 -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 49.6 ms ± 2.8 ms | 315 MB | 102 KB | 27 MB | -| parser-regex S | 49.6 ms ± 2.8 ms | 319 MB | 891 KB | 28 MB | -| regex-applicative S | 379 ms ± 15 ms | 2.1 GB | 284 MB | 217 MB | -| regex-tdfa T | 27.2 ms ± 1.4 ms | 112 MB | 33 KB | 28 MB | -| regex-pcre-builtin BS | 388 μs ± 30 μs | 298 KB | 3.1 KB | 27 MB | + +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 50.6 ms │ 315 MB │ 96 KB │ 27 MB │ +│ parser-regex S │ 52.0 ms │ 320 MB │ 974 KB │ 28 MB │ +│ regex-applicative S │ 399 ms │ 2.1 GB │ 284 MB │ 239 MB │ +│ regex-tdfa T │ 26.4 ms │ 113 MB │ 12 KB │ 28 MB │ +│ regex-pcre-builtin BS │ 378 μs │ 277 KB │ 104 B │ 27 MB │ +│ pcre-heavy T │ 418 μs │ 788 KB │ 102 B │ 27 MB │ +│ pcre2 T │ 96.4 ms │ 1.0 MB │ 2.4 KB │ 27 MB │ +└────────────────────────────────────────────────────────────┘ +``` #### English replace all -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 81.4 ms ± 2.9 ms | 357 MB | 65 MB | 73 MB | -| parser-regex S | 121 ms ± 5.9 ms | 396 MB | 100 MB | 93 MB | -| regex-applicative S | 368 ms ± 5.3 ms | 2.2 GB | 63 MB | 50 MB | -| regex-tdfa T | 200 ms ± 17 ms | 696 MB | 29 MB | 923 MB | -| regex-pcre-builtin BS | 122 ms ± 12 ms | 586 MB | 29 MB | 921 MB | +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 84.0 ms │ 358 MB │ 65 MB │ 73 MB │ +│ parser-regex S │ 121 ms │ 396 MB │ 93 MB │ 90 MB │ +│ regex-applicative S │ 386 ms │ 2.2 GB │ 63 MB │ 40 MB │ +│ regex-tdfa T │ 187 ms │ 696 MB │ 26 MB │ 758 MB │ +│ regex-pcre-builtin BS │ 125 ms │ 586 MB │ 44 MB │ 490 MB │ +│ pcre-heavy T │ 22.7 ms │ 378 MB │ 13 KB │ 28 MB │ +│ pcre2 T │ 36.0 ms │ 1.6 MB │ 2.1 KB │ 27 MB │ +└────────────────────────────────────────────────────────────┘ +``` #### Parse CaseFolding.txt -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 60.1 ms ± 3.9 ms | 324 MB | 2.2 MB | 14 MB | -| parser-regex S | 60.1 ms ± 3.6 ms | 321 MB | 1.7 MB | 11 MB | -| regex-applicative S | 143 ms ± 4.2 ms | 921 MB | 48 MB | 65 MB | -| regex-tdfa T | 39.7 ms ± 1.3 ms | 108 MB | 168 KB | 11 MB | -| regex-pcre-builtin BS | 12.3 ms ± 683 μs | 4.0 MB | 402 KB | 11 MB | +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 58.7 ms │ 325 MB │ 1.8 MB │ 11 MB │ +│ parser-regex S │ 59.8 ms │ 322 MB │ 2.3 MB │ 13 MB │ +│ regex-applicative S │ 146 ms │ 922 MB │ 48 MB │ 67 MB │ +│ regex-tdfa T │ 40.6 ms │ 108 MB │ 83 KB │ 11 MB │ +│ regex-pcre-builtin BS │ 12.1 ms │ 4.3 MB │ 63 KB │ 10 MB │ +│ pcre-heavy T │ 12.4 ms │ 5.2 MB │ 1.2 KB │ 10 MB │ +│ pcre2 T │ 115 ms │ 9.7 MB │ 6.0 KB │ 10 MB │ +└────────────────────────────────────────────────────────────┘ +``` #### Parse URI -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 70.5 ms ± 3.8 ms | 454 MB | 3.3 MB | 33 MB | -| parser-regex S | 80.9 ms ± 3.4 ms | 439 MB | 9.5 MB | 35 MB | -| regex-applicative S | 426 ms ± 20 ms | 2.1 GB | 284 MB | 165 MB | -| regex-tdfa T | 192 ms ± 3.4 ms | 246 MB | 579 KB | 32 MB | -| regex-pcre-builtin BS | 7.13 ms ± 259 μs | 13 MB | 9.2 MB | 48 MB | +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 73.8 ms │ 455 MB │ 3.9 MB │ 34 MB │ +│ parser-regex S │ 86.6 ms │ 441 MB │ 11 MB │ 38 MB │ +│ regex-applicative S │ 451 ms │ 2.1 GB │ 288 MB │ 162 MB │ +│ regex-tdfa T │ 193 ms │ 246 MB │ 100 KB │ 31 MB │ +│ regex-pcre-builtin BS │ 6.52 ms │ 12 MB │ 7.3 MB │ 46 MB │ +│ pcre-heavy T │ 3.84 ms │ 15 MB │ 385 KB │ 59 MB │ +└────────────────────────────────────────────────────────────┘ +``` #### Exponential backtracking -| Library | Time | Alloc | Copied | Peak | -| --- | --- | --- | --- | --- | -| parser-regex T | 17.1 μs ± 1.3 μs | 95 KB | 135 B | 7.0 MB | -| parser-regex S | 16.8 μs ± 1.4 μs | 99 KB | 137 B | 7.0 MB | -| regex-applicative S | 20.5 μs ± 893 ns | 44 KB | 8 B | 7.0 MB | -| regex-tdfa T | 401 ns ± 22 ns | 3.6 KB | 0 B | 9.0 MB | -| regex-pcre-builtin BS | 160 ms ± 6.6 ms | 0 B | 0 B | 6.0 MB | +``` +┌────────────────────────────────────────────────────────────┐ +│ │ Time │ Alloc │ Copied │ Peak │ +│───────────────────────│─────────│────────│────────│────────│ +│ parser-regex T │ 18.5 μs │ 96 KB │ 88 B │ 6.0 MB │ +│ parser-regex S │ 18.0 μs │ 100 KB │ 89 B │ 6.0 MB │ +│ regex-applicative S │ 22.9 μs │ 44 KB │ 89 B │ 7.0 MB │ +│ regex-tdfa T │ 2.37 μs │ 3.6 KB │ 88 B │ 8.0 MB │ +│ regex-pcre-builtin BS │ 161 ms │ 563 KB │ 7.6 KB │ 6.0 MB │ +│ pcre-heavy T │ 160 ms │ 562 KB │ 51 KB │ 6.0 MB │ +│ pcre2 T │ 257 ms │ 114 KB │ 49 KB │ 6.0 MB │ +└────────────────────────────────────────────────────────────┘ +``` diff --git a/bench/bench.cabal b/bench/bench.cabal index e9e96a5..e9e792f 100644 --- a/bench/bench.cabal +++ b/bench/bench.cabal @@ -26,6 +26,8 @@ benchmark bench , text , parser-regex + , pcre-heavy == 1.0.0.3 + , pcre2 == 2.2.1 , regex == 1.1.0.2 , regex-applicative == 0.3.4 , regex-base