diff --git a/bench/Compare.hs b/bench/Compare.hs
index ba1b1a4..afea987 100644
--- a/bench/Compare.hs
+++ b/bench/Compare.hs
@@ -3,13 +3,14 @@
{-# LANGUAGE OverloadedStrings #-}
module Compare (benches) where
-import Control.Applicative
-import Control.DeepSeq
-import Control.Monad
-import Data.Char
-import Data.Array
+import Control.Applicative (Alternative(..), optional)
+import Control.DeepSeq (NFData(..))
+import Control.Monad (replicateM_)
+import Data.Char (digitToInt, chr)
+import Data.Array ((!))
import qualified Data.Foldable as F
-import Data.Maybe
+import qualified Data.List.NonEmpty as NE
+import Data.Maybe (fromJust)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
@@ -17,9 +18,10 @@ import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TEnc
import GHC.Generics (Generic)
+import System.Mem (performMinorGC)
import Test.Tasty (testGroup)
-import Test.Tasty.Bench
+import Test.Tasty.Bench hiding (nf)
import Test.Tasty.HUnit (testCase, (@?=))
-- parser-regex
@@ -45,6 +47,12 @@ import qualified Text.RE.TDFA.Text as TDFAReplace
-- regex-with-pcre
import qualified Text.RE.PCRE.ByteString as PCREReplace
+-- pcre-heavy
+import qualified Text.Regex.PCRE.Heavy as Heavy
+
+-- pcre2
+import qualified Text.Regex.Pcre2 as Pcre2
+
benches :: Benchmark
benches = bgroup "compare"
[ env englishText $ \ ~(t,b,s) ->
@@ -54,12 +62,16 @@ benches = bgroup "compare"
, bench "regex-applicative S" $ nf english1RA s
, bench "regex-tdfa T" $ nf english1TDFA t
, bench "regex-pcre-builtin BS" $ nf english1PCRE b
+ , bench "pcre-heavy T" $ nf english1PCREHeavy t
+ , bench "pcre2 T" $ nf english1Pcre2 t
, testGroup "tests"
[ testCase "check count" $ length (english1PR t) @?= 900
, testCase "S == L" $ map T.pack (english1PRS s) @?= english1PR t
, testCase "regex-applicative ==" $ map T.pack (english1RA s) @?= english1PR t
, testCase "regex-tdfa ==" $ english1TDFA t @?= english1PR t
, testCase "regex-pcre-builtin ==" $ map TEnc.decodeUtf8 (english1PCRE b) @?= english1PR t
+ , testCase "pcre-heavy ==" $ english1PCREHeavy t @?= english1PR t
+ , testCase "pcre2 ==" $ english1Pcre2 t @?= english1PR t
]
]
, env englishText $ \ ~(t,b,s) ->
@@ -69,13 +81,20 @@ benches = bgroup "compare"
, bench "regex-applicative S" $ nf english2RA s
, bench "regex-tdfa T" $ nf english2TDFA t
, bench "regex-pcre-builtin BS" $ nf english2PCRE b
+ , bench "pcre-heavy T" $ nf english2PCREHeavy t
+ , bench "pcre2 T" $ nf english2Pcre2 t
, testGroup "tests"
[ testCase "check count" $ length (english2PR t) @?= 365
, testCase "S == T" $ map T.pack (english2PRS s) @?= english2PR t
, testCase "regex-applicative ==" $ map T.pack (english2RA s) @?= english2PR t
, testCase "regex-tdfa ==" $ english2TDFA t @?= english2PR t
+
+ -- Cannot compare results for regex-pcre-builtin and pcre-heavy. Their
+ -- counts don't match because they match bytes and not Chars.
, testCase "regex-pcre-builtin ==" $ length (english2PCRE b) @?= 354
- -- pcre count doesn't match because it matches bytes and not Chars
+ , testCase "pcre-heavy ==" $ length (english2PCREHeavy t) @?= 354
+
+ , testCase "pcre2 ==" $ english2Pcre2 t @?= english2PR t
]
]
, env englishText $ \ ~(t,b,s) ->
@@ -85,11 +104,15 @@ benches = bgroup "compare"
, bench "regex-applicative S" $ nf englishReplaceRA s
, bench "regex-tdfa T" $ nf englishReplaceTDFA t
, bench "regex-pcre-builtin BS" $ nf englishReplacePCRE b
+ , bench "pcre-heavy T" $ nf englishReplacePCREHeavy t
+ , bench "pcre2 T" $ nf englishReplacePcre2 t
, testGroup "tests"
[ testCase "S == T" $ T.pack (englishReplacePRS s) @?= englishReplacePR t
, testCase "regex-applicative ==" $ T.pack (englishReplaceRA s) @?= englishReplacePR t
, testCase "regex-tdfa ==" $ englishReplaceTDFA t @?= englishReplacePR t
, testCase "regex-pcre-builtin ==" $ englishReplacePCRE b @?= TEnc.encodeUtf8 (englishReplacePR t)
+ , testCase "pcre-heavy ==" $ englishReplacePCREHeavy t @?= englishReplacePR t
+ , testCase "pcre2 ==" $ englishReplacePcre2 t @?= englishReplacePR t
]
]
, env caseFoldingTxt $ \ ~(t,b,s) ->
@@ -99,12 +122,16 @@ benches = bgroup "compare"
, bench "regex-applicative S" $ nf caseFoldingRA s
, bench "regex-tdfa T" $ nf caseFoldingTDFA t
, bench "regex-pcre-builtin BS" $ nf caseFoldingPCRE b
+ , bench "pcre-heavy T" $ nf caseFoldingPCREHeavy t
+ , bench "pcre2 T" $ nf caseFoldingPcre2 t
, testGroup "tests"
[ testCase "check count" $ length (caseFoldingPR t) @?= 1563
, testCase "S == T" $ caseFoldingPRS s @?= caseFoldingPR t
, testCase "regex-applicative ==" $ caseFoldingRA s @?= caseFoldingPR t
, testCase "regex-tdfa ==" $ caseFoldingTDFA t @?= caseFoldingPR t
, testCase "regex-pcre-builtin ==" $ caseFoldingPCRE b @?= caseFoldingPR t
+ , testCase "pcre-heavy ==" $ caseFoldingPCREHeavy t @?= caseFoldingPR t
+ , testCase "pcre2 ==" $ caseFoldingPcre2 t @?= caseFoldingPcre2 t
]
]
, env htmlText $ \ ~(t,b,s) ->
@@ -114,30 +141,49 @@ benches = bgroup "compare"
, bench "regex-applicative S" $ nf uriRA s
, bench "regex-tdfa T" $ nf uriTDFA t
, bench "regex-pcre-builtin BS" $ nf uriPCRE b
+ , bench "pcre-heavy T" $ nf uriPCREHeavy t
+ -- , bench "pcre2 T" $ nf uriPcre2 t
, testGroup "tests"
[ testCase "check count" $ length (uriPR t) @?= 4277
, testCase "S == T" $ map uriS2T (uriPRS s) @?= uriPR t
, testCase "regex-applicative ==" $ map uriS2T (uriRA s) @?= uriPR t
, testCase "regex-tdfa ==" $ uriTDFA t @?= uriPR t
, testCase "regex-pcre-builtin ==" $ uriPCRE b @?= map uriT2BS (uriPR t)
+
+ -- Only check length. Comparing results fails because we cannot
+ -- distinguish between optional no capture and empty capture using
+ -- pcre-heavy.
+ , testCase "pcre-heavy ==" $ length (uriPCREHeavy t) @?= 4277
+
+ -- Exception: pcre2: UTF-8 error: isolated byte with 0x80 bit set
+ -- , testCase "pcre2 ==" $ uriPcre2 t @?= uriPR t
]
]
, bgroup "Exponential backtracking"
- [ bench "parser-regex T" $ whnf expPR expText
- , bench "parser-regex S" $ whnf expPRS expString
- , bench "regex-applicative S" $ whnf expRA expString
- , bench "regex-tdfa T" $ whnf expTDFA expText
- , bench "regex-pcre-builtin BS" $ whnf expPCRE expBS
+ [ bench "parser-regex T" $ nf expPR expText
+ , bench "parser-regex S" $ nf expPRS expString
+ , bench "regex-applicative S" $ nf expRA expString
+ , bench "regex-tdfa T" $ nf expTDFA expText
+ , bench "regex-pcre-builtin BS" $ nf expPCRE expBS
+ , bench "pcre-heavy T" $ nf expPCREHeavy expText
+ , bench "pcre2 T" $ nf expPcre2 expText
, testGroup "tests"
[ testCase "parser-regex T True" $ expPR expText @?= True
, testCase "parser-regex S True" $ expPRS expString @?= True
, testCase "regex-applicative True" $ expRA expString @?= True
, testCase "regex-tdfa True" $ expTDFA expText @?= True
, testCase "regex-pcre-builtin True" $ expPCRE expBS @?= True
+ , testCase "pcre-heavy True" $ expPCREHeavy expText @?= True
+ , testCase "pcre2 True" $ expPcre2 expText @?= True
]
]
]
+-- Need to perform GC to get correct memory stats
+-- See https://github.com/Bodigrim/tasty-bench/issues/62
+nf :: NFData b => (a -> b) -> a -> Benchmarkable
+nf f = whnfAppIO $ \x -> case rnf (f x) of () -> performMinorGC
+
-------------------
-- English text 1
-------------------
@@ -182,6 +228,17 @@ english1PCRE = map (fst . (! 0)) . RBase.matchAllText re
re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
("Tom|Sawyer|Huckleberry|Finn" :: ByteString)
+-- pcre-heavy
+english1PCREHeavy :: Text -> [Text]
+english1PCREHeavy = map fst . Heavy.scan re
+ where
+ re = either error id $
+ Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") []
+
+-- pcre2
+english1Pcre2 :: Text -> [Text]
+english1Pcre2 = Pcre2.match "Tom|Sawyer|Huckleberry|Finn"
+
-------------------
-- English text 2
-------------------
@@ -235,6 +292,17 @@ english2PCRE = map (fst . (! 0)) . RBase.matchAllText re
re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt
(TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”")
+-- pcre-heavy
+english2PCREHeavy :: Text -> [Text]
+english2PCREHeavy = map fst . Heavy.scan re
+ where
+ re = either error id $
+ Heavy.compileM (TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”") []
+
+-- pcre2
+english2Pcre2 :: Text -> [Text]
+english2Pcre2 = Pcre2.match "“[^?!.]{0,30}[?!.]”"
+
--------------------
-- English replace
--------------------
@@ -262,7 +330,7 @@ englishReplaceRA = RA.replace $
<|> "Tom" <$ RA.string "Huckleberry"
<|> "Sawyer" <$ RA.string "Finn"
--- regex
+-- regex-tdfa
englishReplaceTDFA :: Text -> Text
englishReplaceTDFA t =
Replace.replaceAllCaptures Replace.TOP repl $ t TDFAReplace.*=~ re
@@ -275,6 +343,7 @@ englishReplaceTDFA t =
"Finn" -> Just "Sawyer"
_ -> error "impossible"
+-- regex-pcre-builtin
englishReplacePCRE :: ByteString -> ByteString
englishReplacePCRE t =
Replace.replaceAllCaptures Replace.TOP repl $ t PCREReplace.*=~ re
@@ -287,6 +356,26 @@ englishReplacePCRE t =
"Finn" -> Just "Sawyer"
_ -> error "impossible"
+-- pcre-heavy
+englishReplacePCREHeavy :: Text -> Text
+englishReplacePCREHeavy = Heavy.gsub re repl
+ where
+ re = either error id $
+ Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") []
+ repl :: Text -> Text
+ repl cap = case cap of
+ "Tom" -> "Huckleberry"
+ "Sawyer" -> "Finn"
+ "Huckleberry" -> "Tom"
+ "Finn" -> "Sawyer"
+ _ -> error "impossible"
+
+-- pcre2
+englishReplacePcre2 :: Text -> Text
+englishReplacePcre2 = Pcre2.gsub pat "${*MARK}"
+ where
+ pat = "(*MARK:Huckleberry)Tom|(*MARK:Finn)Sawyer|(*MARK:Tom)Huckleberry|(*MARK:Sawyer)Finn"
+
--------------------
-- CaseFolding.txt
--------------------
@@ -346,7 +435,8 @@ caseFoldingRA = fromJust . RA.match re
-- regex-tdfa
caseFoldingTDFA :: Text -> [CaseFold]
-caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re
+caseFoldingTDFA =
+ map (matchTextToCaseFold textHexToChar T.words) . RBase.matchAllText re
where
re :: TDFA.Regex
re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt $
@@ -356,11 +446,11 @@ caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re
, "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
, "(([0-9A-F]*); T; ([0-9A-F]*))"
]
- toc = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0
-- regex-pcre-builtin
caseFoldingPCRE :: ByteString -> [CaseFold]
-caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re
+caseFoldingPCRE =
+ map (matchTextToCaseFold bcHexToChar BC.words) . RBase.matchAllText re
where
re :: PCREBS.Regex
re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt $
@@ -370,26 +460,68 @@ caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re
, "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
, "(([0-9A-F]*); T; ([0-9A-F]*))"
]
- toc = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+-- pcre-heavy
+caseFoldingPCREHeavy :: Text -> [CaseFold]
+caseFoldingPCREHeavy = map (listToCaseFold . extend13 . snd) . Heavy.scan re
+ where
+ re = either error id $
+ Heavy.compileM
+ (B.concat
+ [ "(([0-9A-F]*); C; ([0-9A-F]*))|"
+ , "(([0-9A-F]*); S; ([0-9A-F]*))|"
+ , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
+ , "(([0-9A-F]*); T; ([0-9A-F]*))"
+ ]) []
+
+ -- The list does not extend beyond the last capture :<
+ extend13 = take 13 . (++ repeat T.empty)
+
+-- pcre2
+caseFoldingPcre2 :: Text -> [CaseFold]
+caseFoldingPcre2 = map (listToCaseFold . NE.tail) . Pcre2.captures re
+ where
+ re = T.concat
+ [ "(([0-9A-F]*); C; ([0-9A-F]*))|"
+ , "(([0-9A-F]*); S; ([0-9A-F]*))|"
+ , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
+ , "(([0-9A-F]*); T; ([0-9A-F]*))"
+ ]
+
-- Note: regex with only submatches is incapable of parsing the nested
-- space separated codes in the F case.
-- So the string is captured and parsed after the regex delivers its results.
-toCaseFold
+bcHexToChar :: ByteString -> Char
+bcHexToChar = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+textHexToChar :: Text -> Char
+textHexToChar = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+matchTextToCaseFold
:: (t -> Char) -- hex to Char
-> (t -> [t]) -- words
-> RBase.MatchText t
-> CaseFold
-toCaseFold toc ws m
- | Just _ <- idxMay 1 = Common (toc (idx 2)) (toc (idx 3))
- | Just _ <- idxMay 4 = Simple (toc (idx 5)) (toc (idx 6))
- | Just _ <- idxMay 7 = Full (toc (idx 8)) (tocs (idx 9))
- | otherwise = Turkic (toc (idx 12)) (toc (idx 13))
+matchTextToCaseFold toc ws m
+ | Just _ <- idxMay 1 = Common (toc (idx 2)) (toc (idx 3))
+ | Just _ <- idxMay 4 = Simple (toc (idx 5)) (toc (idx 6))
+ | Just _ <- idxMay 7 = Full (toc (idx 8)) (map toc (ws (idx 9)))
+ | Just _ <- idxMay 11 = Turkic (toc (idx 12)) (toc (idx 13))
+ | otherwise = error "impossible"
where
idx i = fst (m ! i)
idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t
- tocs = map toc . ws
+
+listToCaseFold :: [Text] -> CaseFold
+listToCaseFold [x1,x2,x3,x4,x5,x6,x7,x8,x9,_x10,x11,x12,x13]
+ | not (T.null x1) = Common (textHexToChar x2) (textHexToChar x3)
+ | not (T.null x4) = Simple (textHexToChar x5) (textHexToChar x6)
+ | not (T.null x7) = Full (textHexToChar x8) (map textHexToChar (T.words x9))
+ | not (T.null x11) = Turkic (textHexToChar x12) (textHexToChar x13)
+ | otherwise = error "impossible"
+listToCaseFold x = error $ show $ length x
--------
-- URI
@@ -459,7 +591,7 @@ uriRA = fromJust . RA.match re
-- regex-tdfa
uriTDFA :: Text -> [URI Text]
-uriTDFA = map toURI . RBase.matchAllText re
+uriTDFA = map matchTextToURI . RBase.matchAllText re
where
re :: TDFA.Regex
re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
@@ -467,18 +599,43 @@ uriTDFA = map toURI . RBase.matchAllText re
-- regex-pcre-builtin
uriPCRE :: ByteString -> [URI ByteString]
-uriPCRE = map toURI . RBase.matchAllText re
+uriPCRE = map matchTextToURI . RBase.matchAllText re
where
re :: PCREBS.Regex
re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt
("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString)
-toURI :: RBase.MatchText t -> URI t
-toURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9)
+matchTextToURI :: RBase.MatchText t -> URI t
+matchTextToURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9)
where
idx i = fst (m ! i)
idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t
+-- pcre-heavy
+uriPCREHeavy :: Text -> [URI Text]
+uriPCREHeavy = map (listToURI . extend9 . snd) . Heavy.scan re
+ where
+ re = either error id $
+ Heavy.compileM
+ ("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString) []
+
+ -- The list does not extend beyond the last capture :<
+ extend9 = take 9 . (++ repeat T.empty)
+
+-- pcre2
+uriPcre2 :: Text -> [URI Text]
+uriPcre2 = map (listToURI . NE.tail) . Pcre2.captures re
+ where
+ re = "href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\""
+
+listToURI :: [Text] -> URI Text
+listToURI [_x1,x2,_x3,x4,x5,_x6,x7,_x8,x9] =
+ URI (notEmptyT x2) (notEmptyT x4) x5 (notEmptyT x7) (notEmptyT x9)
+ where
+ notEmptyT "" = Nothing
+ notEmptyT t = Just t
+listToURI _ = error "impossible"
+
-----------------------------
-- Exponential backtracking
-----------------------------
@@ -534,6 +691,19 @@ expPCRE = RBase.matchTest re
re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
(BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"])
+-- pcre-heavy
+expPCREHeavy :: Text -> Bool
+expPCREHeavy = (Heavy.=~ re)
+ where
+ re = either error id $
+ Heavy.compileM
+ (BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"]) []
+
+-- pcre2
+expPcre2 :: Text -> Bool
+expPcre2 = Pcre2.matches
+ ("^" <> T.replicate expN "a?" <> T.replicate expN "a" <> "$")
+
---------------
-- File utils
---------------
@@ -570,4 +740,3 @@ replicateAppendMRA n0 re re1 = go n0
where
go 0 = re1
go n = liftA2 (:) re (go (n-1))
-
diff --git a/bench/README.md b/bench/README.md
index f131440..67d4f65 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -6,92 +6,134 @@ A comparison of some Haskell regex libraries:
* [`regex-applicative`](https://hackage.haskell.org/package/regex-applicative)
* [`regex-tdfa`](https://hackage.haskell.org/package/regex-tdfa)
* [`regex-pcre-builtin`](https://hackage.haskell.org/package/regex-pcre-builtin)
-
-| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` |
-| --- | --- | --- | --- | --- |
-| Regex construction | Combinators | Combinators | Pattern | Pattern |
-| Unicode aware | Yes | Yes | Yes | No[1] |
-| Parsing features | Yes | Yes | Submatch only | Submatch only |
-| Extensions | No | No | No | Yes (lookahead, backreferences, etc.) |
-| Text matching speed
(`English text 1,2`) | Baseline | Slower | Faster | Very fast |
-| Text replace speed
(`English replace all`) | Baseline | Slower | Slower[2] | Comparable[3] |
-| Parsing speed
(`CaseFolding.txt`,`URI`) | Baseline | Slower | Comparable | Very fast |
-| Regex compilation complexity | $O(m)$ | Undocumented, $O(m^2)$ judging by source code | Undocumented | Undocumented |
-| Parsing complexity | $O(mn \log m)$ | Documented "roughly linear complexity", $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed[4] | Undocumented, $O(2^n)$ seen experimentally |
+* [`pcre-heavy`](https://hackage.haskell.org/package/pcre-heavy)
+* [`pcre2`](https://hackage.haskell.org/package/pcre2)
+
+| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` | `pcre-heavy` | `pcre2` |
+| --- | --- | --- | --- | --- | --- | --- |
+| Regex construction | Combinators | Combinators | Pattern | Pattern | Pattern | Pattern |
+| Unicode aware | Yes | Yes | Yes | No[1] | No | Yes |
+| Parsing features | Yes | Yes | Submatch only | Submatch only | Submatch only | Submatch only |
+| Extensions | No | No | No | Yes[2] | Yes[2] | Yes[2] |
+| Text matching speed
(`English text 1,2`) | Baseline | Very slow | Fast | Very fast | Very fast | Slow |
+| Text replace speed
(`English replace all`) | Baseline | Slow | Slow[3] | Comparable[4] | Very fast | Fast |
+| Parsing speed
(`CaseFolding.txt`,`URI`) | Baseline | Slow | Comparable | Very fast | Very fast | ⚠ UTF-8 error |
+| Regex compilation complexity | $O(m)$ | $O(m^2)$ judging by source code | Unclear | Unclear | Unclear | Unclear |
+| Parsing complexity | $O(mn \log m)$ | $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed[4] | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally |
1 [`regex-pcre-builtin#3`](https://github.com/audreyt/regex-pcre-builtin/issues/3)
-2 Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex)
-3 Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre)
-4 I do not know if this is accurate, since $O(n)$ is only possible by spending
- $O(2^m)$ on compilation, which libraries usually consider too great a cost.
- `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/)
- library, which claims $O(m^2 n)$ time. This could be true of `regex-tdfa` also.
+2 Supports lookahead, backreferences, etc.
+3 Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex)
+4 Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre)
+5 This is unlikely to be true, since $O(n)$ is only possible by
+ spending $O(2^m)$ on compilation, which libraries usually consider too great
+ a cost. `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/)
+ library, which claims $O(m^2 n)$ time. This may be true of `regex-tdfa` also.
+
+Classifications (time): <0.25x = Very fast, >0.25x and <0.5x = Fast, >0.5x
+and <2x = Comparable, >2x and <4x = Slow, time >4x = Very slow
## Benchmarks
Benchmarks of regex libraries on some simple use cases. See `Compare.hs` for
details.
-Performed using GHC 9.8.1.
+Performed using GHC 9.10.1.
The suffixes indicate the sequence used for the benchmarks, `T` for `Text`,
`S` for `String`, `BS` for `ByteString`.
#### English text 1
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 55.6 ms ± 2.8 ms | 379 MB | 91 KB | 27 MB |
-| parser-regex S | 72.2 ms ± 2.9 ms | 366 MB | 90 KB | 27 MB |
-| regex-applicative S | 371 ms ± 27 ms | 1.6 GB | 286 MB | 158 MB |
-| regex-tdfa T | 38.6 ms ± 1.5 ms | 110 MB | 63 KB | 27 MB |
-| regex-pcre-builtin BS | 13.4 ms ± 698 μs | 406 KB | 8.4 KB | 27 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 59.3 ms │ 380 MB │ 79 KB │ 27 MB │
+│ parser-regex S │ 83.2 ms │ 368 MB │ 83 KB │ 27 MB │
+│ regex-applicative S │ 385 ms │ 1.6 GB │ 286 MB │ 157 MB │
+│ regex-tdfa T │ 38.2 ms │ 111 MB │ 11 KB │ 27 MB │
+│ regex-pcre-builtin BS │ 13.4 ms │ 708 KB │ 598 B │ 27 MB │
+│ pcre-heavy T │ 13.5 ms │ 1.3 MB │ 620 B │ 27 MB │
+│ pcre2 T │ 228 ms │ 1.9 MB │ 2.5 KB │ 27 MB │
+└────────────────────────────────────────────────────────────┘
+```
#### English text 2
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 49.6 ms ± 2.8 ms | 315 MB | 102 KB | 27 MB |
-| parser-regex S | 49.6 ms ± 2.8 ms | 319 MB | 891 KB | 28 MB |
-| regex-applicative S | 379 ms ± 15 ms | 2.1 GB | 284 MB | 217 MB |
-| regex-tdfa T | 27.2 ms ± 1.4 ms | 112 MB | 33 KB | 28 MB |
-| regex-pcre-builtin BS | 388 μs ± 30 μs | 298 KB | 3.1 KB | 27 MB |
+
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 50.6 ms │ 315 MB │ 96 KB │ 27 MB │
+│ parser-regex S │ 52.0 ms │ 320 MB │ 974 KB │ 28 MB │
+│ regex-applicative S │ 399 ms │ 2.1 GB │ 284 MB │ 239 MB │
+│ regex-tdfa T │ 26.4 ms │ 113 MB │ 12 KB │ 28 MB │
+│ regex-pcre-builtin BS │ 378 μs │ 277 KB │ 104 B │ 27 MB │
+│ pcre-heavy T │ 418 μs │ 788 KB │ 102 B │ 27 MB │
+│ pcre2 T │ 96.4 ms │ 1.0 MB │ 2.4 KB │ 27 MB │
+└────────────────────────────────────────────────────────────┘
+```
#### English replace all
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 81.4 ms ± 2.9 ms | 357 MB | 65 MB | 73 MB |
-| parser-regex S | 121 ms ± 5.9 ms | 396 MB | 100 MB | 93 MB |
-| regex-applicative S | 368 ms ± 5.3 ms | 2.2 GB | 63 MB | 50 MB |
-| regex-tdfa T | 200 ms ± 17 ms | 696 MB | 29 MB | 923 MB |
-| regex-pcre-builtin BS | 122 ms ± 12 ms | 586 MB | 29 MB | 921 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 84.0 ms │ 358 MB │ 65 MB │ 73 MB │
+│ parser-regex S │ 121 ms │ 396 MB │ 93 MB │ 90 MB │
+│ regex-applicative S │ 386 ms │ 2.2 GB │ 63 MB │ 40 MB │
+│ regex-tdfa T │ 187 ms │ 696 MB │ 26 MB │ 758 MB │
+│ regex-pcre-builtin BS │ 125 ms │ 586 MB │ 44 MB │ 490 MB │
+│ pcre-heavy T │ 22.7 ms │ 378 MB │ 13 KB │ 28 MB │
+│ pcre2 T │ 36.0 ms │ 1.6 MB │ 2.1 KB │ 27 MB │
+└────────────────────────────────────────────────────────────┘
+```
#### Parse CaseFolding.txt
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 60.1 ms ± 3.9 ms | 324 MB | 2.2 MB | 14 MB |
-| parser-regex S | 60.1 ms ± 3.6 ms | 321 MB | 1.7 MB | 11 MB |
-| regex-applicative S | 143 ms ± 4.2 ms | 921 MB | 48 MB | 65 MB |
-| regex-tdfa T | 39.7 ms ± 1.3 ms | 108 MB | 168 KB | 11 MB |
-| regex-pcre-builtin BS | 12.3 ms ± 683 μs | 4.0 MB | 402 KB | 11 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 58.7 ms │ 325 MB │ 1.8 MB │ 11 MB │
+│ parser-regex S │ 59.8 ms │ 322 MB │ 2.3 MB │ 13 MB │
+│ regex-applicative S │ 146 ms │ 922 MB │ 48 MB │ 67 MB │
+│ regex-tdfa T │ 40.6 ms │ 108 MB │ 83 KB │ 11 MB │
+│ regex-pcre-builtin BS │ 12.1 ms │ 4.3 MB │ 63 KB │ 10 MB │
+│ pcre-heavy T │ 12.4 ms │ 5.2 MB │ 1.2 KB │ 10 MB │
+│ pcre2 T │ 115 ms │ 9.7 MB │ 6.0 KB │ 10 MB │
+└────────────────────────────────────────────────────────────┘
+```
#### Parse URI
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 70.5 ms ± 3.8 ms | 454 MB | 3.3 MB | 33 MB |
-| parser-regex S | 80.9 ms ± 3.4 ms | 439 MB | 9.5 MB | 35 MB |
-| regex-applicative S | 426 ms ± 20 ms | 2.1 GB | 284 MB | 165 MB |
-| regex-tdfa T | 192 ms ± 3.4 ms | 246 MB | 579 KB | 32 MB |
-| regex-pcre-builtin BS | 7.13 ms ± 259 μs | 13 MB | 9.2 MB | 48 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 73.8 ms │ 455 MB │ 3.9 MB │ 34 MB │
+│ parser-regex S │ 86.6 ms │ 441 MB │ 11 MB │ 38 MB │
+│ regex-applicative S │ 451 ms │ 2.1 GB │ 288 MB │ 162 MB │
+│ regex-tdfa T │ 193 ms │ 246 MB │ 100 KB │ 31 MB │
+│ regex-pcre-builtin BS │ 6.52 ms │ 12 MB │ 7.3 MB │ 46 MB │
+│ pcre-heavy T │ 3.84 ms │ 15 MB │ 385 KB │ 59 MB │
+└────────────────────────────────────────────────────────────┘
+```
#### Exponential backtracking
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 17.1 μs ± 1.3 μs | 95 KB | 135 B | 7.0 MB |
-| parser-regex S | 16.8 μs ± 1.4 μs | 99 KB | 137 B | 7.0 MB |
-| regex-applicative S | 20.5 μs ± 893 ns | 44 KB | 8 B | 7.0 MB |
-| regex-tdfa T | 401 ns ± 22 ns | 3.6 KB | 0 B | 9.0 MB |
-| regex-pcre-builtin BS | 160 ms ± 6.6 ms | 0 B | 0 B | 6.0 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│ │ Time │ Alloc │ Copied │ Peak │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T │ 18.5 μs │ 96 KB │ 88 B │ 6.0 MB │
+│ parser-regex S │ 18.0 μs │ 100 KB │ 89 B │ 6.0 MB │
+│ regex-applicative S │ 22.9 μs │ 44 KB │ 89 B │ 7.0 MB │
+│ regex-tdfa T │ 2.37 μs │ 3.6 KB │ 88 B │ 8.0 MB │
+│ regex-pcre-builtin BS │ 161 ms │ 563 KB │ 7.6 KB │ 6.0 MB │
+│ pcre-heavy T │ 160 ms │ 562 KB │ 51 KB │ 6.0 MB │
+│ pcre2 T │ 257 ms │ 114 KB │ 49 KB │ 6.0 MB │
+└────────────────────────────────────────────────────────────┘
+```
diff --git a/bench/bench.cabal b/bench/bench.cabal
index e9e96a5..e9e792f 100644
--- a/bench/bench.cabal
+++ b/bench/bench.cabal
@@ -26,6 +26,8 @@ benchmark bench
, text
, parser-regex
+ , pcre-heavy == 1.0.0.3
+ , pcre2 == 2.2.1
, regex == 1.1.0.2
, regex-applicative == 0.3.4
, regex-base