From cf4e25b339aed88f39f2d7cf8eca39a5d871b01d Mon Sep 17 00:00:00 2001
From: meooow25 <soumiksarkar.3120@gmail.com>
Date: Sun, 24 Nov 2024 09:15:43 +0530
Subject: [PATCH] Compare with pcre-heavy and pcre2

---
 bench/Compare.hs  | 229 ++++++++++++++++++++++++++++++++++++++++------
 bench/README.md   | 164 +++++++++++++++++++++------------
 bench/bench.cabal |   2 +
 3 files changed, 304 insertions(+), 91 deletions(-)

diff --git a/bench/Compare.hs b/bench/Compare.hs
index ba1b1a4..afea987 100644
--- a/bench/Compare.hs
+++ b/bench/Compare.hs
@@ -3,13 +3,14 @@
 {-# LANGUAGE OverloadedStrings #-}
 module Compare (benches) where
 
-import Control.Applicative
-import Control.DeepSeq
-import Control.Monad
-import Data.Char
-import Data.Array
+import Control.Applicative (Alternative(..), optional)
+import Control.DeepSeq (NFData(..))
+import Control.Monad (replicateM_)
+import Data.Char (digitToInt, chr)
+import Data.Array ((!))
 import qualified Data.Foldable as F
-import Data.Maybe
+import qualified Data.List.NonEmpty as NE
+import Data.Maybe (fromJust)
 import Data.ByteString (ByteString)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as BC
@@ -17,9 +18,10 @@ import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as TEnc
 import GHC.Generics (Generic)
+import System.Mem (performMinorGC)
 
 import Test.Tasty (testGroup)
-import Test.Tasty.Bench
+import Test.Tasty.Bench hiding (nf)
 import Test.Tasty.HUnit (testCase, (@?=))
 
 -- parser-regex
@@ -45,6 +47,12 @@ import qualified Text.RE.TDFA.Text as TDFAReplace
 -- regex-with-pcre
 import qualified Text.RE.PCRE.ByteString as PCREReplace
 
+-- pcre-heavy
+import qualified Text.Regex.PCRE.Heavy as Heavy
+
+-- pcre2
+import qualified Text.Regex.Pcre2 as Pcre2
+
 benches :: Benchmark
 benches = bgroup "compare"
   [ env englishText $ \ ~(t,b,s) ->
@@ -54,12 +62,16 @@ benches = bgroup "compare"
     , bench "regex-applicative S" $ nf english1RA s
     , bench "regex-tdfa T" $ nf english1TDFA t
     , bench "regex-pcre-builtin BS" $ nf english1PCRE b
+    , bench "pcre-heavy T" $ nf english1PCREHeavy t
+    , bench "pcre2 T" $ nf english1Pcre2 t
     , testGroup "tests"
       [ testCase "check count" $ length (english1PR t) @?= 900
       , testCase "S == L" $ map T.pack (english1PRS s) @?= english1PR t
       , testCase "regex-applicative ==" $ map T.pack (english1RA s) @?= english1PR t
       , testCase "regex-tdfa ==" $ english1TDFA t @?= english1PR t
       , testCase "regex-pcre-builtin ==" $ map TEnc.decodeUtf8 (english1PCRE b) @?= english1PR t
+      , testCase "pcre-heavy ==" $ english1PCREHeavy t @?= english1PR t
+      , testCase "pcre2 ==" $ english1Pcre2 t @?= english1PR t
       ]
     ]
   , env englishText $ \ ~(t,b,s) ->
@@ -69,13 +81,20 @@ benches = bgroup "compare"
     , bench "regex-applicative S" $ nf english2RA s
     , bench "regex-tdfa T" $ nf english2TDFA t
     , bench "regex-pcre-builtin BS" $ nf english2PCRE b
+    , bench "pcre-heavy T" $ nf english2PCREHeavy t
+    , bench "pcre2 T" $ nf english2Pcre2 t
     , testGroup "tests"
       [ testCase "check count" $ length (english2PR t) @?= 365
       , testCase "S == T" $ map T.pack (english2PRS s) @?= english2PR t
       , testCase "regex-applicative ==" $ map T.pack (english2RA s) @?= english2PR t
       , testCase "regex-tdfa ==" $ english2TDFA t @?= english2PR t
+
+      -- Cannot compare results for regex-pcre-builtin and pcre-heavy. Their
+      -- counts don't match because they match bytes and not Chars.
       , testCase "regex-pcre-builtin ==" $ length (english2PCRE b) @?= 354
-        -- pcre count doesn't match because it matches bytes and not Chars
+      , testCase "pcre-heavy ==" $ length (english2PCREHeavy t) @?= 354
+
+      , testCase "pcre2 ==" $ english2Pcre2 t @?= english2PR t
       ]
     ]
   , env englishText $ \ ~(t,b,s) ->
@@ -85,11 +104,15 @@ benches = bgroup "compare"
     , bench "regex-applicative S" $ nf englishReplaceRA s
     , bench "regex-tdfa T" $ nf englishReplaceTDFA t
     , bench "regex-pcre-builtin BS" $ nf englishReplacePCRE b
+    , bench "pcre-heavy T" $ nf englishReplacePCREHeavy t
+    , bench "pcre2 T" $ nf englishReplacePcre2 t
     , testGroup "tests"
       [ testCase "S == T" $ T.pack (englishReplacePRS s) @?= englishReplacePR t
       , testCase "regex-applicative ==" $ T.pack (englishReplaceRA s) @?= englishReplacePR t
       , testCase "regex-tdfa ==" $ englishReplaceTDFA t @?= englishReplacePR t
       , testCase "regex-pcre-builtin ==" $ englishReplacePCRE b @?= TEnc.encodeUtf8 (englishReplacePR t)
+      , testCase "pcre-heavy ==" $ englishReplacePCREHeavy t @?= englishReplacePR t
+      , testCase "pcre2 ==" $ englishReplacePcre2 t @?= englishReplacePR t
       ]
     ]
   , env caseFoldingTxt $ \ ~(t,b,s) ->
@@ -99,12 +122,16 @@ benches = bgroup "compare"
     , bench "regex-applicative S" $ nf caseFoldingRA s
     , bench "regex-tdfa T" $ nf caseFoldingTDFA t
     , bench "regex-pcre-builtin BS" $ nf caseFoldingPCRE b
+    , bench "pcre-heavy T" $ nf caseFoldingPCREHeavy t
+    , bench "pcre2 T" $ nf caseFoldingPcre2 t
     , testGroup "tests"
       [ testCase "check count" $ length (caseFoldingPR t) @?= 1563
       , testCase "S == T" $ caseFoldingPRS s @?= caseFoldingPR t
       , testCase "regex-applicative ==" $ caseFoldingRA s @?= caseFoldingPR t
       , testCase "regex-tdfa ==" $ caseFoldingTDFA t @?= caseFoldingPR t
       , testCase "regex-pcre-builtin ==" $ caseFoldingPCRE b @?= caseFoldingPR t
+      , testCase "pcre-heavy ==" $ caseFoldingPCREHeavy t @?= caseFoldingPR t
+      , testCase "pcre2 ==" $ caseFoldingPcre2 t @?= caseFoldingPR t
       ]
     ]
   , env htmlText $ \ ~(t,b,s) ->
@@ -114,30 +141,49 @@ benches = bgroup "compare"
     , bench "regex-applicative S" $ nf uriRA s
     , bench "regex-tdfa T" $ nf uriTDFA t
     , bench "regex-pcre-builtin BS" $ nf uriPCRE b
+    , bench "pcre-heavy T" $ nf uriPCREHeavy t
+    -- , bench "pcre2 T" $ nf uriPcre2 t
     , testGroup "tests"
       [ testCase "check count" $ length (uriPR t) @?= 4277
       , testCase "S == T" $ map uriS2T (uriPRS s) @?= uriPR t
       , testCase "regex-applicative ==" $ map uriS2T (uriRA s) @?= uriPR t
       , testCase "regex-tdfa ==" $ uriTDFA t @?= uriPR t
       , testCase "regex-pcre-builtin ==" $ uriPCRE b @?= map uriT2BS (uriPR t)
+
+        -- Only check length. Comparing results fails because we cannot
+        -- distinguish between optional no capture and empty capture using
+        -- pcre-heavy.
+      , testCase "pcre-heavy ==" $ length (uriPCREHeavy t) @?= 4277
+
+        -- Exception: pcre2: UTF-8 error: isolated byte with 0x80 bit set
+      -- , testCase "pcre2 ==" $ uriPcre2 t @?= uriPR t
       ]
     ]
   , bgroup "Exponential backtracking"
-    [ bench "parser-regex T" $ whnf expPR expText
-    , bench "parser-regex S" $ whnf expPRS expString
-    , bench "regex-applicative S" $ whnf expRA expString
-    , bench "regex-tdfa T" $ whnf expTDFA expText
-    , bench "regex-pcre-builtin BS" $ whnf expPCRE expBS
+    [ bench "parser-regex T" $ nf expPR expText
+    , bench "parser-regex S" $ nf expPRS expString
+    , bench "regex-applicative S" $ nf expRA expString
+    , bench "regex-tdfa T" $ nf expTDFA expText
+    , bench "regex-pcre-builtin BS" $ nf expPCRE expBS
+    , bench "pcre-heavy T" $ nf expPCREHeavy expText
+    , bench "pcre2 T" $ nf expPcre2 expText
     , testGroup "tests"
       [ testCase "parser-regex T True" $ expPR expText @?= True
       , testCase "parser-regex S True" $ expPRS expString @?= True
       , testCase "regex-applicative True" $ expRA expString @?= True
       , testCase "regex-tdfa True" $ expTDFA expText @?= True
       , testCase "regex-pcre-builtin True" $ expPCRE expBS @?= True
+      , testCase "pcre-heavy True" $ expPCREHeavy expText @?= True
+      , testCase "pcre2 True" $ expPcre2 expText @?= True
       ]
     ]
   ]
 
+-- Need to perform GC to get correct memory stats
+-- See https://github.com/Bodigrim/tasty-bench/issues/62
+nf :: NFData b => (a -> b) -> a -> Benchmarkable
+nf f = whnfAppIO $ \x -> case rnf (f x) of () -> performMinorGC
+
 -------------------
 -- English text 1
 -------------------
@@ -182,6 +228,17 @@ english1PCRE = map (fst . (! 0)) . RBase.matchAllText re
     re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
          ("Tom|Sawyer|Huckleberry|Finn" :: ByteString)
 
+-- pcre-heavy
+english1PCREHeavy :: Text -> [Text]
+english1PCREHeavy = map fst . Heavy.scan re
+  where
+    re = either error id $
+      Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") []
+
+-- pcre2
+english1Pcre2 :: Text -> [Text]
+english1Pcre2 = Pcre2.match "Tom|Sawyer|Huckleberry|Finn"
+
 -------------------
 -- English text 2
 -------------------
@@ -235,6 +292,17 @@ english2PCRE = map (fst . (! 0)) . RBase.matchAllText re
     re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt
          (TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”")
 
+-- pcre-heavy
+english2PCREHeavy :: Text -> [Text]
+english2PCREHeavy = map fst . Heavy.scan re
+  where
+    re = either error id $
+      Heavy.compileM (TEnc.encodeUtf8 "“[^?!.]{0,30}[?!.]”") []
+
+-- pcre2
+english2Pcre2 :: Text -> [Text]
+english2Pcre2 = Pcre2.match "“[^?!.]{0,30}[?!.]”"
+
 --------------------
 -- English replace
 --------------------
@@ -262,7 +330,7 @@ englishReplaceRA = RA.replace $
   <|> "Tom"         <$ RA.string "Huckleberry"
   <|> "Sawyer"      <$ RA.string "Finn"
 
--- regex
+-- regex-tdfa
 englishReplaceTDFA :: Text -> Text
 englishReplaceTDFA t =
   Replace.replaceAllCaptures Replace.TOP repl $ t TDFAReplace.*=~ re
@@ -275,6 +343,7 @@ englishReplaceTDFA t =
       "Finn"        -> Just "Sawyer"
       _             -> error "impossible"
 
+-- regex-pcre-builtin
 englishReplacePCRE :: ByteString -> ByteString
 englishReplacePCRE t =
   Replace.replaceAllCaptures Replace.TOP repl $ t PCREReplace.*=~ re
@@ -287,6 +356,26 @@ englishReplacePCRE t =
       "Finn"        -> Just "Sawyer"
       _             -> error "impossible"
 
+-- pcre-heavy
+englishReplacePCREHeavy :: Text -> Text
+englishReplacePCREHeavy = Heavy.gsub re repl
+  where
+    re = either error id $
+      Heavy.compileM (TEnc.encodeUtf8 "Tom|Sawyer|Huckleberry|Finn") []
+    repl :: Text -> Text
+    repl cap = case cap of
+      "Tom"         -> "Huckleberry"
+      "Sawyer"      -> "Finn"
+      "Huckleberry" -> "Tom"
+      "Finn"        -> "Sawyer"
+      _             -> error "impossible"
+
+-- pcre2
+englishReplacePcre2 :: Text -> Text
+englishReplacePcre2 = Pcre2.gsub pat "${*MARK}"
+  where
+    pat = "(*MARK:Huckleberry)Tom|(*MARK:Finn)Sawyer|(*MARK:Tom)Huckleberry|(*MARK:Sawyer)Finn"
+
 --------------------
 -- CaseFolding.txt
 --------------------
@@ -346,7 +435,8 @@ caseFoldingRA = fromJust . RA.match re
 
 -- regex-tdfa
 caseFoldingTDFA :: Text -> [CaseFold]
-caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re
+caseFoldingTDFA =
+  map (matchTextToCaseFold textHexToChar T.words) . RBase.matchAllText re
   where
     re :: TDFA.Regex
     re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt $
@@ -356,11 +446,11 @@ caseFoldingTDFA = map (toCaseFold toc T.words) . RBase.matchAllText re
          , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
          , "(([0-9A-F]*); T; ([0-9A-F]*))"
          ]
-    toc = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0
 
 -- regex-pcre-builtin
 caseFoldingPCRE :: ByteString -> [CaseFold]
-caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re
+caseFoldingPCRE =
+  map (matchTextToCaseFold bcHexToChar BC.words) . RBase.matchAllText re
   where
     re :: PCREBS.Regex
     re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt $
@@ -370,26 +460,68 @@ caseFoldingPCRE = map (toCaseFold toc BC.words) . RBase.matchAllText re
          , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
          , "(([0-9A-F]*); T; ([0-9A-F]*))"
          ]
-    toc = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+-- pcre-heavy
+caseFoldingPCREHeavy :: Text -> [CaseFold]
+caseFoldingPCREHeavy = map (listToCaseFold . extend13 . snd) . Heavy.scan re
+  where
+    re = either error id $
+      Heavy.compileM
+         (B.concat
+         [ "(([0-9A-F]*); C; ([0-9A-F]*))|"
+         , "(([0-9A-F]*); S; ([0-9A-F]*))|"
+         , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
+         , "(([0-9A-F]*); T; ([0-9A-F]*))"
+         ]) []
+
+    -- The list does not extend beyond the last capture :<
+    extend13 = take 13 . (++ repeat T.empty)
+
+-- pcre2
+caseFoldingPcre2 :: Text -> [CaseFold]
+caseFoldingPcre2 = map (listToCaseFold . NE.tail) . Pcre2.captures re
+  where
+    re = T.concat
+         [ "(([0-9A-F]*); C; ([0-9A-F]*))|"
+         , "(([0-9A-F]*); S; ([0-9A-F]*))|"
+         , "(([0-9A-F]*); F; ([0-9A-F]*( [0-9A-F]*)*))|"
+         , "(([0-9A-F]*); T; ([0-9A-F]*))"
+         ]
+
 
 -- Note: regex with only submatches is incapable of parsing the nested
 -- space separated codes in the F case.
 -- So the string is captured and parsed after the regex delivers its results.
 
-toCaseFold
+bcHexToChar :: ByteString -> Char
+bcHexToChar = chr . BC.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+textHexToChar :: Text -> Char
+textHexToChar = chr . T.foldl' (\acc x -> acc * 16 + digitToInt x) 0
+
+matchTextToCaseFold
   :: (t -> Char) -- hex to Char
   -> (t -> [t]) -- words
   -> RBase.MatchText t
   -> CaseFold
-toCaseFold toc ws m
-   | Just _ <- idxMay 1 = Common (toc (idx 2)) (toc (idx 3))
-   | Just _ <- idxMay 4 = Simple (toc (idx 5)) (toc (idx 6))
-   | Just _ <- idxMay 7 = Full (toc (idx 8)) (tocs (idx 9))
-   | otherwise          = Turkic (toc (idx 12)) (toc (idx 13))
+matchTextToCaseFold toc ws m
+   | Just _ <- idxMay 1  = Common (toc (idx 2)) (toc (idx 3))
+   | Just _ <- idxMay 4  = Simple (toc (idx 5)) (toc (idx 6))
+   | Just _ <- idxMay 7  = Full (toc (idx 8)) (map toc (ws (idx 9)))
+   | Just _ <- idxMay 11 = Turkic (toc (idx 12)) (toc (idx 13))
+   | otherwise           = error "impossible"
   where
     idx i = fst (m ! i)
     idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t
-    tocs = map toc . ws
+
+listToCaseFold :: [Text] -> CaseFold
+listToCaseFold [x1,x2,x3,x4,x5,x6,x7,x8,x9,_x10,x11,x12,x13]
+  | not (T.null x1)  = Common (textHexToChar x2) (textHexToChar x3)
+  | not (T.null x4)  = Simple (textHexToChar x5) (textHexToChar x6)
+  | not (T.null x7)  = Full (textHexToChar x8) (map textHexToChar (T.words x9))
+  | not (T.null x11) = Turkic (textHexToChar x12) (textHexToChar x13)
+  | otherwise        = error "impossible"
+listToCaseFold xs = error $ "listToCaseFold: expected 13 captures, got " ++ show (length xs)
 
 --------
 -- URI
@@ -459,7 +591,7 @@ uriRA = fromJust . RA.match re
 
 -- regex-tdfa
 uriTDFA :: Text -> [URI Text]
-uriTDFA = map toURI . RBase.matchAllText re
+uriTDFA = map matchTextToURI . RBase.matchAllText re
   where
     re :: TDFA.Regex
     re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
@@ -467,18 +599,43 @@ uriTDFA = map toURI . RBase.matchAllText re
 
 -- regex-pcre-builtin
 uriPCRE :: ByteString -> [URI ByteString]
-uriPCRE = map toURI . RBase.matchAllText re
+uriPCRE = map matchTextToURI . RBase.matchAllText re
   where
     re :: PCREBS.Regex
     re = RBase.makeRegexOpts PCREBS.compDotAll RBase.blankExecOpt
          ("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString)
 
-toURI :: RBase.MatchText t -> URI t
-toURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9)
+matchTextToURI :: RBase.MatchText t -> URI t
+matchTextToURI m = URI (idxMay 2) (idxMay 4) (idx 5) (idxMay 7) (idxMay 9)
   where
     idx i = fst (m ! i)
     idxMay i = let (t,(o,_)) = m ! i in if o == -1 then Nothing else Just t
 
+-- pcre-heavy
+uriPCREHeavy :: Text -> [URI Text]
+uriPCREHeavy = map (listToURI . extend9 . snd) . Heavy.scan re
+  where
+    re = either error id $
+      Heavy.compileM
+        ("href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\"" :: ByteString) []
+
+    -- The list does not extend beyond the last capture :<
+    extend9 = take 9 . (++ repeat T.empty)
+
+-- pcre2
+uriPcre2 :: Text -> [URI Text]
+uriPcre2 = map (listToURI . NE.tail) . Pcre2.captures re
+  where
+    re = "href=\"(([^:/?#\"]+):)?(//([^/?#\"]*))?([^?#\"]*)(\\?([^#\"]*))?(#([^\"]*))?\""
+
+listToURI :: [Text] -> URI Text
+listToURI [_x1,x2,_x3,x4,x5,_x6,x7,_x8,x9] =
+  URI (notEmptyT x2) (notEmptyT x4) x5 (notEmptyT x7) (notEmptyT x9)
+  where
+    notEmptyT "" = Nothing
+    notEmptyT t = Just t
+listToURI _ = error "impossible"
+
 -----------------------------
 -- Exponential backtracking
 -----------------------------
@@ -534,6 +691,19 @@ expPCRE = RBase.matchTest re
     re = RBase.makeRegexOpts RBase.blankCompOpt RBase.blankExecOpt
            (BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"])
 
+-- pcre-heavy
+expPCREHeavy :: Text -> Bool
+expPCREHeavy = (Heavy.=~ re)
+  where
+    re = either error id $
+      Heavy.compileM
+        (BC.pack $ concat $ ["^"] <> replicate expN "a?" <> replicate expN "a" <> ["$"]) []
+
+-- pcre2
+expPcre2 :: Text -> Bool
+expPcre2 = Pcre2.matches
+  ("^" <> T.replicate expN "a?" <> T.replicate expN "a" <> "$")
+
 ---------------
 -- File utils
 ---------------
@@ -570,4 +740,3 @@ replicateAppendMRA n0 re re1 = go n0
   where
     go 0 = re1
     go n = liftA2 (:) re (go (n-1))
-
diff --git a/bench/README.md b/bench/README.md
index f131440..67d4f65 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -6,92 +6,134 @@ A comparison of some Haskell regex libraries:
 * [`regex-applicative`](https://hackage.haskell.org/package/regex-applicative)
 * [`regex-tdfa`](https://hackage.haskell.org/package/regex-tdfa)
 * [`regex-pcre-builtin`](https://hackage.haskell.org/package/regex-pcre-builtin)
-
-| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` |
-| --- | --- | --- | --- | --- |
-| Regex construction | Combinators | Combinators | Pattern | Pattern |
-| Unicode aware | Yes | Yes | Yes | No<sup>[1]</sup> |
-| Parsing features | Yes | Yes | Submatch only | Submatch only |
-| Extensions | No | No | No | Yes (lookahead, backreferences, etc.) |
-| Text matching speed<br/>(`English text 1,2`) | Baseline | Slower | Faster | Very fast |
-| Text replace speed<br/>(`English replace all`) | Baseline | Slower | Slower<sup>[2]</sup> | Comparable<sup>[3]</sup> |
-| Parsing speed<br/>(`CaseFolding.txt`,`URI`) | Baseline | Slower | Comparable | Very fast |
-| Regex compilation complexity | $O(m)$ | Undocumented, $O(m^2)$ judging by source code | Undocumented | Undocumented |
-| Parsing complexity | $O(mn \log m)$ | Documented "roughly linear complexity", $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed<sup>[4]</sup> | Undocumented, $O(2^n)$ seen experimentally |
+* [`pcre-heavy`](https://hackage.haskell.org/package/pcre-heavy)
+* [`pcre2`](https://hackage.haskell.org/package/pcre2)
+
+| | `parser-regex` | `regex-applicative` | `regex-tdfa` | `regex-pcre-builtin` | `pcre-heavy` | `pcre2` |
+| --- | --- | --- | --- | --- | --- | --- |
+| Regex construction | Combinators | Combinators | Pattern | Pattern | Pattern | Pattern |
+| Unicode aware | Yes | Yes | Yes | No<sup>[1]</sup> | No | Yes |
+| Parsing features | Yes | Yes | Submatch only | Submatch only | Submatch only | Submatch only |
+| Extensions | No | No | No | Yes<sup>[2]</sup> | Yes<sup>[2]</sup> | Yes<sup>[2]</sup> |
+| Text matching speed<br/>(`English text 1,2`) | Baseline | Very slow | Fast | Very fast | Very fast | Slow |
+| Text replace speed<br/>(`English replace all`) | Baseline | Slow | Slow<sup>[3]</sup> | Comparable<sup>[4]</sup> | Very fast | Fast |
+| Parsing speed<br/>(`CaseFolding.txt`,`URI`) | Baseline | Slow | Comparable | Very fast | Very fast | ⚠ UTF-8 error |
+| Regex compilation complexity | $O(m)$ | $O(m^2)$ judging by source code | Unclear | Unclear | Unclear | Unclear |
+| Parsing complexity | $O(mn \log m)$ | $O(m^2 n \log m)$ judging by source code | $O(n)$ claimed<sup>[5]</sup> | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally | $O(2^n)$ seen experimentally |
 
 <sup>1</sup> [`regex-pcre-builtin#3`](https://github.com/audreyt/regex-pcre-builtin/issues/3)  
-<sup>2</sup> Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex)  
-<sup>3</sup> Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre)  
-<sup>4</sup> I do not know if this is accurate, since $O(n)$ is only possible by spending
-   $O(2^m)$ on compilation, which libraries usually consider too great a cost.
-   `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/)
-   library, which claims $O(m^2 n)$ time. This could be true of `regex-tdfa` also.
+<sup>2</sup> Supports lookahead, backreferences, etc.  
+<sup>3</sup> Replacement requires a separate library, [`regex`](https://hackage.haskell.org/package/regex)  
+<sup>4</sup> Replacement requires a separate library, [`regex-with-pcre`](https://hackage.haskell.org/package/regex-with-pcre)  
+<sup>5</sup> This is unlikely to be true, since $O(n)$ is only possible by
+   spending $O(2^m)$ on compilation, which libraries usually consider too great
+   a cost. `regex-tdfa` mentions that it is based on the [`tre`](https://github.com/laurikari/tre/)
+   library, which claims $O(m^2 n)$ time. This may be true of `regex-tdfa` also.
+
+Classifications (time relative to the baseline): <0.25x = Very fast,
+0.25x–0.5x = Fast, 0.5x–2x = Comparable, 2x–4x = Slow, >4x = Very slow
 
 ## Benchmarks
 
 Benchmarks of regex libraries on some simple use cases. See `Compare.hs` for
 details.  
-Performed using GHC 9.8.1.
+Performed using GHC 9.10.1.
 
 The suffixes indicate the sequence used for the benchmarks, `T` for `Text`,
 `S` for `String`, `BS` for `ByteString`.
 
 #### English text 1
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 55.6 ms ± 2.8 ms | 379 MB | 91 KB | 27 MB |
-| parser-regex S | 72.2 ms ± 2.9 ms | 366 MB | 90 KB | 27 MB |
-| regex-applicative S | 371 ms ± 27 ms | 1.6 GB | 286 MB | 158 MB |
-| regex-tdfa T | 38.6 ms ± 1.5 ms | 110 MB | 63 KB | 27 MB |
-| regex-pcre-builtin BS | 13.4 ms ± 698 μs | 406 KB | 8.4 KB | 27 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 59.3 ms │ 380 MB │  79 KB │  27 MB │
+│ parser-regex S        │ 83.2 ms │ 368 MB │  83 KB │  27 MB │
+│ regex-applicative S   │ 385  ms │ 1.6 GB │ 286 MB │ 157 MB │
+│ regex-tdfa T          │ 38.2 ms │ 111 MB │  11 KB │  27 MB │
+│ regex-pcre-builtin BS │ 13.4 ms │ 708 KB │ 598 B  │  27 MB │
+│ pcre-heavy T          │ 13.5 ms │ 1.3 MB │ 620 B  │  27 MB │
+│ pcre2 T               │ 228  ms │ 1.9 MB │ 2.5 KB │  27 MB │
+└────────────────────────────────────────────────────────────┘
+```
 
 #### English text 2
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 49.6 ms ± 2.8 ms | 315 MB | 102 KB | 27 MB |
-| parser-regex S | 49.6 ms ± 2.8 ms | 319 MB | 891 KB | 28 MB |
-| regex-applicative S | 379 ms ± 15 ms | 2.1 GB | 284 MB | 217 MB |
-| regex-tdfa T | 27.2 ms ± 1.4 ms | 112 MB | 33 KB | 28 MB |
-| regex-pcre-builtin BS | 388 μs ± 30 μs | 298 KB | 3.1 KB | 27 MB |
+
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 50.6 ms │ 315 MB │  96 KB │  27 MB │
+│ parser-regex S        │ 52.0 ms │ 320 MB │ 974 KB │  28 MB │
+│ regex-applicative S   │ 399  ms │ 2.1 GB │ 284 MB │ 239 MB │
+│ regex-tdfa T          │ 26.4 ms │ 113 MB │  12 KB │  28 MB │
+│ regex-pcre-builtin BS │ 378  μs │ 277 KB │ 104 B  │  27 MB │
+│ pcre-heavy T          │ 418  μs │ 788 KB │ 102 B  │  27 MB │
+│ pcre2 T               │ 96.4 ms │ 1.0 MB │ 2.4 KB │  27 MB │
+└────────────────────────────────────────────────────────────┘
+```
 
 #### English replace all
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 81.4 ms ± 2.9 ms | 357 MB | 65 MB | 73 MB |
-| parser-regex S | 121 ms ± 5.9 ms | 396 MB | 100 MB | 93 MB |
-| regex-applicative S | 368 ms ± 5.3 ms | 2.2 GB | 63 MB | 50 MB |
-| regex-tdfa T | 200 ms ± 17 ms | 696 MB | 29 MB | 923 MB |
-| regex-pcre-builtin BS | 122 ms ± 12 ms | 586 MB | 29 MB | 921 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 84.0 ms │ 358 MB │  65 MB │  73 MB │
+│ parser-regex S        │ 121  ms │ 396 MB │  93 MB │  90 MB │
+│ regex-applicative S   │ 386  ms │ 2.2 GB │  63 MB │  40 MB │
+│ regex-tdfa T          │ 187  ms │ 696 MB │  26 MB │ 758 MB │
+│ regex-pcre-builtin BS │ 125  ms │ 586 MB │  44 MB │ 490 MB │
+│ pcre-heavy T          │ 22.7 ms │ 378 MB │  13 KB │  28 MB │
+│ pcre2 T               │ 36.0 ms │ 1.6 MB │ 2.1 KB │  27 MB │
+└────────────────────────────────────────────────────────────┘
+```
 
 #### Parse CaseFolding.txt
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 60.1 ms ± 3.9 ms | 324 MB | 2.2 MB | 14 MB |
-| parser-regex S | 60.1 ms ± 3.6 ms | 321 MB | 1.7 MB | 11 MB |
-| regex-applicative S | 143 ms ± 4.2 ms | 921 MB | 48 MB | 65 MB |
-| regex-tdfa T | 39.7 ms ± 1.3 ms | 108 MB | 168 KB | 11 MB |
-| regex-pcre-builtin BS | 12.3 ms ± 683 μs | 4.0 MB | 402 KB | 11 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 58.7 ms │ 325 MB │ 1.8 MB │  11 MB │
+│ parser-regex S        │ 59.8 ms │ 322 MB │ 2.3 MB │  13 MB │
+│ regex-applicative S   │ 146  ms │ 922 MB │  48 MB │  67 MB │
+│ regex-tdfa T          │ 40.6 ms │ 108 MB │  83 KB │  11 MB │
+│ regex-pcre-builtin BS │ 12.1 ms │ 4.3 MB │  63 KB │  10 MB │
+│ pcre-heavy T          │ 12.4 ms │ 5.2 MB │ 1.2 KB │  10 MB │
+│ pcre2 T               │ 115  ms │ 9.7 MB │ 6.0 KB │  10 MB │
+└────────────────────────────────────────────────────────────┘
+```
 
 #### Parse URI
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 70.5 ms ± 3.8 ms | 454 MB | 3.3 MB | 33 MB |
-| parser-regex S | 80.9 ms ± 3.4 ms | 439 MB | 9.5 MB | 35 MB |
-| regex-applicative S | 426 ms ± 20 ms | 2.1 GB | 284 MB | 165 MB |
-| regex-tdfa T | 192 ms ± 3.4 ms | 246 MB | 579 KB | 32 MB |
-| regex-pcre-builtin BS | 7.13 ms ± 259 μs | 13 MB | 9.2 MB | 48 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 73.8 ms │ 455 MB │ 3.9 MB │  34 MB │
+│ parser-regex S        │ 86.6 ms │ 441 MB │  11 MB │  38 MB │
+│ regex-applicative S   │ 451  ms │ 2.1 GB │ 288 MB │ 162 MB │
+│ regex-tdfa T          │ 193  ms │ 246 MB │ 100 KB │  31 MB │
+│ regex-pcre-builtin BS │ 6.52 ms │  12 MB │ 7.3 MB │  46 MB │
+│ pcre-heavy T          │ 3.84 ms │  15 MB │ 385 KB │  59 MB │
+└────────────────────────────────────────────────────────────┘
+```
 
 #### Exponential backtracking
 
-| Library | Time | Alloc | Copied | Peak |
-| --- | --- | --- | --- | --- |
-| parser-regex T | 17.1 μs ± 1.3 μs | 95 KB | 135 B | 7.0 MB |
-| parser-regex S | 16.8 μs ± 1.4 μs | 99 KB | 137 B | 7.0 MB |
-| regex-applicative S | 20.5 μs ± 893 ns | 44 KB | 8 B | 7.0 MB |
-| regex-tdfa T | 401 ns ± 22 ns | 3.6 KB | 0 B | 9.0 MB |
-| regex-pcre-builtin BS | 160 ms ± 6.6 ms | 0 B | 0 B | 6.0 MB |
+```
+┌────────────────────────────────────────────────────────────┐
+│                       │  Time   │ Alloc  │ Copied │  Peak  │
+│───────────────────────│─────────│────────│────────│────────│
+│ parser-regex T        │ 18.5 μs │  96 KB │  88 B  │ 6.0 MB │
+│ parser-regex S        │ 18.0 μs │ 100 KB │  89 B  │ 6.0 MB │
+│ regex-applicative S   │ 22.9 μs │  44 KB │  89 B  │ 7.0 MB │
+│ regex-tdfa T          │ 2.37 μs │ 3.6 KB │  88 B  │ 8.0 MB │
+│ regex-pcre-builtin BS │ 161  ms │ 563 KB │ 7.6 KB │ 6.0 MB │
+│ pcre-heavy T          │ 160  ms │ 562 KB │  51 KB │ 6.0 MB │
+│ pcre2 T               │ 257  ms │ 114 KB │  49 KB │ 6.0 MB │
+└────────────────────────────────────────────────────────────┘
+```
diff --git a/bench/bench.cabal b/bench/bench.cabal
index e9e96a5..e9e792f 100644
--- a/bench/bench.cabal
+++ b/bench/bench.cabal
@@ -26,6 +26,8 @@ benchmark bench
       , text
 
       , parser-regex
+      , pcre-heavy == 1.0.0.3
+      , pcre2 == 2.2.1
       , regex == 1.1.0.2
       , regex-applicative == 0.3.4
       , regex-base