Export some internals

meooow25 · Nov 23, 2024 · 78ca69a · 78ca69a
1 parent e5709ea
commit 78ca69a
Show file tree

Hide file tree

Showing 8 changed files with 94 additions and 33 deletions.
diff --git a/parser-regex.cabal b/parser-regex.cabal
@@ -38,19 +38,19 @@ library
         Regex.Base
         Regex.List
         Regex.Text
-
-    other-modules:
         Regex.Internal.CharSet
-        Regex.Internal.CharSets
         Regex.Internal.Debug
-        Regex.Internal.Generated.CaseFold
-        Regex.Internal.List
-        Regex.Internal.Num
         Regex.Internal.Parser
         Regex.Internal.Regex
         Regex.Internal.Text
         Regex.Internal.Unique
 
+    other-modules:
+        Regex.Internal.CharSets
+        Regex.Internal.Generated.CaseFold
+        Regex.Internal.List
+        Regex.Internal.Num
+
     build-depends:
         base >= 4.15 && < 5.0
       , containers >= 0.6.4 && < 0.8

diff --git a/src/Regex/Internal/CharSet.hs b/src/Regex/Internal/CharSet.hs
@@ -1,7 +1,17 @@
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE MagicHash #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- | This is an internal module. You probably don't need to import this. Import
+-- "Data.CharSet" instead.
+--
+-- = WARNING
+--
+-- Definitions in this module allow violating invariants that would otherwise be
+-- guaranteed by non-internal modules. Use at your own risk!
+--
 module Regex.Internal.CharSet
-  ( CharSet
+  ( CharSet(..)
   , empty
   , singleton
   , fromRange

diff --git a/src/Regex/Internal/Debug.hs b/src/Regex/Internal/Debug.hs
@@ -1,6 +1,11 @@
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ScopedTypeVariables #-}
+
+-- | This module provides functions for visualizing @RE@s and @Parser@s.
+-- [See here](https://github.com/meooow25/parser-regex/wiki/Visualizations)
+-- for some examples.
+--
 module Regex.Internal.Debug
   ( reToDot
   , parserToDot
@@ -29,7 +34,7 @@ import qualified Regex.Internal.CharSet as CS
 
 -- | Generate a [Graphviz DOT](https://graphviz.org/doc/info/lang.html)
 -- visualization of a 'RE'. Optionally takes an alphabet @[c]@, which will be
--- tested against the 'token' functions in the 'RE' and accepted characters
+-- tested against the @token@ functions in the 'RE' and accepted characters
 -- displayed.
 reToDot :: forall c a. Maybe ([c], [c] -> String) -> RE c a -> String
 reToDot ma re0 = execM $ do
@@ -69,7 +74,7 @@ reToDot ma re0 = execM $ do
 
 -- | Generate a [Graphviz DOT](https://graphviz.org/doc/info/lang.html)
 -- visualization of a 'Parser'. Optionally takes an alphabet @[c]@, which will
--- be tested against the 'token' functions in the 'Parser' and the accepted
+-- be tested against the @token@ functions in the 'Parser' and the accepted
 -- characters displayed.
 parserToDot :: forall c a. Maybe ([c], [c] -> String) -> Parser c a -> String
 parserToDot ma p0 = execM $ do
@@ -133,6 +138,9 @@ parserToDot ma p0 = execM $ do
 -- Display Chars
 ------------------
 
+-- |
+-- >>> dispCharRanges "abc012def"
+-- "[('0','2'),('a','f')]"
 dispCharRanges :: [Char] -> String
 dispCharRanges = show . CS.ranges . CS.fromList
 

diff --git a/src/Regex/Internal/List.hs b/src/Regex/Internal/List.hs
@@ -428,7 +428,7 @@ toReplace re = liftA2 f manyListMin re <*> manyList
 --
 -- @
 -- sep = 'oneOfChar' "-./"
--- digits n = 'replicateM' n (oneOfChar 'Data.CharSet.digit')
+-- digits n = 'Control.Monad.replicateM' n (oneOfChar 'Data.CharSet.digit')
 -- toYmd d m y = concat [y, \"-\", m, \"-\", d]
 -- date = toYmd \<$> digits 2 \<* sep
 --              \<*> digits 2 \<* sep

diff --git a/src/Regex/Internal/Parser.hs b/src/Regex/Internal/Parser.hs
@@ -2,6 +2,15 @@
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE RankNTypes #-}
 {-# LANGUAGE ScopedTypeVariables #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- | This is an internal module. You probably don't need to import this.
+--
+-- = WARNING
+--
+-- Definitions in this module allow violating invariants that would otherwise be
+-- guaranteed by non-internal modules. Use at your own risk!
+--
 module Regex.Internal.Parser
   ( Parser(..)
   , Node(..)

diff --git a/src/Regex/Internal/Regex.hs b/src/Regex/Internal/Regex.hs
@@ -1,5 +1,9 @@
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE GADTs #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- | This is an internal module. You probably don't need to import this.
+--
 module Regex.Internal.Regex
   ( RE(..)
   , Strictness(..)
@@ -180,6 +184,7 @@ anySingle = token Just
 -- Many
 ---------
 
+-- | A repeating value or a finite list.
 data Many a
   = Repeat a   -- ^ A single value repeating indefinitely
   | Finite [a] -- ^ A finite list

diff --git a/src/Regex/Internal/Text.hs b/src/Regex/Internal/Text.hs
@@ -1,8 +1,19 @@
 {-# LANGUAGE BangPatterns #-}
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- | This is an internal module. You probably don't need to import this. Import
+-- "Regex.Text" instead.
+--
+-- = WARNING
+--
+-- Definitions in this module allow violating invariants that would otherwise be
+-- guaranteed by non-internal modules. Use at your own risk!
+--
 module Regex.Internal.Text
   (
-    TextToken
+    TextToken(..)
   , REText
+  , textTokenFoldr
 
   , token
   , satisfy
@@ -77,6 +88,10 @@ import qualified Regex.Internal.Generated.CaseFold as CF
 -- This module uses RE TextToken for Text regexes instead of simply RE Char to
 -- support Text slicing. It does mean that use cases not using slicing pay a
 -- small cost, but it is not worth having two separate Text regex APIs.
+--
+-- Slicing is made possible by the unsafeAdjacentAppend function. Of course,
+-- this means that REs using it MUST NOT be used with multiple Texts, such as
+-- trying to parse chunks of a lazy Text.
 data TextToken = TextToken
   { tArr     :: {-# UNPACK #-} !TArray.Array
   , tOffset  :: {-# UNPACK #-} !Int
@@ -137,47 +152,47 @@ text t = t <$ T.foldr' ((*>) . char) (pure ()) t
 -- as described by the Unicode standard.
 textIgnoreCase :: Text -> REText Text
 textIgnoreCase t =
-  T.foldr' (\c cs -> R.liftA2' adjacentAppend (ignoreCaseTokenMatch c) cs)
+  T.foldr' (\c cs -> R.liftA2' unsafeAdjacentAppend (ignoreCaseTokenMatch c) cs)
            (pure T.empty)
            t
 -- See Note [Why simple case fold]
 
 -- | Parse any @Text@. Biased towards matching more.
 manyText :: REText Text
-manyText = R.foldlMany' adjacentAppend T.empty anyTokenMatch
+manyText = R.foldlMany' unsafeAdjacentAppend T.empty anyTokenMatch
 
 -- | Parse any non-empty @Text@. Biased towards matching more.
 someText :: REText Text
-someText = R.liftA2' adjacentAppend anyTokenMatch manyText
+someText = R.liftA2' unsafeAdjacentAppend anyTokenMatch manyText
 
 -- | Parse any @Text@. Minimal, i.e. biased towards matching less.
 manyTextMin :: REText Text
-manyTextMin = R.foldlManyMin' adjacentAppend T.empty anyTokenMatch
+manyTextMin = R.foldlManyMin' unsafeAdjacentAppend T.empty anyTokenMatch
 
 -- | Parse any non-empty @Text@. Minimal, i.e. biased towards matching less.
 someTextMin :: REText Text
-someTextMin = R.liftA2' adjacentAppend anyTokenMatch manyTextMin
+someTextMin = R.liftA2' unsafeAdjacentAppend anyTokenMatch manyTextMin
 
 -- | Parse any @Text@ containing members of the @CharSet@.
 -- Biased towards matching more.
 manyTextOf :: CharSet -> REText Text
-manyTextOf !cs = R.foldlMany' adjacentAppend T.empty (oneOfTokenMatch cs)
+manyTextOf !cs = R.foldlMany' unsafeAdjacentAppend T.empty (oneOfTokenMatch cs)
 
 -- | Parse any non-empty @Text@ containing members of the @CharSet@.
 -- Biased towards matching more.
 someTextOf :: CharSet -> REText Text
-someTextOf !cs = R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOf cs)
+someTextOf !cs = R.liftA2' unsafeAdjacentAppend (oneOfTokenMatch cs) (manyTextOf cs)
 
 -- | Parse any @Text@ containing members of the @CharSet@.
 -- Minimal, i.e. biased towards matching less.
 manyTextOfMin :: CharSet -> REText Text
-manyTextOfMin !cs = R.foldlManyMin' adjacentAppend T.empty (oneOfTokenMatch cs)
+manyTextOfMin !cs = R.foldlManyMin' unsafeAdjacentAppend T.empty (oneOfTokenMatch cs)
 
 -- | Parse any non-empty @Text@ containing members of the @CharSet@.
 -- Minimal, i.e. biased towards matching less.
 someTextOfMin :: CharSet -> REText Text
 someTextOfMin !cs =
-  R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOfMin cs)
+  R.liftA2' unsafeAdjacentAppend (oneOfTokenMatch cs) (manyTextOfMin cs)
 
 -----------------
 -- Numeric REs
@@ -314,11 +329,14 @@ toMatch = go
       RFmap _ _ re1 -> go re1
       RFmap_ _ re1 -> go re1
       RPure _ -> RPure T.empty
-      RLiftA2 _ _ re1 re2 -> RLiftA2 Strict adjacentAppend (go re1) (go re2)
+      RLiftA2 _ _ re1 re2 ->
+        RLiftA2 Strict unsafeAdjacentAppend (go re1) (go re2)
       REmpty -> REmpty
       RAlt re1 re2 -> RAlt (go re1) (go re2)
-      RMany _ _ _ _ re1 -> RFold Strict Greedy adjacentAppend T.empty (go re1)
-      RFold _ gr _ _ re1 -> RFold Strict gr adjacentAppend T.empty (go re1)
+      RMany _ _ _ _ re1 ->
+        RFold Strict Greedy unsafeAdjacentAppend T.empty (go re1)
+      RFold _ gr _ _ re1 ->
+        RFold Strict gr unsafeAdjacentAppend T.empty (go re1)
 
 data WithMatch a = WM {-# UNPACK #-} !Text a
 
@@ -330,10 +348,10 @@ fmapWM' f (WM t x) = WM t $! f x
 
 instance Applicative WithMatch where
   pure = WM T.empty
-  liftA2 f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) (f x y)
+  liftA2 f (WM t1 x) (WM t2 y) = WM (unsafeAdjacentAppend t1 t2) (f x y)
 
 liftA2WM' :: (a1 -> a2 -> b) -> WithMatch a1 -> WithMatch a2 -> WithMatch b
-liftA2WM' f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) $! f x y
+liftA2WM' f (WM t1 x) (WM t2 y) = WM (unsafeAdjacentAppend t1 t2) $! f x y
 
 -- | Rebuild the @RE@ to include the matched @Text@ alongside the result.
 withMatch :: REText a -> REText (Text, a)
@@ -368,13 +386,13 @@ withMatch = R.fmap' (\(WM t x) -> (t,x)) . go
 -- Parse
 ----------
 
-tokenFoldr :: (TextToken -> b -> b) -> b -> Text -> b
-tokenFoldr f z (TInternal.Text a o0 l) = loop o0
+textTokenFoldr :: (TextToken -> b -> b) -> b -> Text -> b
+textTokenFoldr f z (TInternal.Text a o0 l) = loop o0
   where
     loop o | o - o0 >= l = z
     loop o = case TUnsafe.iterArray a o of
       TUnsafe.Iter c clen -> f (TextToken a o c) (loop (o + clen))
-{-# INLINE tokenFoldr #-}
+{-# INLINE textTokenFoldr #-}
 
 -- | \(O(mn \log m)\). Parse a @Text@ with a @REText@.
 --
@@ -393,7 +411,7 @@ reParse re = let !p = P.compile re in parse p
 
 -- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@.
 parse :: ParserText a -> Text -> Maybe a
-parse = P.parseFoldr tokenFoldr
+parse = P.parseFoldr textTokenFoldr
 
 -- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@. Calls 'error' on
 -- parse failure.
@@ -516,7 +534,7 @@ toReplace re = liftA2 f manyTextMin re <*> manyText
 --
 -- @
 -- sep = 'oneOf' "-./"
--- digits n = 'toMatch' ('replicateM_' n (oneOf 'Data.CharSet.digit'))
+-- digits n = 'toMatch' ('Control.Monad.replicateM_' n (oneOf 'Data.CharSet.digit'))
 -- toYmd d m y = mconcat [y, \"-\", m, \"-\", d]
 -- date = toYmd \<$> digits 2 \<* sep
 --              \<*> digits 2 \<* sep
@@ -539,8 +557,8 @@ toReplaceMany re =
 
 -- WARNING: If t1 and t2 are not empty, they must be adjacent slices of the
 -- same Text. In other words, sameByteArray# a1 _a2 && o1 + l1 == _o2.
-adjacentAppend :: Text -> Text -> Text
-adjacentAppend t1@(TInternal.Text a1 o1 l1) t2@(TInternal.Text _a2 _o2 l2)
+unsafeAdjacentAppend :: Text -> Text -> Text
+unsafeAdjacentAppend t1@(TInternal.Text a1 o1 l1) t2@(TInternal.Text _a2 _o2 l2)
   | T.null t1 = t2
   | T.null t2 = t1
   | otherwise = TInternal.Text a1 o1 (l1+l2)

diff --git a/src/Regex/Internal/Unique.hs b/src/Regex/Internal/Unique.hs
@@ -1,3 +1,12 @@
+{-# OPTIONS_HADDOCK not-home #-}
+
+-- | This is an internal module. You probably don't need to import this.
+--
+-- = WARNING
+--
+-- Definitions in this module allow violating invariants that would otherwise be
+-- guaranteed by non-internal modules. Use at your own risk!
+--
 module Regex.Internal.Unique
   ( Unique(..)
   , UniqueSet
@@ -12,7 +21,9 @@ import qualified Data.IntSet as IS
 -- | A unique ID. Must be >= 0.
 newtype Unique = Unique { unUnique :: Int }
 
--- | A set of 'Unique's. The bitmask is a set for IDs 0..63 (0..31 if 32-bit).
+-- | A set of 'Unique's.
+
+-- The bitmask is a set for IDs 0..63 on 64-bit and 0..31 on 32-bit.
 -- Set operations on this are very fast and speed up the common case of small
 -- regexes a little bit, at the cost of a little more memory.
 data UniqueSet = UniqueSet {-# UNPACK #-} !Int !IS.IntSet