Skip to content

Commit

Permalink
Export some internals
Browse files Browse the repository at this point in the history
  • Loading branch information
meooow25 committed Nov 23, 2024
1 parent e5709ea commit 78ca69a
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 33 deletions.
12 changes: 6 additions & 6 deletions parser-regex.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -38,19 +38,19 @@ library
Regex.Base
Regex.List
Regex.Text

other-modules:
Regex.Internal.CharSet
Regex.Internal.CharSets
Regex.Internal.Debug
Regex.Internal.Generated.CaseFold
Regex.Internal.List
Regex.Internal.Num
Regex.Internal.Parser
Regex.Internal.Regex
Regex.Internal.Text
Regex.Internal.Unique

other-modules:
Regex.Internal.CharSets
Regex.Internal.Generated.CaseFold
Regex.Internal.List
Regex.Internal.Num

build-depends:
base >= 4.15 && < 5.0
, containers >= 0.6.4 && < 0.8
Expand Down
12 changes: 11 additions & 1 deletion src/Regex/Internal/CharSet.hs
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# OPTIONS_HADDOCK not-home #-}

-- | This is an internal module. You probably don't need to import this. Import
-- "Data.CharSet" instead.
--
-- = WARNING
--
-- Definitions in this module allow violating invariants that would otherwise be
-- guaranteed by non-internal modules. Use at your own risk!
--
module Regex.Internal.CharSet
( CharSet
( CharSet(..)
, empty
, singleton
, fromRange
Expand Down
12 changes: 10 additions & 2 deletions src/Regex/Internal/Debug.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | This module provides functions for visualizing @RE@s and @Parser@s.
-- [See here](https://github.com/meooow25/parser-regex/wiki/Visualizations)
-- for some examples.
--
module Regex.Internal.Debug
( reToDot
, parserToDot
Expand Down Expand Up @@ -29,7 +34,7 @@ import qualified Regex.Internal.CharSet as CS

-- | Generate a [Graphviz DOT](https://graphviz.org/doc/info/lang.html)
-- visualization of a 'RE'. Optionally takes an alphabet @[c]@, which will be
-- tested against the 'token' functions in the 'RE' and accepted characters
-- tested against the @token@ functions in the 'RE' and accepted characters
-- displayed.
reToDot :: forall c a. Maybe ([c], [c] -> String) -> RE c a -> String
reToDot ma re0 = execM $ do
Expand Down Expand Up @@ -69,7 +74,7 @@ reToDot ma re0 = execM $ do

-- | Generate a [Graphviz DOT](https://graphviz.org/doc/info/lang.html)
-- visualization of a 'Parser'. Optionally takes an alphabet @[c]@, which will
-- be tested against the 'token' functions in the 'Parser' and the accepted
-- be tested against the @token@ functions in the 'Parser' and the accepted
-- characters displayed.
parserToDot :: forall c a. Maybe ([c], [c] -> String) -> Parser c a -> String
parserToDot ma p0 = execM $ do
Expand Down Expand Up @@ -133,6 +138,9 @@ parserToDot ma p0 = execM $ do
-- Display Chars
------------------

-- |
-- >>> dispCharRanges "abc012def"
-- "[('0','2'),('a','f')]"
dispCharRanges :: [Char] -> String
dispCharRanges = show . CS.ranges . CS.fromList

Expand Down
2 changes: 1 addition & 1 deletion src/Regex/Internal/List.hs
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ toReplace re = liftA2 f manyListMin re <*> manyList
--
-- @
-- sep = 'oneOfChar' "-./"
-- digits n = 'replicateM' n (oneOfChar 'Data.CharSet.digit')
-- digits n = 'Control.Monad.replicateM' n (oneOfChar 'Data.CharSet.digit')
-- toYmd d m y = concat [y, \"-\", m, \"-\", d]
-- date = toYmd \<$> digits 2 \<* sep
-- \<*> digits 2 \<* sep
Expand Down
9 changes: 9 additions & 0 deletions src/Regex/Internal/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# OPTIONS_HADDOCK not-home #-}

-- | This is an internal module. You probably don't need to import this.
--
-- = WARNING
--
-- Definitions in this module allow violating invariants that would otherwise be
-- guaranteed by non-internal modules. Use at your own risk!
--
module Regex.Internal.Parser
( Parser(..)
, Node(..)
Expand Down
5 changes: 5 additions & 0 deletions src/Regex/Internal/Regex.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GADTs #-}
{-# OPTIONS_HADDOCK not-home #-}

-- | This is an internal module. You probably don't need to import this.
--
module Regex.Internal.Regex
( RE(..)
, Strictness(..)
Expand Down Expand Up @@ -180,6 +184,7 @@ anySingle = token Just
-- Many
---------

-- | A repeating value or a finite list.
data Many a
= Repeat a -- ^ A single value repeating indefinitely
| Finite [a] -- ^ A finite list
Expand Down
62 changes: 40 additions & 22 deletions src/Regex/Internal/Text.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK not-home #-}

-- | This is an internal module. You probably don't need to import this. Import
-- "Regex.Text" instead.
--
-- = WARNING
--
-- Definitions in this module allow violating invariants that would otherwise be
-- guaranteed by non-internal modules. Use at your own risk!
--
module Regex.Internal.Text
(
TextToken
TextToken(..)
, REText
, textTokenFoldr

, token
, satisfy
Expand Down Expand Up @@ -77,6 +88,10 @@ import qualified Regex.Internal.Generated.CaseFold as CF
-- This module uses RE TextToken for Text regexes instead of simply RE Char to
-- support Text slicing. It does mean that use cases not using slicing pay a
-- small cost, but it is not worth having two separate Text regex APIs.
--
-- Slicing is made possible by the unsafeAdjacentAppend function. Of course,
-- this means that REs using it MUST NOT be used with multiple Texts, such as
-- trying to parse chunks of a lazy Text.
data TextToken = TextToken
{ tArr :: {-# UNPACK #-} !TArray.Array
, tOffset :: {-# UNPACK #-} !Int
Expand Down Expand Up @@ -137,47 +152,47 @@ text t = t <$ T.foldr' ((*>) . char) (pure ()) t
-- as described by the Unicode standard.
textIgnoreCase :: Text -> REText Text
textIgnoreCase t =
T.foldr' (\c cs -> R.liftA2' adjacentAppend (ignoreCaseTokenMatch c) cs)
T.foldr' (\c cs -> R.liftA2' unsafeAdjacentAppend (ignoreCaseTokenMatch c) cs)
(pure T.empty)
t
-- See Note [Why simple case fold]

-- | Parse any @Text@. Biased towards matching more.
manyText :: REText Text
manyText = R.foldlMany' adjacentAppend T.empty anyTokenMatch
manyText = R.foldlMany' unsafeAdjacentAppend T.empty anyTokenMatch

-- | Parse any non-empty @Text@. Biased towards matching more.
someText :: REText Text
someText = R.liftA2' adjacentAppend anyTokenMatch manyText
someText = R.liftA2' unsafeAdjacentAppend anyTokenMatch manyText

-- | Parse any @Text@. Minimal, i.e. biased towards matching less.
manyTextMin :: REText Text
manyTextMin = R.foldlManyMin' adjacentAppend T.empty anyTokenMatch
manyTextMin = R.foldlManyMin' unsafeAdjacentAppend T.empty anyTokenMatch

-- | Parse any non-empty @Text@. Minimal, i.e. biased towards matching less.
someTextMin :: REText Text
someTextMin = R.liftA2' adjacentAppend anyTokenMatch manyTextMin
someTextMin = R.liftA2' unsafeAdjacentAppend anyTokenMatch manyTextMin

-- | Parse any @Text@ containing members of the @CharSet@.
-- Biased towards matching more.
manyTextOf :: CharSet -> REText Text
manyTextOf !cs = R.foldlMany' adjacentAppend T.empty (oneOfTokenMatch cs)
manyTextOf !cs = R.foldlMany' unsafeAdjacentAppend T.empty (oneOfTokenMatch cs)

-- | Parse any non-empty @Text@ containing members of the @CharSet@.
-- Biased towards matching more.
someTextOf :: CharSet -> REText Text
someTextOf !cs = R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOf cs)
someTextOf !cs = R.liftA2' unsafeAdjacentAppend (oneOfTokenMatch cs) (manyTextOf cs)

-- | Parse any @Text@ containing members of the @CharSet@.
-- Minimal, i.e. biased towards matching less.
manyTextOfMin :: CharSet -> REText Text
manyTextOfMin !cs = R.foldlManyMin' adjacentAppend T.empty (oneOfTokenMatch cs)
manyTextOfMin !cs = R.foldlManyMin' unsafeAdjacentAppend T.empty (oneOfTokenMatch cs)

-- | Parse any non-empty @Text@ containing members of the @CharSet@.
-- Minimal, i.e. biased towards matching less.
someTextOfMin :: CharSet -> REText Text
someTextOfMin !cs =
R.liftA2' adjacentAppend (oneOfTokenMatch cs) (manyTextOfMin cs)
R.liftA2' unsafeAdjacentAppend (oneOfTokenMatch cs) (manyTextOfMin cs)

-----------------
-- Numeric REs
Expand Down Expand Up @@ -314,11 +329,14 @@ toMatch = go
RFmap _ _ re1 -> go re1
RFmap_ _ re1 -> go re1
RPure _ -> RPure T.empty
RLiftA2 _ _ re1 re2 -> RLiftA2 Strict adjacentAppend (go re1) (go re2)
RLiftA2 _ _ re1 re2 ->
RLiftA2 Strict unsafeAdjacentAppend (go re1) (go re2)
REmpty -> REmpty
RAlt re1 re2 -> RAlt (go re1) (go re2)
RMany _ _ _ _ re1 -> RFold Strict Greedy adjacentAppend T.empty (go re1)
RFold _ gr _ _ re1 -> RFold Strict gr adjacentAppend T.empty (go re1)
RMany _ _ _ _ re1 ->
RFold Strict Greedy unsafeAdjacentAppend T.empty (go re1)
RFold _ gr _ _ re1 ->
RFold Strict gr unsafeAdjacentAppend T.empty (go re1)

data WithMatch a = WM {-# UNPACK #-} !Text a

Expand All @@ -330,10 +348,10 @@ fmapWM' f (WM t x) = WM t $! f x

instance Applicative WithMatch where
pure = WM T.empty
liftA2 f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) (f x y)
liftA2 f (WM t1 x) (WM t2 y) = WM (unsafeAdjacentAppend t1 t2) (f x y)

liftA2WM' :: (a1 -> a2 -> b) -> WithMatch a1 -> WithMatch a2 -> WithMatch b
liftA2WM' f (WM t1 x) (WM t2 y) = WM (adjacentAppend t1 t2) $! f x y
liftA2WM' f (WM t1 x) (WM t2 y) = WM (unsafeAdjacentAppend t1 t2) $! f x y

-- | Rebuild the @RE@ to include the matched @Text@ alongside the result.
withMatch :: REText a -> REText (Text, a)
Expand Down Expand Up @@ -368,13 +386,13 @@ withMatch = R.fmap' (\(WM t x) -> (t,x)) . go
-- Parse
----------

tokenFoldr :: (TextToken -> b -> b) -> b -> Text -> b
tokenFoldr f z (TInternal.Text a o0 l) = loop o0
textTokenFoldr :: (TextToken -> b -> b) -> b -> Text -> b
textTokenFoldr f z (TInternal.Text a o0 l) = loop o0
where
loop o | o - o0 >= l = z
loop o = case TUnsafe.iterArray a o of
TUnsafe.Iter c clen -> f (TextToken a o c) (loop (o + clen))
{-# INLINE tokenFoldr #-}
{-# INLINE textTokenFoldr #-}

-- | \(O(mn \log m)\). Parse a @Text@ with a @REText@.
--
Expand All @@ -393,7 +411,7 @@ reParse re = let !p = P.compile re in parse p

-- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@.
parse :: ParserText a -> Text -> Maybe a
parse = P.parseFoldr tokenFoldr
parse = P.parseFoldr textTokenFoldr

-- | \(O(mn \log m)\). Parse a @Text@ with a @ParserText@. Calls 'error' on
-- parse failure.
Expand Down Expand Up @@ -516,7 +534,7 @@ toReplace re = liftA2 f manyTextMin re <*> manyText
--
-- @
-- sep = 'oneOf' "-./"
-- digits n = 'toMatch' ('replicateM_' n (oneOf 'Data.CharSet.digit'))
-- digits n = 'toMatch' ('Control.Monad.replicateM_' n (oneOf 'Data.CharSet.digit'))
-- toYmd d m y = mconcat [y, \"-\", m, \"-\", d]
-- date = toYmd \<$> digits 2 \<* sep
-- \<*> digits 2 \<* sep
Expand All @@ -539,8 +557,8 @@ toReplaceMany re =

-- WARNING: If t1 and t2 are not empty, they must be adjacent slices of the
-- same Text. In other words, sameByteArray# a1 _a2 && o1 + l1 == _o2.
adjacentAppend :: Text -> Text -> Text
adjacentAppend t1@(TInternal.Text a1 o1 l1) t2@(TInternal.Text _a2 _o2 l2)
unsafeAdjacentAppend :: Text -> Text -> Text
unsafeAdjacentAppend t1@(TInternal.Text a1 o1 l1) t2@(TInternal.Text _a2 _o2 l2)
| T.null t1 = t2
| T.null t2 = t1
| otherwise = TInternal.Text a1 o1 (l1+l2)
Expand Down
13 changes: 12 additions & 1 deletion src/Regex/Internal/Unique.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
{-# OPTIONS_HADDOCK not-home #-}

-- | This is an internal module. You probably don't need to import this.
--
-- = WARNING
--
-- Definitions in this module allow violating invariants that would otherwise be
-- guaranteed by non-internal modules. Use at your own risk!
--
module Regex.Internal.Unique
( Unique(..)
, UniqueSet
Expand All @@ -12,7 +21,9 @@ import qualified Data.IntSet as IS
-- | A unique ID. Must be >= 0.
newtype Unique = Unique { unUnique :: Int }

-- | A set of 'Unique's. The bitmask is a set for IDs 0..63 (0..31 if 32-bit).
-- | A set of 'Unique's.

-- The bitmask is a set for IDs 0..63 on 64-bit and 0..31 on 32-bit.
-- Set operations on this are very fast and speed up the common case of small
-- regexes a little bit, at the cost of a little more memory.
data UniqueSet = UniqueSet {-# UNPACK #-} !Int !IS.IntSet
Expand Down

0 comments on commit 78ca69a

Please sign in to comment.