diff --git a/parser-regex.cabal b/parser-regex.cabal index f53c103..2352585 100644 --- a/parser-regex.cabal +++ b/parser-regex.cabal @@ -2,7 +2,6 @@ cabal-version: 2.4 name: parser-regex version: 0.2.0.0 synopsis: Regex based parsers -description: Regex based parsers. homepage: https://github.com/meooow25/parser-regex bug-reports: https://github.com/meooow25/parser-regex/issues license: BSD-3-Clause @@ -15,6 +14,18 @@ extra-doc-files: README.md CHANGELOG.md +description: + Regex based parsers. See + . + ["Regex.Text"] + To work with @Text@ from the @text@ library. + . + ["Regex.List"] + To work with @String@s or lists. + . + ["Regex.Base"] + To work with other sequences. + tested-with: GHC == 9.0.2 , GHC == 9.2.8 diff --git a/src/Regex/Base.hs b/src/Regex/Base.hs index 8378855..bf9cd7a 100644 --- a/src/Regex/Base.hs +++ b/src/Regex/Base.hs @@ -1,6 +1,9 @@ -- | This module exports base types and functions. You can use these to define --- functions to work on arbitrary sequence types. If you want to work with --- @Text@ or @String@, import and use "Regex.Text" or "Regex.List" instead. +-- functions to work on arbitrary sequence types. +-- +-- If you want to work with @Text@ or @String@, import and use "Regex.Text" or +-- "Regex.List" instead. +-- module Regex.Base ( -- * @RE@ and @Parser@ @@ -57,6 +60,20 @@ module Regex.Base , R.liftA2' , R.foldlMany' , R.foldlManyMin' + + -- * Additional information + + -- ** Recursive definitions + -- $recursive-definitions + + -- ** Laziness + -- $laziness + + -- ** Looping parsers + -- $looping-parsers + + -- ** Performance + -- $performance ) where import qualified Regex.Internal.Regex as R @@ -93,3 +110,113 @@ import qualified Regex.Internal.Parser as P -- /don't use these functions/. Simply use @fmap@, @liftA2@, @foldlMany@ or -- @foldlManyMin@ instead. -- + +-- $recursive-definitions +-- +-- It is not possible to define a @RE@ recursively. 
If it were permitted, it +-- would be capable of parsing more than +-- [regular languages](https://en.wikipedia.org/wiki/Regular_language). +-- Unfortunately, there is no good way\* to make it impossible to write such +-- a regex in the first place. So it must be avoided by the programmer. As an +-- example, avoid this: +-- +-- @ +-- re :: RE Int [Int] +-- re = liftA2 (:) (single 1) re \<|> [] \<$ single 0 -- diverges! +-- @ +-- +-- Instead, use appropriate combinators from this module: +-- +-- @ +-- re = many (single 1) <* single 0 +-- @ +-- +-- For the same reason, be cautious when using combinators from other +-- packages on @RE@s. Make sure that they do not attempt to construct a +-- recursive @RE@. +-- +-- If you find that your regex is impossible to write without recursion, +-- you are attempting to parse a non-regular language! You need a more powerful +-- parser than what this library has to offer. +-- +-- \*[Unlifted datatypes](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/primitives.html#unlifted-datatypes) +-- can be used for this but they are too inconvenient to work with. +-- + +-- $laziness +-- +-- Parsing is lazy in the result value, i.e. the @a@ in @RE c a@ or +-- @Parser c a@. In fact, for the algorithm used in this library, this laziness +-- is essential for good runtime complexity. However, there is little reason +-- to be lazy in other aspects, such as the elements of the sequence, @c@, or +-- the functions and regexes used in combinators. Functions are strict in such +-- arguments. +-- +-- @ +-- -- Lazy in the result +-- parseFoldr foldr (compile (pure ⊥)) [] = Just ⊥ +-- parseFoldr foldr (compile (fmap (\\_ -> ⊥) (single 1))) [1] = Just ⊥ +-- +-- -- Strict in places like +-- single ⊥ = ⊥ +-- fmap ⊥ r = ⊥ +-- liftA2 f r ⊥ = ⊥ +-- @ +-- + +-- $looping-parsers +-- +-- What should be the result of parsing an empty sequence with +-- @(many (pure ()))@? 
+-- +-- Since @many r@ parses @r@ as many times as possible, and @pure ()@ succeeds +-- without consuming input, the result should arguably be the infinite list +-- @repeat ()@. Similarly, parsing an empty sequence with +-- @(foldlMany f z (pure ()))@ should diverge. Note that this applies to not +-- just @pure x@, but any regex that can succeed without consuming input, such +-- as @many x@, @manyMin x@, etc. +-- +-- This library considers that such an outcome is not desirable in practice. It +-- would be surprising to get an infinite structure from a parser. So, in the +-- case that @many@ succeeds an infinite number of times, this library treats it +-- as succeeding /zero/ times. +-- +-- By this rule, @(many (pure ()))@ on an empty sequence parses as @[]@ and +-- @(foldlMany f z (pure ()))@ parses as @z@. +-- +-- This behavior makes it impossible to distinguish between zero parses and +-- infinite parses. To address this, an alternate combinator 'Regex.List.manyr' +-- is provided. This parses into a 'Regex.List.Many', a type that clearly +-- indicates if parsing succeeded without consuming input into an infinite list, +-- or if it succeeded a finite number of times. +-- + +-- $performance +-- +-- This section describes some performance characteristics of this library, +-- without requiring a dive into the source code. +-- +-- Parsing with a @RE@ is done in two distinct steps. +-- +-- 1. A @RE@ is compiled to a @Parser@, which is a +-- [nondeterministic finite automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) +-- (NFA), in \(O(m)\) time. \(m\) here is the size of the @RE@, which is the +-- number of nodes in its internal tree representation. The resulting @Parser@ +-- has \(O(m)\) size. +-- 2. The @Parser@ is run on a sequence in \(O(mn \log m)\) time, where \(n\) is +-- the length of the sequence. This assumes that each @(c -> Maybe a)@ function +-- used to parse individual elements takes \(O(1)\) time. 
+-- +-- /Performance tip/: Use @(\<$)@ over @(\<$>)@, and @(\<*)@\/@(*>)@ over +-- @liftA2@\/@(\<*>)@ when ignoring the result of a @RE@. Knowing the result is +-- ignored allows compiling to a faster parser. +-- +-- Memory usage for parsing is \(O(nm)\), but +-- +-- * If the result of a @RE@ is ignored using @(\<$)@, @(\<*)@, or @(*>)@, only +-- \(O(m)\) memory is required. +-- +-- This applies even as subcomponents. So, any subcomponent @RE@ of a larger +-- @RE@ that is only recognizing a section of the sequence is cheaper in terms of +-- memory. +-- diff --git a/src/Regex/Internal/Regex.hs b/src/Regex/Internal/Regex.hs index 8113ec8..2744c4a 100644 --- a/src/Regex/Internal/Regex.hs +++ b/src/Regex/Internal/Regex.hs @@ -85,7 +85,7 @@ import qualified Data.Foldable as F -- -- Note that, because of bias, it is /not true/ that @a \<|> b = b \<|> a@. -- --- /Performance note/: Prefer the smaller of equivalent regexes, i.e. prefer +-- /Performance tip/: Prefer the smaller of equivalent regexes, i.e. prefer -- @(a \<|> b) \<*> c@ over @(a \<*> c) \<|> (b \<*> c)@. -- data RE c a where diff --git a/src/Regex/List.hs b/src/Regex/List.hs index d917d0b..b560981 100644 --- a/src/Regex/List.hs +++ b/src/Regex/List.hs @@ -81,7 +81,18 @@ module Regex.List , L.replaceAll -- * Additional information - -- $info + + -- ** Recursive definitions + -- $recursive-definitions + + -- ** Laziness + -- $laziness + + -- ** Looping parsers + -- $looping-parsers + + -- ** Performance + -- $performance ) where import qualified Regex.Internal.Regex as R @@ -104,9 +115,7 @@ import qualified Regex.Internal.List as L -- * "Data.Traversable": @traverse@, @for@, @sequenceA@ -- --- $info --- --- == Recursive definitions +-- $recursive-definitions -- -- It is not possible to define a @RE@ recursively. If it were permitted, it -- would be capable of parsing more than @@ -134,16 +143,17 @@ import qualified Regex.Internal.List as L -- you are attempting to parse a non-regular language! 
You need a more powerful -- parser than what this library has to offer. -- --- \* [Unlifted datatypes](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/primitives.html#unlifted-datatypes) --- can serve this purpose but they are too inconvenient to work with. +-- \*[Unlifted datatypes](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/primitives.html#unlifted-datatypes) +-- can be used for this but they are too inconvenient to work with. -- --- == Laziness + +-- $laziness -- -- Parsing is lazy in the result value, i.e. the @a@ in @RE c a@ or -- @Parser c a@. In fact, for the algorithm used in this library, this laziness -- is essential for good runtime complexity. However, there is little reason --- to be lazy in other aspects, such as the values of the sequence, @c@, or the --- functions and regexes used in combinators. Functions are strict in such +-- to be lazy in other aspects, such as the elements of the sequence, @c@, or +-- the functions and regexes used in combinators. Functions are strict in such -- arguments. -- -- @ @@ -157,7 +167,8 @@ import qualified Regex.Internal.List as L -- liftA2 f r ⊥ = ⊥ -- @ -- --- == Looping parsers + +-- $looping-parsers -- -- What should be the result of @reParse (many (pure ())) ""@? -- @@ -168,7 +179,7 @@ import qualified Regex.Internal.List as L -- can succeed without consuming input, such as @many x@, @manyMin x@, etc. -- -- This library considers that such an outcome is not desirable in practice. It --- would be surprising to get an infinite structure from your parser. So, in the +-- would be surprising to get an infinite structure from a parser. So, in the -- case that @many@ succeeds an infinite number of times, this library treats it -- as succeeding /zero/ times. -- @@ -181,25 +192,28 @@ import qualified Regex.Internal.List as L -- indicates if parsing succeeded without consuming input into an infinite list, -- or if it succeeded a finite number of times. 
-- --- == Performance + +-- $performance -- --- This section may be useful for someone looking to understand the performance --- of this library without diving into the source code. +-- This section describes some performance characteristics of this library, +-- without requiring a dive into the source code. -- -- Parsing with a @RE@ is done in two distinct steps. -- --- 1. A @RE@ is compiled to a @Parser@ in \(O(m)\) time, where \(m\) is the size --- of the @RE@. This is a +-- 1. A @RE@ is compiled to a @Parser@, which is a -- [nondeterministic finite automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) --- (NFA). +-- (NFA), in \(O(m)\) time. \(m\) here is the size of the @RE@, which is the +-- number of nodes in its internal tree representation. The resulting @Parser@ +-- has \(O(m)\) size. -- 2. The @Parser@ is run on a list in \(O(mn \log m)\) time, where \(n\) is --- the length of the list. Assumes every @Char@ is parsed in \(O(1)\). +-- the length of the list. This assumes that each @(c -> Maybe a)@ function +-- used to parse individual elements takes \(O(1)\) time. -- --- /Performance note/: Use @(\<$)@ over @(\<$>)@, and @(\<*)@\/@(*>)@ over +-- /Performance tip/: Use @(\<$)@ over @(\<$>)@, and @(\<*)@\/@(*>)@ over -- @liftA2@\/@(\<*>)@ when ignoring the result of a @RE@. Knowing the result is -- ignored allows compiling to a faster parser. -- --- Memory usage for parsing is \(O(nm)\). +-- Memory usage for parsing is \(O(nm)\), but -- -- * If the result of a @RE@ is ignored using @(\<$)@, @(\<*)@, or @(*>)@, only -- \(O(m)\) memory is required. 
diff --git a/src/Regex/Text.hs b/src/Regex/Text.hs index 177f48d..ce83000 100644 --- a/src/Regex/Text.hs +++ b/src/Regex/Text.hs @@ -81,7 +81,18 @@ module Regex.Text , T.replaceAll -- * Additional information - -- $info + + -- ** Recursive definitions + -- $recursive-definitions + + -- ** Laziness + -- $laziness + + -- ** Looping parsers + -- $looping-parsers + + -- ** Performance + -- $performance ) where import qualified Regex.Internal.Regex as R @@ -104,9 +115,7 @@ import qualified Regex.Internal.Text as T -- * "Data.Traversable": @traverse@, @for@, @sequenceA@ -- --- $info --- --- == Recursive definitions +-- $recursive-definitions -- -- It is not possible to define a @RE@ recursively. If it were permitted, it -- would be capable of parsing more than @@ -134,16 +143,17 @@ import qualified Regex.Internal.Text as T -- you are attempting to parse a non-regular language! You need a more powerful -- parser than what this library has to offer. -- --- \* [Unlifted datatypes](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/primitives.html#unlifted-datatypes) --- can serve this purpose but they are too inconvenient to work with. +-- \*[Unlifted datatypes](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/primitives.html#unlifted-datatypes) +-- can be used for this but they are too inconvenient to work with. -- --- == Laziness + +-- $laziness -- -- Parsing is lazy in the result value, i.e. the @a@ in @RE c a@ or -- @Parser c a@. In fact, for the algorithm used in this library, this laziness -- is essential for good runtime complexity. However, there is little reason --- to be lazy in other aspects, such as the values of the sequence, @c@, or the --- functions and regexes used in combinators. Functions are strict in such +-- to be lazy in other aspects, such as the elements of the sequence, @c@, or +-- the functions and regexes used in combinators. Functions are strict in such -- arguments. 
-- -- @ @@ -157,7 +167,8 @@ import qualified Regex.Internal.Text as T -- liftA2 f r ⊥ = ⊥ -- @ -- --- == Looping parsers + +-- $looping-parsers -- -- What should be the result of @reParse (many (pure ())) ""@? -- @@ -168,7 +179,7 @@ import qualified Regex.Internal.Text as T -- can succeed without consuming input, such as @many x@, @manyMin x@, etc. -- -- This library considers that such an outcome is not desirable in practice. It --- would be surprising to get an infinite structure from your parser. So, in the +-- would be surprising to get an infinite structure from a parser. So, in the -- case that @many@ succeeds an infinite number of times, this library treats it -- as succeeding /zero/ times. -- @@ -181,25 +192,28 @@ import qualified Regex.Internal.Text as T -- indicates if parsing succeeded without consuming input into an infinite list, -- or if it succeeded a finite number of times. -- --- == Performance + +-- $performance -- --- This section may be useful for someone looking to understand the performance --- of this library without diving into the source code. +-- This section describes some performance characteristics of this library, +-- without requiring a dive into the source code. -- -- Parsing with a @RE@ is done in two distinct steps. -- --- 1. A @RE@ is compiled to a @Parser@ in \(O(m)\) time, where \(m\) is the size --- of the @RE@. This is a +-- 1. A @RE@ is compiled to a @Parser@, which is a -- [nondeterministic finite automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) --- (NFA). +-- (NFA), in \(O(m)\) time. \(m\) here is the size of the @RE@, which is the +-- number of nodes in its internal tree representation. The resulting @Parser@ +-- has \(O(m)\) size. -- 2. The @Parser@ is run on a @Text@ in \(O(mn \log m)\) time, where \(n\) is --- the length of the @Text@. Assumes every @Char@ is parsed in \(O(1)\). +-- the length of the @Text@. 
This assumes that each @(TextToken -> Maybe a)@ +-- function used to parse individual elements takes \(O(1)\) time. -- --- /Performance note/: Use @(\<$)@ over @(\<$>)@, and @(\<*)@\/@(*>)@ over +-- /Performance tip/: Use @(\<$)@ over @(\<$>)@, and @(\<*)@\/@(*>)@ over -- @liftA2@\/@(\<*>)@ when ignoring the result of a @RE@. Knowing the result is -- ignored allows compiling to a faster parser. -- --- Memory usage for parsing is \(O(nm)\). +-- Memory usage for parsing is \(O(nm)\), but -- -- * If the result of a @RE@ is ignored using @(\<$)@, @(\<*)@, or @(*>)@, only -- \(O(m)\) memory is required.