Add scanning #1429

Draft · wants to merge 8 commits into master
Changes from all commits
42 changes: 42 additions & 0 deletions examples/advanced/scan_wikitext.py
@@ -0,0 +1,42 @@
"""
Showcases how to use `Lark.scan` to select a pattern from a larger text without having to parse all of it.

Uses `requests` to fetch the current wikitext from `Python (Programming Language)` and uses a simple grammar
to extract all wikitext templates used in the page.

"""

from collections import Counter
from pprint import pprint

import lark
import requests

page_name = "Python_(programming_language)"
url = f"https://en.wikipedia.org/wiki/{page_name}?action=raw"
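# action=raw makes MediaWiki return the page's raw wikitext source instead of rendered HTML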

wikitext = requests.get(url).text

grammar = r"""
template: "{{" TEXT ("|" argument)* "}}"
text: (TEXT|template)+
argument: /\w+(?==)/ "=" text -> named_argument
| text -> numbered_argument

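// TEXT is any run of characters containing no "|" and no "{{" or "}}" sequence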
TEXT: / (?:[^{}|]
| \{(?!\{)
| \}(?!\})
)+/x
"""
parser = lark.Lark(grammar, parser='lalr', start='template')
used_templates = Counter()
inner_templates = 0
for (start, end), res in parser.scan(wikitext):
for temp in res.find_data('template'):
if temp != res:
inner_templates += 1
used_templates[temp.children[0].value] += 1

pprint(used_templates)
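# Counter.total() requires Python 3.10 or newer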
print("Total templates used:", used_templates.total())
print("Number of templates nested inside others:", inner_templates)
3 changes: 2 additions & 1 deletion lark/__init__.py
@@ -9,7 +9,7 @@
UnexpectedToken,
)
from .lark import Lark
from .lexer import Token
from .lexer import Token, TextSlice
from .tree import ParseTree, Tree
from .utils import logger
from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
@@ -27,6 +27,7 @@
"UnexpectedToken",
"Lark",
"Token",
"TextSlice",
"ParseTree",
"Tree",
"logger",
49 changes: 42 additions & 7 deletions lark/lark.py
@@ -16,15 +16,15 @@
from typing import Literal
else:
from typing_extensions import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token, TextSlice
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
from .grammar import Rule
@@ -600,8 +600,7 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat
def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
def lex(self, text: Union[str, 'TextSlice'], dont_ignore: bool = False) -> Iterator[Token]:
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'

When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
@@ -613,6 +612,7 @@ def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
lexer = self._build_lexer(dont_ignore)
else:
lexer = self.lexer
text = TextSlice.from_text(text)
lexer_thread = LexerThread.from_text(lexer, text)
stream = lexer_thread.lex(None)
if self.options.postlex:
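A minimal usage sketch of the new slice-aware `lex` (this assumes the `TextSlice(text, start, end)` constructor seen in this diff, and that token positions remain absolute indices into the full string):

from lark import Lark, TextSlice

lexer_only = Lark(r"""
    start: WORD+
    WORD: /[A-Za-z]+/
    %ignore /\s+/
""", parser='lalr', lexer='basic')

text = "HEADER: alpha beta gamma"
# Lex only the part after "HEADER:"; 8..len(text) are absolute indices into `text`.
for tok in lexer_only.lex(TextSlice(text, 8, len(text))):
    print(tok.type, tok.value, tok.start_pos)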
@@ -623,28 +623,39 @@ def get_terminal(self, name: str) -> TerminalDef:
"""Get information about a terminal"""
return self._terminals_dict[name]

def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = None,
*, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'InteractiveParser':
"""Start an interactive parsing session.

Parameters:
text (str, optional): Text to be parsed. Required for ``resume_parse()``.
start (str, optional): Start symbol
start_pos (int, optional): Position at which the parser starts. Defaults to 0.
end_pos (int, optional): Position at which the parser stops. Defaults to len(text).

Returns:
A new InteractiveParser instance.

See Also: ``Lark.parse()``
"""
return self.parser.parse_interactive(text, start=start)
return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos)

def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
def parse(self, text: Union[str, 'TextSlice'], start: Optional[str] = None,
on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None) -> 'ParseTree':
"""Parse the given text, according to the options provided.

Parameters:
text (str or TextSlice): Text to be parsed, or a ``TextSlice`` selecting a sub-range of a larger string.
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
When ``text`` is a ``TextSlice``, parsing is restricted to its start/end range.
This doesn't work with lexer='dynamic'/'dynamic_complete'.
The slice bounds mirror the behavior of the corresponding parameters in the standard-library re module,
which most notably means that lookbehinds in a regex may look behind the start position, but lookaheads
won't look past the end position. See [re.search](https://docs.python.org/3/library/re.html#re.Pattern.search)
for more information.

Returns:
If a transformer is supplied to ``__init__``, returns whatever is the
@@ -657,5 +668,29 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab
"""
return self.parser.parse(text, start=start, on_error=on_error)

def scan(self, text: Union[str, TextSlice], start: Optional[str] = None) -> Iterable['ScanMatch']:
"""
Scans the input text for non-overlapping matches of this grammar.

Only works with parser='lalr'. Works best if the first terminal(s)
that can be matched by the grammar are unique in the text and always indicate the start of a match.

A found match will never start or end with an ignored terminal.

Does not raise any exceptions except for invalid arguments/configurations.

Parameters:
    text (str or TextSlice): Text to be scanned.
    start (str, optional): Start symbol

Returns:
    An iterable of ``ScanMatch`` instances, each of which has two attributes: ``range``, a tuple
    of the start and end indices of the match, and ``tree``, the parsed Tree object.

See Also: ``Lark.parse()``
"""
return self.parser.scan(text, start=start)

###}
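An end-to-end sketch of the slice-based `parse`, plus the new `start_pos`/`end_pos` parameters of `parse_interactive` (illustrative grammar; `resume_parse` is lark's existing interactive-parser method, and `TextSlice` semantics are assumed to be as documented above):

from lark import Lark, TextSlice

list_parser = Lark(r"""
    start: "[" NUMBER ("," NUMBER)* "]"
    NUMBER: /\d+/
    %ignore /\s+/
""", parser='lalr')

doc = "garbage before [1, 2, 3] garbage after"
lo = doc.index("[")
hi = doc.index("]") + 1

# Parse only doc[lo:hi] without copying the substring out of `doc`.
tree = list_parser.parse(TextSlice(doc, lo, hi))
print(tree.pretty())

# The same range via parse_interactive's start_pos/end_pos (LALR only).
ip = list_parser.parse_interactive(doc, start_pos=lo, end_pos=hi)
print(ip.resume_parse().pretty())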