Add scanning #1429

Draft · wants to merge 8 commits into master
Changes from all commits
42 changes: 42 additions & 0 deletions examples/advanced/scan_wikitext.py
@@ -0,0 +1,42 @@
"""
Showcases how to use `Lark.scan` to select a pattern from a larger text without having to parse all of it.

Uses `requests` to fetch the current wikitext from `Python (Programming Language)` and uses a simple grammar
to extract all wikitext templates used in the page.

"""

from collections import Counter
from pprint import pprint

import lark
import requests

page_name = "Python_(programming_language)"
url = f"https://en.wikipedia.org/wiki/{page_name}?action=raw"
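# action=raw makes MediaWiki return the page's raw wikitext source instead of rendered HTML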

wikitext = requests.get(url).text

grammar = r"""
template: "{{" TEXT ("|" argument)* "}}"
text: (TEXT|template)+
argument: /\w+(?==)/ "=" text -> named_argument
| text -> numbered_argument

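// TEXT is any run of characters containing no "|" and no "{{" or "}}" sequence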
TEXT: / (?:[^{}|]
| \{(?!\{)
| \}(?!\})
)+/x
"""
parser = lark.Lark(grammar, parser='lalr', start='template')
used_templates = Counter()
inner_templates = 0
for (start, end), res in parser.scan(wikitext):
for temp in res.find_data('template'):
if temp != res:
inner_templates += 1
used_templates[temp.children[0].value] += 1

pprint(used_templates)
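# Counter.total() requires Python 3.10 or newer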
print("Total templates used:", used_templates.total())
print("Number of templates nested inside others:", inner_templates)
3 changes: 2 additions & 1 deletion lark/__init__.py
@@ -9,7 +9,7 @@
UnexpectedToken,
)
from .lark import Lark
from .lexer import Token
from .lexer import Token, TextSlice
from .tree import ParseTree, Tree
from .utils import logger
from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args
@@ -27,6 +27,7 @@
"UnexpectedToken",
"Lark",
"Token",
"TextSlice",
"ParseTree",
"Tree",
"logger",
49 changes: 42 additions & 7 deletions lark/lark.py
@@ -16,15 +16,15 @@
from typing import Literal
else:
from typing_extensions import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource, sha256_digest
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token, TextSlice
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
from .grammar import Rule
@@ -600,8 +600,7 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat
def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
def lex(self, text: Union[str, 'TextSlice'], dont_ignore: bool = False) -> Iterator[Token]:
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'

When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
@@ -613,6 +612,7 @@ def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
lexer = self._build_lexer(dont_ignore)
else:
lexer = self.lexer
text = TextSlice.from_text(text)
lexer_thread = LexerThread.from_text(lexer, text)
stream = lexer_thread.lex(None)
if self.options.postlex:
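A minimal usage sketch of the new slice-aware `lex` (this assumes the `TextSlice(text, start, end)` constructor seen in this diff, and that token positions remain absolute indices into the full string):

from lark import Lark, TextSlice

lexer_only = Lark(r"""
    start: WORD+
    WORD: /[A-Za-z]+/
    %ignore /\s+/
""", parser='lalr', lexer='basic')

text = "HEADER: alpha beta gamma"
# Lex only the part after "HEADER:"; 8..len(text) are absolute indices into `text`.
for tok in lexer_only.lex(TextSlice(text, 8, len(text))):
    print(tok.type, tok.value, tok.start_pos)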
@@ -623,28 +623,39 @@ def get_terminal(self, name: str) -> TerminalDef:
"""Get information about a terminal"""
return self._terminals_dict[name]

def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = None,
*, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'InteractiveParser':
"""Start an interactive parsing session.

Parameters:
text (str, optional): Text to be parsed. Required for ``resume_parse()``.
start (str, optional): Start symbol
start_pos (int, optional): Position at which the parser starts. Defaults to 0.
end_pos (int, optional): Position at which the parser stops. Defaults to len(text).

Returns:
A new InteractiveParser instance.

See Also: ``Lark.parse()``
"""
return self.parser.parse_interactive(text, start=start)
return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos)

def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
def parse(self, text: Union[str, 'TextSlice'], start: Optional[str] = None,
on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None) -> 'ParseTree':
"""Parse the given text, according to the options provided.

Parameters:
text (str or TextSlice): Text to be parsed, or a ``TextSlice`` selecting a sub-range of a larger string.
start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
When ``text`` is a ``TextSlice``, parsing is restricted to its start/end range.
This doesn't work with lexer='dynamic'/'dynamic_complete'.
The slice bounds mirror the behavior of the corresponding parameters in the standard-library re module,
which most notably means that lookbehinds in a regex may look behind the start position, but lookaheads
won't look past the end position. See [re.search](https://docs.python.org/3/library/re.html#re.Pattern.search)
for more information.

Returns:
If a transformer is supplied to ``__init__``, returns whatever is the
@@ -657,5 +668,29 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab
"""
return self.parser.parse(text, start=start, on_error=on_error)

def scan(self, text: Union[str, TextSlice], start: Optional[str] = None) -> Iterable['ScanMatch']:
"""
Scans the input text for non-overlapping matches of this grammar.

Only works with parser='lalr'. Works best if the first terminal(s)
that can be matched by the grammar are unique in the text and always indicate the start of a match.

A found match will never start or end with an ignored terminal.

Does not raise any exceptions except for invalid arguments/configurations.

Parameters:
    text (str or TextSlice): Text to be scanned.
    start (str, optional): Start symbol

Returns:
    An iterable of ``ScanMatch`` instances, each of which has two attributes: ``range``, a tuple
    of the start and end indices of the match, and ``tree``, the parsed Tree object.

See Also: ``Lark.parse()``
"""
return self.parser.scan(text, start=start)

###}
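An end-to-end sketch of the slice-based `parse`, plus the new `start_pos`/`end_pos` parameters of `parse_interactive` (illustrative grammar; `resume_parse` is lark's existing interactive-parser method, and `TextSlice` semantics are assumed to be as documented above):

from lark import Lark, TextSlice

list_parser = Lark(r"""
    start: "[" NUMBER ("," NUMBER)* "]"
    NUMBER: /\d+/
    %ignore /\s+/
""", parser='lalr')

doc = "garbage before [1, 2, 3] garbage after"
lo = doc.index("[")
hi = doc.index("]") + 1

# Parse only doc[lo:hi] without copying the substring out of `doc`.
tree = list_parser.parse(TextSlice(doc, lo, hi))
print(tree.pretty())

# The same range via parse_interactive's start_pos/end_pos (LALR only).
ip = list_parser.parse_interactive(doc, start_pos=lo, end_pos=hi)
print(ip.resume_parse().pretty())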