From eb9c4c099bf26bdb3936af21ff49e3684b9e6638 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 17:56:06 +0900 Subject: [PATCH 1/7] Added support for grammar in ABNF format (RFC5234) --- lark/lark.py | 17 +- lark/load_grammar_abnf.py | 597 +++++++++++++++++++++++++++++++++++++ tests/__main__.py | 1 + tests/test_grammar_abnf.py | 209 +++++++++++++ 4 files changed, 822 insertions(+), 2 deletions(-) create mode 100644 lark/load_grammar_abnf.py create mode 100644 tests/test_grammar_abnf.py diff --git a/lark/lark.py b/lark/lark.py index f29d444d..d470f925 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,6 +16,7 @@ from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource +from .load_grammar_abnf import load_abnf_grammar, ABNFGrammar from .tree import Tree from .common import LexerConf, ParserConf @@ -98,6 +99,11 @@ class LarkOptions(Serialize): Prevent the tree builder from automagically removing "punctuation" tokens (default: False) tree_class Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``. + syntax + Syntax for grammar specification. + + - "lark" (default): Lark's EBNF based syntax + - "abnf" : ABNF syntax, described in RFC5234. Various extentions in Lark's EBNF syntax are not supported. **=== Algorithm Options ===** @@ -169,6 +175,7 @@ class LarkOptions(Serialize): 'use_bytes': False, 'import_paths': [], 'source_path': None, + 'syntax': 'lark', } def __init__(self, options_dict): @@ -326,11 +333,17 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # In practice the only relevant thing that might have been overriden should be `options` self.options = old_options + assert_config(self.options.syntax, ('lark', 'abnf')) # Parse the grammar file and compose the grammars - self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + if self.options.syntax == 'lark': + self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + elif self.options.syntax == 'abnf': + self.grammar, used_files = load_abnf_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + else: + assert False, self.options.syntax else: - assert isinstance(grammar, Grammar) + assert isinstance(grammar, (Grammar, ABNFGrammar)) self.grammar = grammar diff --git a/lark/load_grammar_abnf.py b/lark/load_grammar_abnf.py new file mode 100644 index 00000000..6fa109ce --- /dev/null +++ b/lark/load_grammar_abnf.py @@ -0,0 +1,597 @@ +"""Parses grammar written in ABNF (RFC5234 and 7405) and creates Grammar objects. 
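
A minimal usage sketch (the one-rule grammar below is made up for illustration;
this loader is selected through the ``syntax='abnf'`` option of ``Lark``):

    >>> from lark import Lark
    >>> parser = Lark('greeting = "hello" / "bye"', syntax='abnf', start='greeting')
    >>> parser.parse('hello')
    Tree('greeting', [])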
""" + +from .load_grammar import PrepareGrammar, PrepareAnonTerminals +from .load_grammar import EBNF_to_BNF, SimplifyRule_Visitor +from .load_grammar import _get_parser, symbols_from_strcase, nr_deepcopy_tree + +from .utils import logger +from .lexer import Token, TerminalDef, Pattern, PatternRE, PatternStr + +from .parse_tree_builder import ParseTreeBuilder +from .parser_frontends import ParsingFrontend +from .common import LexerConf, ParserConf +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY +from .tree import Tree, SlottedTree as ST +from .utils import classify, classify_bool +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken + +from .visitors import v_args, Transformer_InPlace, Transformer_NonRecursive, Visitor, Transformer +inline_args = v_args(inline=True) + + +# Terminals (ie. keys in TERMINALS ) shall consist of uppercase letters and underscores. +TERMINALS = { + '_LPAR': r'\(', + '_RPAR': r'\)', + '_LBRA': r'\[', + '_RBRA': r'\]', + '_STAR' : r'\*', + '_SLASH' : r'/', + + 'RULENAME': r'[a-zA-Z][a-zA-Z0-9\-]*', + 'EQ': r'=', + 'EQ_ALT': r'=/', + + '_IGNORE_CASE': r'%i', + '_CASE_SENSITIVE': r'%s', + + # quoted-string = DQUOTE *(%x20-21 / %x23-7E) DQUOTE + 'QSTRING': r'"[ !#$%&\'\(\)\*\+,\-\./0-9:;<=>\?@A-Z\[\\\]\^_a-z\{|\}~]*"', + + # prose-val = "<" *(%x20-3D / %x3F-7E) ">" + 'PROSE_VAL' : r'<[ !"#$%&\'\(\)\*\+,\-\./0-9:;<=\?@A-Z\[\\\]\^_a-z\{|\}~]*>', + + 'NUMBER' : r'[0-9]+', + + 'DEC_VAL': r'%d([0-9]+(\.[0-9]+)+|[0-9]+\-[0-9]+|[0-9]+)', + 'HEX_VAL': r'%x([0-9A-F]+(\.[0-9A-F]+)+|[0-9A-F]+\-[0-9A-F]+|[0-9A-F]+)', + 'BIN_VAL': r'%b([01]+(\.[01]+)+|[01]+\-[01]+|[01]+)', + + # c-wsp = WSP / (c-nl WSP) + 'C_WSP': r'[ \t]+|((;[^\n]*)*\r?\n)[ \t]+', + '_C_NL': r'((;[^\n]*)*\r?\n)(?![ \t])', + + # define terminal for unusable charaters to see nice error messages for common pitfalls + '_UNUSABLE_CHARS': r'[_@!#$&\+:]' +} +_TERMINALS_TO_IGNORE=['C_WSP'] + + +# Name of rules (ie. keys in RULES below) shall consist of lowercase letters and underscores. +RULES = { + 'start': ['_rulelist'], + + # rulelist = 1*( rule / (*c-wsp c-nl) ) + '_rulelist': ['_item', '_rulelist _item'], + '_item': ['rule', '_C_NL' ], + + # There are some assumptions in rule for 'rule' + # + # - Name of the rule definition shall be 'rule' + # - First element in the lefthand side of the rule shall be named as 'RULENAME' + # - '_c-nl' cannot be renamed to 'c-nl', + # otherwise self._unpack_definition() will fail to capture 'elements' + # + 'rule': ['RULENAME _defined_as elements _C_NL'], + + '_defined_as': [ 'EQ', 'EQ_ALT' ], + + # elements = alternation *c-wsp + # alternation = concatenation *(*c-wsp "/" *c-wsp concatenation) + # concatenation = repetition *(1*c-wsp repetition) + # repetition = [repeat] element + # + 'elements': [ 'alternation' ], + 'alternation': [ '_alternation'], + '_alternation': [ 'concatenation', '_alternation _SLASH concatenation'], + 'concatenation': [ '_concatenation'], + '_concatenation':[ 'repetition', '_concatenation repetition'], + + 'repetition': [ 'element', 'repeat element' ], + + # repeat = 1*DIGIT / (*DIGIT "*" *DIGIT) + 'repeat': [ 'repeat_min _STAR repeat_max', + 'repeat_min _STAR', + '_STAR repeat_max', + '_STAR', + 'repeat_n' ], + + 'repeat_n': [ 'NUMBER' ], + 'repeat_min': [ 'NUMBER' ], + 'repeat_max': [ 'NUMBER' ], + + 'element': [ 'RULENAME', '_group', 'option', 'char_val', 'num_val', 'prose_val'], + + # 'group' is inlined intentionally. + # + # grouping will produces nested 'alternation' rule tree. 
    # (e.g. '"a" | ("b")' in ABNF produces 'alternation("a", alternation("b"))' in AST terms.)
    #
    # Such nested and redundant rules will be flattened later
    # by SimplifyRule_Visitor()._flatten().
    '_group': [ '_LPAR alternation _RPAR' ],
    'option': [ '_LBRA alternation _RBRA' ],

    'char_val': [ 'case_insensitive_string', 'case_sensitive_string' ],
    'case_insensitive_string': [ '_IGNORE_CASE QSTRING', 'QSTRING' ],
    'case_sensitive_string': [ '_CASE_SENSITIVE QSTRING' ],

    'num_val': [ 'dec_val', 'bin_val', 'hex_val' ],

    'dec_val': [ 'DEC_VAL' ],
    'hex_val': [ 'HEX_VAL' ],
    'bin_val': [ 'BIN_VAL' ],

    'prose_val': [ 'PROSE_VAL' ],
}


class ABNF_to_BNF(EBNF_to_BNF):
    """Converts rule trees parsed from an ABNF grammar into BNF.
    We reuse super()._add_repeat_rule() etc. from EBNF_to_BNF via inheritance.
    """

    def _add_recurse_rule(self, type_, element, repeat_min):
        assert repeat_min >= 1

        new_name = self._name_rule(type_)
        t = NonTerminal(new_name)
        tree = ST('alternation', [
            ST('concatenation', [element] * repeat_min),
            ST('concatenation', [t, element])
        ])
        return self._add_rule(element, new_name, tree)

    def option(self, items):
        assert len(items) == 1

        # RFC5234 Section 3.8: Optional Sequence: [RULE]
        empty = ST('concatenation', [])
        alternation = items[0]
        return ST('alternation', [alternation, empty])

    def repetition(self, items):
        if len(items) == 1:
            # no repetition
            return items[0]

        repeat = items[0]
        element = items[1]

        rmin = [ x for x in repeat.find_data('repeat_min') ]
        rmax = [ x for x in repeat.find_data('repeat_max') ]
        rnum = [ x for x in repeat.find_data('repeat_n') ]

        rmin = int(rmin[0].children[0].value) if len(rmin) else 0
        rmax = int(rmax[0].children[0].value) if len(rmax) else None
        rnum = int(rnum[0].children[0].value) if len(rnum) else None

        if rnum is not None:
            # Specific Repetition '<n>Rule'
            if rnum == 0:
                empty = ST('concatenation', [])
                return ST('alternation', [empty])

            else:
                rmin = rmax = rnum
        else:
            # Variable Repetition '<a>*<b>Rule', where <a> and <b> are optional
            if rmax is None:
                if rmin == 0:
                    # '*Rule' (or '0*Rule')
                    new_name = self._add_recurse_rule('star', element, 1)
                    empty = ST('concatenation', [])
                    return ST('alternation', [new_name, empty])
                else:
                    # '<a>*Rule'
                    return self._add_recurse_rule('repeat_min', element, rmin)

            else:
                # '*<b>Rule' or '<a>*<b>Rule'
                pass

        if rmax < rmin or rmin < 0:
            raise GrammarError("Bad repetition (%d*%d isn't allowed)" % (rmin, rmax))

        return self._generate_repeats(element, rmin, rmax)

class RenameRule_Visitor(Visitor):
    """Rename ABNF rule names to EBNF ones so that SimplifyRule_Visitor() can be reused.
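
    For illustration, a rule tree like
        Tree('alternation', [Tree('concatenation', [...]), ...])
    is renamed in place to
        Tree('expansions', [Tree('expansion', [...]), ...])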
""" + def concatenation(self, tree): + tree.data = 'expansion' + + def alternation(self, tree): + tree.data = 'expansions' + + +class ABNFRuleTreeToText(Transformer): + + def expansion(self, symbols): + # renamed from 'concatenation' + return symbols + + def expansions(self, x): + # renamed from 'alternation' + return x + + def elements(self, x): + return x[0] + + def prose_val(self, x): + prose = x[0] + raise GrammarError("This ABNF cannot be used to generate parsers " + "since it has prose (informal) descriptions at line %s column %s" + % (prose.line, prose.column)) + + +@inline_args +class PrepareLiterals(Transformer_InPlace): + """ convert literals (char-val and num-val tokens in ABNF) into regexps """ + def char_val(self, char_val): + literal = char_val.children[0].value + text = literal[1:-1] # remove double quotes + if char_val.data == 'case_insensitive_string': + flags = ('i') + else: + flags = () + + return ST('pattern', [PatternStr(text, flags=flags, raw=literal)]) + + def _char_to_pattern(self, num_val_literal, base): + char = int(num_val_literal, base=base) + if char > 0xffffffff: + raise GrammarError("Terminal value characters larger than 0xffffffff is not supported.") + elif char > 0xffff: + regexp = r'\U{:08x}'.format(char) + elif char > 0xff: + regexp = r'\u{:04x}'.format(char) + else: + regexp = r'\x{:02x}'.format(char) + return regexp + + def _value_range_to_pattern(self, num_val, base=10): + literal = num_val.value[2:] + if literal.find('.') > 0: + # '.' concatenation of values + nums = ( self._char_to_pattern(num, base) for num in literal.split('.') ) + regexp = ''.join(nums) + + elif literal.find('-') > 0: + # '-' value range + start, end = ( self._char_to_pattern(num, base) for num in literal.split('-') ) + regexp = r'[%s-%s]' % (start, end) + else: + regexp = self._char_to_pattern(literal, base) + + # list is unpacked in self.num_val() + return [ ST('pattern', [PatternRE(regexp)]) ] + + def hex_val(self, literal): + return self._value_range_to_pattern(literal, base=16) + def dec_val(self, literal): + return self._value_range_to_pattern(literal, base=10) + def bin_val(self, literal): + return self._value_range_to_pattern(literal, base=2) + + def num_val(self, items): + return items[0] + + +class PruneTerminalTreeToPattern(Transformer_NonRecursive): + """ + simplify terminal-tree by converting it into single instance of PatternRE or PatternStr, + which is created by PrepareLiterals().transform(). 
+ """ + def pattern(self, ps): + p ,= ps + return p; + + def elements(self, items): + assert len(items) == 1 + return items[0] + + def alternation(self, items): + assert len(items) == 1 + return items[0] + + def concatenation(self, items): + assert len(items) == 1 + return items[0] + + def repetition(self, items): + assert len(items) == 1 + return items[0] + + def element(self, items): + assert len(items) == 1 + return items[0] + + def num_val(self, items): + assert len(items) == 1 and isinstance(items[0], Pattern) + return items[0] + +class PrepareRuleNames(Transformer_InPlace): + def __init__(self, rule_names): + self.rule_names = rule_names + + def element(self, v): + v ,= v + if isinstance(v, Tree): + return v + + assert isinstance(v, Token) + if v.type == 'RULENAME': + if v.value in self.rule_names: + return NonTerminal(str(v.value)) + + return Terminal(str(v.value)) + + assert False + + +class ABNFGrammar: + def __init__(self, rule_defs, term_defs, ignore): + self.term_defs = term_defs + self.rule_defs = rule_defs + self.ignore = ignore + + def compile(self, start, terminals_to_keep): + # We change the trees in-place (to support huge grammars) + # So deepcopy allows calling compile more than once. + term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] + rule_defs = [(n, nr_deepcopy_tree(t), o) for n, t, o in self.rule_defs] + + # =================== + # Compile Terminals + # =================== + + # This transformer applies PrepareLiterals first. + # It converts literals to regexps and place them in instances of PatternRE or PatternStr. + # + # Next, PruneTerminalTreeToPattern is applied to simplify terminal-tree to + # single instance of PatternRE or PatternStr. + + transformer = PrepareLiterals() * PruneTerminalTreeToPattern() + + terminal_list = [TerminalDef(name, transformer.transform(term_tree), priority) + for name, (term_tree, priority) in term_defs if term_tree] + + # ================= + # Compile Rules + # ================= + + # convert literals in rule_defs to Terminals, rule names to NonTerminals. + rule_names = [n for n, _t, _o in self.rule_defs] + transformer = PrepareLiterals() * PrepareRuleNames(rule_names) + + # convert anonymous terminals (i.e. literals in the right-hand-side of ABNF rules) + # to terminals and add them to terminal_list + + anon_tokens_transf = PrepareAnonTerminals(terminal_list) + transformer *= anon_tokens_transf + + # Convert ABNF to BNF. It will convert as follows: + # - repetitions (e.g. 1*DIGIT) -> recursive rules or repetition of symbols, + # - optional sequences (e.g. [ "word" ] ) -> alternation (e.g. 
' "word" | "" ' ) + + abnf_to_bnf = ABNF_to_BNF() + + rules = [] + for name, rule_tree, options in rule_defs: + rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None + abnf_to_bnf.rule_options = rule_options + abnf_to_bnf.prefix = name + tree = transformer.transform(rule_tree) + res = abnf_to_bnf.transform(tree) + rules.append((name, res, options)) + + # add recursive rules generated in abnf_to_bnf.transform() + rules += abnf_to_bnf.new_rules + + # Compile tree to Rule objects + + # rename ABNF rule names to EBNF ones to reuse SimplifyRule_Visitor() + # ('alternation' in ABNF -> 'expansions', 'concatenation' in ABNF -> 'expansion' ) + rename_rule = RenameRule_Visitor() + + # unpack some rule trees and simplify nested rule tree in expansion and expansions + simplify_rule = SimplifyRule_Visitor() + + # unpack Tree objects to list of symbols + rule_tree_to_text = ABNFRuleTreeToText() + + compiled_rules = [] + for rule_content in rules: + name, tree, options = rule_content + + rename_rule.visit(tree) + simplify_rule.visit(tree) + + expansions = rule_tree_to_text.transform(tree) + + for i, expansion in enumerate(expansions): + + alias = None + exp_options = options + rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) + compiled_rules.append(rule) + + # assertion will fail if there are duplicates of rules + assert len(set(compiled_rules)) == len(compiled_rules) + + # Filter out unused rules + while True: + c = len(compiled_rules) + used_rules = {s for r in compiled_rules + for s in r.expansion + if isinstance(s, NonTerminal) + and s != r.origin} + used_rules |= {NonTerminal(s) for s in start} + compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) + for r in unused: + logger.debug("Unused rule: %s", r) + if len(compiled_rules) == c: + break + + # Filter out unused terminals + if terminals_to_keep != '*': + used_terms = {t.name for r in compiled_rules + for t in r.expansion + if isinstance(t, Terminal)} + terminal_list, unused = classify_bool(terminal_list, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) + if unused: + logger.debug("Unused terminals: %s", [t.name for t in unused]) + + return terminal_list, compiled_rules, self.ignore + + + +def _find_used_symbols(tree): + assert tree.data == 'elements' + return {t for x in tree.find_data('element') + for t in x.scan_values(lambda t: t.type in ('RULENAME'))} + +def _get_abnf_parser(): + try: + return _get_abnf_parser.cache + except AttributeError: + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + + rules = [(rulename, exp, RuleOptions()) for rulename, exp in RULES.items()] + + rules = [Rule(NonTerminal(rulename), symbols_from_strcase(x.split()), i, None, o) + for rulename, elements, o in rules for i, x in enumerate(elements)] + + callback = ParseTreeBuilder(rules, ST).create_callback() + import re + lexer_conf = LexerConf(terminals, re, _TERMINALS_TO_IGNORE) + parser_conf = ParserConf(rules, callback, ['start']) + lexer_conf.lexer_type = 'basic' + parser_conf.parser_type = 'lalr' + _get_abnf_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) + return _get_abnf_parser.cache + +ABNF_GRAMMAR_ERRORS = [ + ('Unclosed parenthesis', ['a = ( \n']), + ('Unclosed bracket', ['a = [ \n']), + ('Incorrect type of value', ['a = 1\n']), + ('Unmatched closing parenthesis', ['a = )\n', 'a = ]\n', 'a = [)\n', 'a = (]\n']), + ('Expecting rule or terminal definition (missing "=" or 
"=/")', + ['a\n', 'a A\n', 'a /= A\n', 'a == A\n']), + ('Unexpected character, which is not usable in ABNF grammar', ['a@rule = "a rule"\n']), +] + +def _translate_parser_exception(parse, e): + error = e.match_examples(parse, ABNF_GRAMMAR_ERRORS, use_accepts=True) + if error: + return error + elif 'STRING' in e.expected: + return "Expecting a value" + +def _parse_abnf_grammar(text, name, start='start'): + try: + tree = _get_abnf_parser().parse(text + '\n', start) + except UnexpectedCharacters as e: + context = e.get_context(text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = _translate_parser_exception(_get_abnf_parser().parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise + + return PrepareGrammar().transform(tree) + +class ABNFGrammarBuilder: + def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): + self.global_keep_all_tokens = global_keep_all_tokens + self.import_paths = import_paths or [] + self.used_files = used_files or {} + + self._definitions = {} + self._ignore_names = [] + + def _is_terminal(self, tree): + if not isinstance(tree, Tree): + # it would be a token (RULENAME). it is non-terminal. + return False + + # It is a terminal if rule reduces to single instance of char-val or num-val. + if len(tree.children) > 1: + return False + elif len(tree.children) == 1: + if tree.data in ('char_val', 'num_val'): + return True + else: + return self._is_terminal(tree.children[0]) + + assert False, tree + + def _define(self, name, oper, exp): + if name in self._definitions: + if oper == '=/': + + assert isinstance(exp.children[0].children[0], Tree) + assert exp.children[0].children[0].data == 'concatenation' + + # unify incremental alternatives into existing alternatives + base_exp = self._definitions[name] + base_exp.children[0].children += exp.children[0].children + return + + raise GrammarError("Rule '%s' defined more than once" % name) + + if name.startswith('__'): + raise GrammarError("Names starting with double-underscore are reserved (Error at '%s'})" % name) + + self._definitions[name] = exp + + def _unpack_definition(self, tree): + assert tree.data == 'rule' + rulename = tree.children[0].value + oper = tree.children[1].value # '=' or '=/' + rule_elements = tree.children[-1] + + assert isinstance(rule_elements, Tree) and rule_elements.data == 'elements' + + return rulename, oper, rule_elements + + + def load_grammar(self, grammar_text, grammar_name=""): + tree = _parse_abnf_grammar(grammar_text, grammar_name) + + for stmt in tree.children: + if stmt.data == 'rule': + self._define(*self._unpack_definition(stmt)) + else: + assert False, stmt + + def validate(self): + for name, elements in self._definitions.items(): + for sym in _find_used_symbols(elements): + if sym not in self._definitions: + raise GrammarError("Rule '%s' used but not defined in %s" % (sym, name)) + + def build(self): + rule_defs = [] + term_defs = [] + prio = TOKEN_DEFAULT_PRIORITY + for name, exp in self._definitions.items(): + if self._is_terminal(exp): + options = prio + term_defs.append((name, (exp, options))) + else: + options = RuleOptions(keep_all_tokens=self.global_keep_all_tokens, + expand1=False, priority=prio, template_source=None) + rule_defs.append((name, exp, options)) + + return ABNFGrammar(rule_defs, term_defs, self._ignore_names) + + +def 
load_abnf_grammar(grammar, source, import_paths, global_keep_all_tokens): + builder = ABNFGrammarBuilder(global_keep_all_tokens, import_paths) + builder.load_grammar(grammar, source) + builder.validate() + return builder.build(), builder.used_files diff --git a/tests/__main__.py b/tests/__main__.py index b8d39712..6c576a4e 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -8,6 +8,7 @@ from .test_tools import TestStandalone from .test_cache import TestCache from .test_grammar import TestGrammar +from .test_grammar_abnf import TestABNFGrammar from .test_reconstructor import TestReconstructor from .test_tree_forest_transformer import TestTreeForestTransformer from .test_lexer import TestLexer diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py new file mode 100644 index 00000000..90d358b8 --- /dev/null +++ b/tests/test_grammar_abnf.py @@ -0,0 +1,209 @@ +from __future__ import absolute_import + +import os +from unittest import TestCase, main + +from lark import Lark, Token, Tree, ParseError, UnexpectedInput, UnexpectedCharacters +from lark.load_grammar import GrammarError +from lark.load_grammar import FromPackageLoader +from lark.load_grammar_abnf import ABNF_GRAMMAR_ERRORS + +class TestABNFGrammar(TestCase): + def setUp(self): + pass + + def test_charval_case_insensitive(self): + p = Lark('rulename = %i"aBc" / "xyz"\n', syntax='abnf', start='rulename') + abcs = ["abc", "Abc", "aBc", "abC", "ABc", "aBC", "AbC", "ABC"] + xyzs = ["xyz", "Xyz", "XYZ" ] + for i in abcs + xyzs: + self.assertEqual(p.parse(i), Tree('rulename', [])) + + def test_charval_case_sensitive(self): + p = Lark('rulename = %s"aBc" / %s"xyZ"\n', syntax='abnf', start='rulename') + for i in ('aBc', 'xyZ'): + self.assertEqual(p.parse(i), Tree('rulename', [])) + + for i in ('abc', 'xYy'): + self.assertRaises(UnexpectedCharacters, p.parse, i) + + def test_inline_numval(self): + # test for anonymous rules generated for inline num-val (%x22) + g = ('cat = %x40 "cat" %x40\n') + l = Lark(g, syntax='abnf', start='cat', keep_all_tokens=True) + self.assertEqual(l.parse('@cat@'), + Tree('cat', [Token('__ANON_0', '@'), Token('CAT', 'cat'), Token('__ANON_0', '@')])) + + def test_basic_abnf(self): + # test for alternatives, concatenation, and grouping + g1 =('beef = %s"bEEf" / beef2 / (BE EF) \n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + # the same rule in multiple lines with comments + g2 =(' ; rules \n' + 'beef = %s"bEEf" \n' + ' / beef2 ; word "beef" in lowercase \n' + ' / (BE EF) ; bytes sequence [0xbe,0xef] \n' + ';terminals \n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + # the same rule using incremental alternatives + g3 = ('beef = %s"bEEf"\n' + 'beef =/ beef2 \n' + 'beef =/ (BE EF)\n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + for g in (g1, g2, g3): + l = Lark(g, syntax='abnf', start='beef', keep_all_tokens=True) + self.assertEqual(l.parse(u'beef'), Tree('beef', [Token('beef2', 'beef')])) + self.assertEqual(l.parse(u'bEEf'), Tree('beef', [Token('BEEF', 'bEEf')])) + self.assertEqual(l.parse(u'\xbe\xef'), Tree('beef', [Token('BE', '\xbe'), Token('EF', '\xef')])) + + # undefined rule + g = g3 + 'unused-rule = BE EF beef3\n' + self.assertRaises(GrammarError, Lark, g, syntax='abnf', start='beef') + + def test_optional(self): + g = ('start = [ foo ] bar\n' + 'foo = "foo"\n' + 'bar = "bar"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('foobar'), + Tree('start', [Token('foo', 'foo'), Token('bar', 'bar')])) + 
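+        # ABNF_to_BNF.option() expands "[ foo ]" into "(foo / <empty>)",
+        # so the same grammar must also accept the input without 'foo':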
self.assertEqual(l.parse('bar'), + Tree('start', [Token('bar', 'bar')])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, 'foo') + + + def test_repetition(self): + g = ('start = rep-inf / rep-fixed \n' + 'rep-inf = *"X"\n' + 'rep-fixed = 3"F"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('XXX'), Tree('start', [Tree('rep-inf', [])])) + self.assertEqual(l.parse(''), Tree('start', [Tree('rep-inf', [])])) + self.assertEqual(l.parse('FFF'), Tree('start', [Tree('rep-fixed', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FF') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FFFF') + + def test_repetition_range(self): + g = ('start = rep-range / rep-atleast / rep-atmost\n' + 'rep-range = 2*4%s"R"\n' + 'rep-atleast = 3*"L"\n' + 'rep-atmost = *5"M"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + + self.assertEqual(l.parse('RRR'), Tree('start', [Tree('rep-range', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRRRR') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'R') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRr') # case sensitive + + self.assertEqual(l.parse('LlL'), Tree('start', [Tree('rep-atleast', [])])) # case insensitive + self.assertEqual(l.parse('LLLL'), Tree('start', [Tree('rep-atleast', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'LL') + + self.assertEqual(l.parse('mmm'), Tree('start', [Tree('rep-atmost', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'mmmmmm') + + def test_zero_repetition(self): + g1 = ('start = ("cat" / "dog" / empty) "food" \n' + 'empty = 0\n') + l = Lark(g1, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse("catfood"), Tree('start', [Token('CAT', 'cat'), Token('FOOD', 'food')])) + self.assertEqual(l.parse("dogfood"), Tree('start', [Token('DOG', 'dog'), Token('FOOD', 'food')])) + self.assertEqual(l.parse("food"), Tree('start', [Tree('empty', []), Token('FOOD', 'food')])) + self.assertRaises((UnexpectedInput), l.parse, u"petfood") + + def test_literal_range(self): + + g1 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n') + g2 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %d48-57 \n') + g3 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %b00110000-00111001 \n') + for g in (g1, g2, g3): + l = Lark(g, syntax='abnf') + for i in (0,1,2,3,4,5,6,7,8,9): + self.assertEqual(l.parse('lU%d' % i), + Tree('start', [Token('LALPHA', 'l'), Token('UALPHA', 'U'), + Token('DIGIT', '%d' % i)])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'lU0123456789:') + + + def test_literal_concatenation(self): + g1 = ('start = digits12345\n' + 'digits12345 = %x31.32.33.34.35\n') + g2 = ('start = digits12345\n' + 'digits12345 = %b00110001.00110010.00110011.00110100.00110101\n') + g3 = ('start = digits12345\n' + 'digits12345 = %x49.50.51.52.53\n') + #for g in (g1, g2, g3): + for g in (g1,): + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('12345'), Tree('start', [Token('digits12345', '12345')])) + + def test_operator_precedence(self): + # concatenation has higher precedence than alternation + g = ('start = "a" / "b" "c"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')])) + self.assertEqual(l.parse('a'), Tree('start', 
[Token('A', 'a')])) + + self.assertRaises((ParseError, UnexpectedInput), l.parse, 'ac') + + # grouping + g = ('start = ("a" / "b") "c"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')])) + self.assertEqual(l.parse('ac'), Tree('start', [Token('A', 'a'), Token('C', 'c')])) + + def test_unicode_match(self): + # test for 16bit unicode character + char_vals = ('%x2227', '%d8743', '%b0010001000100111') + unicode_char = '∧' + + template = ('start = sym1\n' + 'sym1 = %s\n') + grammars = [ template % i for i in char_vals] + for g in grammars: + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse(unicode_char), Tree('start', [Token('sym1', unicode_char)])) + + def test_unicode_match_emoji(self): + # test for 32bit unicode character + char_vals = ('%x1F431', '%d128049', '%b00011111010000110001') + cat_face_in_unicode = '🐱' + + template = ('start = thecat\n' + 'thecat = %s\n') + grammars = [ template % i for i in char_vals] + for g in grammars: + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse(cat_face_in_unicode), + Tree('start', [Token('thecat', cat_face_in_unicode)])) + + def test_errors(self): + for msg, examples in ABNF_GRAMMAR_ERRORS: + for example in examples: + try: + p = Lark(example, syntax='abnf') + except GrammarError as e: + assert msg in str(e) + else: + assert False, "example did not raise an error" + +if __name__ == '__main__': + main() From 3211beb49569accc3ea0da96cd6fea61fcc7206a Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 18:14:31 +0900 Subject: [PATCH 2/7] Fix lexer to allow any character in terminal names. This fix is needed for ABNF grammar support. --- lark/lexer.py | 16 ++++++++++++++-- tests/test_grammar_abnf.py | 9 +++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index c30b9dd4..aa33ff5f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -287,6 +287,12 @@ def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): self.allowed_types = {t.name for t in self.terminals} + self.pattern_name_map = {n:'__%d' % i + for i, n in enumerate({t.name for t in self.terminals + if not t.name.isidentifier()})} + + self.pattern_name_map_reverse = {altname:n for n, altname in self.pattern_name_map.items()} + self._mres = self._build_mres(terminals, len(terminals)) def _build_mres(self, terminals, max_size): @@ -296,7 +302,12 @@ def _build_mres(self, terminals, max_size): postfix = '$' if self.match_whole else '' mres = [] while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + # Replace terminal name with '__%d' if it is not a valid python identifier. + # otherwise pattern will fail to compile. 
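+            # (e.g. a terminal named 'L-ALPHA' cannot appear in '(?P<L-ALPHA>...)',
+            # since re group names must be valid identifiers; such terminals are
+            # matched under aliases like '__0' and mapped back in match().)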
+ pattern = u'|'.join(u'(?P<%s>%s)' % (self.pattern_name_map.get(t.name, t.name), + t.pattern.to_regexp() + postfix) + for t in terminals[:max_size]) + if self.use_bytes: pattern = pattern.encode('latin-1') try: @@ -312,7 +323,8 @@ def match(self, text, pos): for mre, type_from_index in self._mres: m = mre.match(text, pos) if m: - return m.group(0), type_from_index[m.lastindex] + type_ = type_from_index[m.lastindex] + return m.group(0), self.pattern_name_map_reverse.get(type_, type_) def _regexp_has_newline(r: str): diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py index 90d358b8..fc9bced1 100644 --- a/tests/test_grammar_abnf.py +++ b/tests/test_grammar_abnf.py @@ -195,6 +195,15 @@ def test_unicode_match_emoji(self): self.assertEqual(l.parse(cat_face_in_unicode), Tree('start', [Token('thecat', cat_face_in_unicode)])) + def test_terminal_rulename_with_hyphen(self): + # test to make sure that rule names may contain hyphen. + g = ('start = L-ALPHA U-ALPHA 1*DIGIT \n' + 'U-ALPHA = %x41-5A \n' + 'L-ALPHA = %x61-7A \n' + 'DIGIT = %d48-57 \n') + l = Lark(g, syntax='abnf') + self.assertEqual(l.parse(u'aA1'), Tree('start', [Token('L-ALPHA', 'a'), Token('U-ALPHA', 'A'), Token('DIGIT', '1')])) + def test_errors(self): for msg, examples in ABNF_GRAMMAR_ERRORS: for example in examples: From c6fbb0d604a1138dde23e3f5c5f972e438085004 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 22:39:20 +0900 Subject: [PATCH 3/7] Added '%import' extension to ABNF grammar Syntax: %import module %import module (rule1, rule2, ...) Example: %import core-rules ; import rules from lark/grammars/core-rules.abnf %import core-rules (CRLF, DIGITS) ; import specified rules (CRLF and DIGITS) only --- lark/load_grammar_abnf.py | 122 +++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar_abnf.py b/lark/load_grammar_abnf.py index 6fa109ce..3b8f0658 100644 --- a/lark/load_grammar_abnf.py +++ b/lark/load_grammar_abnf.py @@ -1,8 +1,11 @@ """Parses grammar written in ABNF (RFC5234 and 7405) and creates Grammar objects. """ +import hashlib +import os, sys from .load_grammar import PrepareGrammar, PrepareAnonTerminals from .load_grammar import EBNF_to_BNF, SimplifyRule_Visitor from .load_grammar import _get_parser, symbols_from_strcase, nr_deepcopy_tree +from .load_grammar import PackageResource, stdlib_loader from .utils import logger from .lexer import Token, TerminalDef, Pattern, PatternRE, PatternStr @@ -12,12 +15,14 @@ from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .tree import Tree, SlottedTree as ST -from .utils import classify, classify_bool +from .utils import classify, classify_bool, bfs from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .visitors import v_args, Transformer_InPlace, Transformer_NonRecursive, Visitor, Transformer inline_args = v_args(inline=True) +_ALL_RULES = object() +ABNF_EXT = '.abnf' # Terminals (ie. keys in TERMINALS ) shall consist of uppercase letters and underscores. 
TERMINALS = { @@ -51,6 +56,11 @@ 'C_WSP': r'[ \t]+|((;[^\n]*)*\r?\n)[ \t]+', '_C_NL': r'((;[^\n]*)*\r?\n)(?![ \t])', + # terminals for nonstandard extensions + '_IMPORT': r'%import', + '_DOT': r'\.', + '_COMMA': r',', + # define terminal for unusable charaters to see nice error messages for common pitfalls '_UNUSABLE_CHARS': r'[_@!#$&\+:]' } @@ -63,7 +73,7 @@ # rulelist = 1*( rule / (*c-wsp c-nl) ) '_rulelist': ['_item', '_rulelist _item'], - '_item': ['rule', '_C_NL' ], + '_item': ['rule', 'import', '_C_NL' ], # 'import' is nonstandard extension # There are some assumptions in rule for 'rule' # @@ -123,6 +133,18 @@ 'bin_val': [ 'BIN_VAL' ], 'prose_val': [ 'PROSE_VAL' ], + + # nonstandard extensions to ABNF grammar (%import directive) + 'import': ['_IMPORT _import_path _C_NL', + '_IMPORT _import_path _LPAR name_list _RPAR _C_NL', + ], + '_import_path': ['import_from_lib', 'import_relpath'], + 'import_from_lib': ['_import_args'], + 'import_relpath': ['_DOT _import_args'], + '_import_args': ['RULENAME', '_import_args _DOT RULENAME'], + + 'name_list': ['_name_list'], + '_name_list': ['RULENAME', '_name_list _COMMA RULENAME'], } @@ -529,6 +551,17 @@ def _is_terminal(self, tree): assert False, tree + def _remove_unused(self, used): + def rule_dependencies(symbol): + try: + tree = self._definitions[symbol] + except KeyError: + return [] + return _find_used_symbols(tree) + + _used = set(bfs(used, rule_dependencies)) + self._definitions = {k: v for k, v in self._definitions.items() if k in _used} + def _define(self, name, oper, exp): if name in self._definitions: if oper == '=/': @@ -559,12 +592,97 @@ def _unpack_definition(self, tree): return rulename, oper, rule_elements + def _unpack_import(self, stmt, grammar_name): + if len(stmt.children) > 1: + path_node, name_list = stmt.children + rules_to_import = [n.value for n in name_list.children] + else: + path_node, = stmt.children + rules_to_import = _ALL_RULES + + # '%import topdir.subdir.file' --> dotted_path=['topdir','subdir','file'] + dotted_path = tuple(path_node.children) + + if path_node.data == 'import_from_lib': # Import from lark/grammars/ + base_path = None + else: # Relative import + if grammar_name == '': # Import relative to script file path if grammar is coded in script + try: + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None + else: + base_file = grammar_name # Import relative to grammar file path if external grammar file + if base_file: + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + + return dotted_path, base_path, rules_to_import + + def do_import(self, dotted_path, base_path, rules_to_import): + + assert dotted_path + grammar_path = os.path.join(*dotted_path) + ABNF_EXT + + to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: + if callable(source): + joined_path, text = source(base_path, grammar_path) + else: + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + except IOError: + continue + else: + h = hashlib.md5(text.encode('utf8')).hexdigest() + if self.used_files.get(joined_path, h) != h: + raise RuntimeError("Grammar file was changed during importing") + self.used_files[joined_path] = h + + gb = ABNFGrammarBuilder(self.global_keep_all_tokens, 
self.import_paths, self.used_files) + gb.load_grammar(text, joined_path) + if rules_to_import != _ALL_RULES: + gb._remove_unused(rules_to_import) + + for name in gb._definitions: + if name in self._definitions: + raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) + + self._definitions.update(**gb._definitions) + break + else: + # Search failed. Make Python throw a nice error. + open(grammar_path, encoding='utf8') + assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) + def load_grammar(self, grammar_text, grammar_name=""): tree = _parse_abnf_grammar(grammar_text, grammar_name) + imports = {} + for stmt in tree.children: + if stmt.data == 'import': + dotted_path, base_path, rules_to_import = self._unpack_import(stmt, grammar_name) + try: + import_base_path, import_rules = imports[dotted_path] + assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) + import_rules.update(rules_to_import) + except KeyError: + imports[dotted_path] = base_path, rules_to_import + + for dotted_path, (base_path, rules_to_import) in imports.items(): + self.do_import(dotted_path, base_path, rules_to_import) + for stmt in tree.children: if stmt.data == 'rule': self._define(*self._unpack_definition(stmt)) + elif stmt.data == 'import': + pass else: assert False, stmt From d3a925d0fe1ed726bec6d99e2385314d91c6537a Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 23:00:30 +0900 Subject: [PATCH 4/7] added the ABNF core rules, defined in RFC5234 appendix B.1, to the standard library. --- lark/grammars/core-rules.abnf | 39 +++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 lark/grammars/core-rules.abnf diff --git a/lark/grammars/core-rules.abnf b/lark/grammars/core-rules.abnf new file mode 100644 index 00000000..7ccf69f5 --- /dev/null +++ b/lark/grammars/core-rules.abnf @@ -0,0 +1,39 @@ +; ABNF Core Rules (RFC5234 Appendix.B) + +ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +BIT = "0" / "1" +CHAR = %x01-7F + ; any 7-bit US-ASCII character, + ; excluding NUL +CR = %x0D + ; carriage return +CRLF = CR LF + ; Internet standard newline +CTL = %x00-1F / %x7F + ; controls +DIGIT = %x30-39 + ; 0-9 +DQUOTE = %x22 + ; " (Double Quote) +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" +HTAB = %x09 + ; horizontal tab +LF = %x0A + ; linefeed +LWSP = *(WSP / CRLF WSP) + ; Use of this linear-white-space rule + ; permits lines containing only white + ; space that are no longer legal in + ; mail headers and have caused + ; interoperability problems in other + ; contexts. + ; Do not use when defining mail + ; headers and use with caution in + ; other contexts. 
+OCTET = %x00-FF + ; 8 bits of data +SP = %x20 +VCHAR = %x21-7E + ; visible (printing) characters +WSP = SP / HTAB + ; white space From ef29a28b2c920276148cc9de219ded54298a8df0 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 23:09:10 +0900 Subject: [PATCH 5/7] Added tests for %import directive in ABNF grammar --- tests/grammars/ab.abnf | 8 ++++++++ tests/test_grammar_abnf.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/grammars/ab.abnf diff --git a/tests/grammars/ab.abnf b/tests/grammars/ab.abnf new file mode 100644 index 00000000..9b0f97c4 --- /dev/null +++ b/tests/grammars/ab.abnf @@ -0,0 +1,8 @@ +startab = expr + +expr = A B + / A expr B + +A = "a" +B = "b" + diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py index fc9bced1..b3314113 100644 --- a/tests/test_grammar_abnf.py +++ b/tests/test_grammar_abnf.py @@ -214,5 +214,35 @@ def test_errors(self): else: assert False, "example did not raise an error" + def test_import_from_custom_sources(self): + custom_loader = FromPackageLoader('tests', ('grammars', )) + g1 = ('start = startab \n' + '%import ab\n') + p = Lark(g1, syntax='abnf', start='start', import_paths=[custom_loader]) + self.assertEqual(p.parse('ab'), + Tree('start', [Tree('startab', [Tree('expr', [Token('A', 'a'), Token('B', 'b')])])])) + + def test_import(self): + g1 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n' + '%import core-rules\n') + # grammar error is expected since DIGIT is defined twice in both g1 and core-rules.abnf + self.assertRaises(GrammarError, Lark, g1, syntax='abnf') + + g2 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n' + '%import core-rules ( CRLF )\n') + # g2 is okay since only rule 'CRLF' is imported but 'DIGITS' is not + p = Lark(g2, syntax='abnf') + self.assertEqual(p.parse('aA1\r\n'), + Tree('start', [Token('LALPHA', 'a'), Token('UALPHA', 'A'), + Token('DIGIT', '1'), + Tree('CRLF', [Token('CR', '\r'), Token('LF', '\n')])])) + + if __name__ == '__main__': main() From 51ba8a4ebff97e0b52f18ddf7149259a1772ad38 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Tue, 12 Oct 2021 19:16:01 +0900 Subject: [PATCH 6/7] Added 'abnf_alias' decorator for transformer and visitor classes --- lark/__init__.py | 1 + lark/visitors.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/lark/__init__.py b/lark/__init__.py index 609cfc7f..d494c9a1 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,6 +1,7 @@ from .utils import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive +from .visitors import abnf_alias from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) from .lexer import Token diff --git a/lark/visitors.py b/lark/visitors.py index ae7930c0..cc18df66 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -515,6 +515,29 @@ def _visitor_args_dec(obj): return _visitor_args_dec +class abnf_alias: + """ + A decorator to make aliases for public methods such that underscores in their names + changed to hyphens. + (e.g an alias method "self.foo-bar(..)" is created for "self.foo_bar(..)". ) + + This is required to support ABNF grammar since hyphens are allowed in ABNF rules but not in + python method names. 
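+
+    A sketch of intended use (mirroring examples/url_parser_abnf.py; the class
+    name below is illustrative):
+
+        @abnf_alias
+        class PctDecoder(Transformer):
+            def pct_encoded(self, items):  # also reachable as 'pct-encoded'
+                ...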
+ """ + def __init__( self, cls ): + self._cls = cls + + for name, func in getmembers(cls): + if name.startswith('_'): + continue + if callable(getattr(cls, name)) and name.find('_') > 0: + alias = name.replace('_', '-') + setattr(cls, alias, func) + + def __call__( self, *args, **kwargs ): + instance = self._cls( *args, **kwargs ) + return instance + ###} From 18f91cc26add8ff5a6d6908cbc2f33bcc49d9e12 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Tue, 12 Oct 2021 19:18:11 +0900 Subject: [PATCH 7/7] Added example of creating parser using ABNF grammar --- examples/grammars/rfc3986.abnf | 87 ++++++++++++++++++++++++ examples/url_parser_abnf.py | 120 +++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 examples/grammars/rfc3986.abnf create mode 100644 examples/url_parser_abnf.py diff --git a/examples/grammars/rfc3986.abnf b/examples/grammars/rfc3986.abnf new file mode 100644 index 00000000..d127daf7 --- /dev/null +++ b/examples/grammars/rfc3986.abnf @@ -0,0 +1,87 @@ +; ABNF grammar from RFC3986 +; Uniform Resource Identifier (URI): Generic Syntax +; +; some terminals (e.g. DIGIT, ALPHA, ..) is defined in ABNF core rules in RFC5234. +; + +URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + +hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty + +URI-reference = URI / relative-ref + +absolute-URI = scheme ":" hier-part [ "?" query ] + +relative-ref = relative-part [ "?" query ] [ "#" fragment ] + +relative-part = "//" authority path-abempty + / path-absolute + / path-noscheme + / path-empty + +scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + +authority = [ userinfo "@" ] host [ ":" port ] +userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +host = IP-literal / IPv4address / reg-name +port = *DIGIT + +IP-literal = "[" ( IPv6address / IPvFuture ) "]" +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + +IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + +h16 = 1*4HEXDIG +ls32 = ( h16 ":" h16 ) / IPv4address +IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet + +dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + +reg-name = *( unreserved / pct-encoded / sub-delims ) + +path = path-abempty ; begins with "/" or is empty + / path-absolute ; begins with "/" but not "//" + / path-noscheme ; begins with a non-colon segment + / path-rootless ; begins with a segment + / path-empty ; zero characters + +path-abempty = *( "/" segment ) +path-absolute = "/" [ segment-nz *( "/" segment ) ] +path-noscheme = segment-nz-nc *( "/" segment ) +path-rootless = segment-nz *( "/" segment ) +path-empty = 0 + + +segment = *pchar +segment-nz = 1*pchar +segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) + ; non-zero-length segment without any colon ":" + +pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + +query = *( pchar / "/" / "?" ) +fragment = *( pchar / "/" / "?" ) + +pct-encoded = "%" HEXDIG HEXDIG + +unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +reserved = gen-delims / sub-delims +gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +sub-delims = "!" 
/ "$" / "&" / "'" / "(" / ")" + / "*" / "+" / "," / ";" / "=" + diff --git a/examples/url_parser_abnf.py b/examples/url_parser_abnf.py new file mode 100644 index 00000000..9a515c1a --- /dev/null +++ b/examples/url_parser_abnf.py @@ -0,0 +1,120 @@ +""" +Creating URL Parser from ABNF grammar in internet standards (RFCs) +================================================================== + +Usage: + python3 -m examples.url_parser_abnf https://github.com/lark%2Dparser/lark + python3 -m examples.url_parser_abnf http://user@127.0.0.1:8000/index.html + +It outputs parse tree for an URI passed as first argument. + +""" +import sys + +from lark import Lark, Transformer, v_args, abnf_alias, Token, Visitor, Tree + +grammar_in_abnf =""" +%import .grammars.rfc3986 ; import from ./grammars/rfc3986.abnf +%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf +""" + + +class SimplifyABNFTree_Visitor(Visitor): + def __init__(self, unwrap_children=(), keep=(), *args, **kwargs): + super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs) + self.unwrap = unwrap_children + self.keep = keep + + def visit(self, tree: Tree) -> Tree: + # override self.visit(), since _unwrap_and_flatten() assumes top-down visitor + self.visit_topdown(tree) + + def _unwrap_and_flatten(self, tree, unwrap_recursive=False): + """ a generator to flatten tree into list or tuple """ + do_unwrap = True if tree.data in self.unwrap or unwrap_recursive else False + + for x in tree.children: + if isinstance(x, Tree) and do_unwrap: + if x.data in self.keep: + yield self._concat_tokens(x, unwrap_recursive=True) + else: + for item in list(self._unwrap_and_flatten(x, unwrap_recursive=True)): + yield item + elif isinstance(x, Token): + yield x + else: + yield x + + + def _concat_tokens(self, tree, unwrap_recursive=False): + """ concatenate multiple tokens in tree.children into single token. + leave it as it is if there is a tree in tree.children. + """ + items = [None] + words = [] + children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive)) + + for x in children: + if isinstance(x, Token): + words.append(x.value) + if not isinstance(items[-1], Token): + items.append(x) + else: + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + items.append(x) + words=[] + + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + + tree.children = items[1:] + return tree; + + def __default__(self, tree): + return self._concat_tokens(tree) + + +@abnf_alias +class pct_encoded_conv(Transformer): + def pct_encoded(self, items): # alias for pct-encoded + # items = "%" HEXDIG HEXDIG + + # extract hexadecimal digits, convert it to a character, + # then return modified token + char_in_hex = ''.join((items[1].children[0], items[2].children[0])) + char_ = bytearray.fromhex(char_in_hex).decode() + token = items[0].update(value=char_) + return token + +def main(): + url = sys.argv[1] + + url_parser = Lark(grammar_in_abnf, + # using ABNF grammar + syntax='abnf', + start='URI', + # use earley parser since RFC3986 is too complex for LALR. + parser='earley', + # usually needed to set keep_all_tokens=True when ABNF grammar is used. + keep_all_tokens=True, + ) + tree = url_parser.parse(url) + + # Convert pct-encoded (e.g. 
+    #   '%2D' in the given URL) to ASCII characters
+    transformer = pct_encoded_conv()
+    tree = transformer.transform(tree)
+
+    # We need some post-processing to unwrap unwanted tree nodes and to concatenate
+    # ABNF tokens into the tokens we actually want, since ABNF grammars in RFCs
+    # tend to split the input into very small units, often single characters.
+    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
+              'path-abempty', 'path-absolute', 'path-noscheme', 'path-rootless')
+    simplifier = SimplifyABNFTree_Visitor(unwrap_children=unwrap)
+    simplifier.visit(tree)
+
+    print(tree.pretty())
+
+
+if __name__ == '__main__':
+    main()