From eb9c4c099bf26bdb3936af21ff49e3684b9e6638 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 17:56:06 +0900 Subject: [PATCH 1/7] Added support for grammar in ABNF format (RFC5234) --- lark/lark.py | 17 +- lark/load_grammar_abnf.py | 597 +++++++++++++++++++++++++++++++++++++ tests/__main__.py | 1 + tests/test_grammar_abnf.py | 209 +++++++++++++ 4 files changed, 822 insertions(+), 2 deletions(-) create mode 100644 lark/load_grammar_abnf.py create mode 100644 tests/test_grammar_abnf.py diff --git a/lark/lark.py b/lark/lark.py index f29d444d..d470f925 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,6 +16,7 @@ from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource +from .load_grammar_abnf import load_abnf_grammar, ABNFGrammar from .tree import Tree from .common import LexerConf, ParserConf @@ -98,6 +99,11 @@ class LarkOptions(Serialize): Prevent the tree builder from automagically removing "punctuation" tokens (default: False) tree_class Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``. + syntax + Syntax for grammar specification. + + - "lark" (default): Lark's EBNF based syntax + - "abnf" : ABNF syntax, described in RFC5234. Various extentions in Lark's EBNF syntax are not supported. **=== Algorithm Options ===** @@ -169,6 +175,7 @@ class LarkOptions(Serialize): 'use_bytes': False, 'import_paths': [], 'source_path': None, + 'syntax': 'lark', } def __init__(self, options_dict): @@ -326,11 +333,17 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # In practice the only relevant thing that might have been overriden should be `options` self.options = old_options + assert_config(self.options.syntax, ('lark', 'abnf')) # Parse the grammar file and compose the grammars - self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + if self.options.syntax == 'lark': + self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + elif self.options.syntax == 'abnf': + self.grammar, used_files = load_abnf_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + else: + assert False, self.options.syntax else: - assert isinstance(grammar, Grammar) + assert isinstance(grammar, (Grammar, ABNFGrammar)) self.grammar = grammar diff --git a/lark/load_grammar_abnf.py b/lark/load_grammar_abnf.py new file mode 100644 index 00000000..6fa109ce --- /dev/null +++ b/lark/load_grammar_abnf.py @@ -0,0 +1,597 @@ +"""Parses grammar written in ABNF (RFC5234 and 7405) and creates Grammar objects. 
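
A minimal usage sketch (the one-rule grammar below is made up for illustration;
this loader is selected through the ``syntax='abnf'`` option of ``Lark``):

    >>> from lark import Lark
    >>> parser = Lark('greeting = "hello" / "bye"', syntax='abnf', start='greeting')
    >>> parser.parse('hello')
    Tree('greeting', [])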
""" + +from .load_grammar import PrepareGrammar, PrepareAnonTerminals +from .load_grammar import EBNF_to_BNF, SimplifyRule_Visitor +from .load_grammar import _get_parser, symbols_from_strcase, nr_deepcopy_tree + +from .utils import logger +from .lexer import Token, TerminalDef, Pattern, PatternRE, PatternStr + +from .parse_tree_builder import ParseTreeBuilder +from .parser_frontends import ParsingFrontend +from .common import LexerConf, ParserConf +from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY +from .tree import Tree, SlottedTree as ST +from .utils import classify, classify_bool +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken + +from .visitors import v_args, Transformer_InPlace, Transformer_NonRecursive, Visitor, Transformer +inline_args = v_args(inline=True) + + +# Terminals (ie. keys in TERMINALS ) shall consist of uppercase letters and underscores. +TERMINALS = { + '_LPAR': r'\(', + '_RPAR': r'\)', + '_LBRA': r'\[', + '_RBRA': r'\]', + '_STAR' : r'\*', + '_SLASH' : r'/', + + 'RULENAME': r'[a-zA-Z][a-zA-Z0-9\-]*', + 'EQ': r'=', + 'EQ_ALT': r'=/', + + '_IGNORE_CASE': r'%i', + '_CASE_SENSITIVE': r'%s', + + # quoted-string = DQUOTE *(%x20-21 / %x23-7E) DQUOTE + 'QSTRING': r'"[ !#$%&\'\(\)\*\+,\-\./0-9:;<=>\?@A-Z\[\\\]\^_a-z\{|\}~]*"', + + # prose-val = "<" *(%x20-3D / %x3F-7E) ">" + 'PROSE_VAL' : r'<[ !"#$%&\'\(\)\*\+,\-\./0-9:;<=\?@A-Z\[\\\]\^_a-z\{|\}~]*>', + + 'NUMBER' : r'[0-9]+', + + 'DEC_VAL': r'%d([0-9]+(\.[0-9]+)+|[0-9]+\-[0-9]+|[0-9]+)', + 'HEX_VAL': r'%x([0-9A-F]+(\.[0-9A-F]+)+|[0-9A-F]+\-[0-9A-F]+|[0-9A-F]+)', + 'BIN_VAL': r'%b([01]+(\.[01]+)+|[01]+\-[01]+|[01]+)', + + # c-wsp = WSP / (c-nl WSP) + 'C_WSP': r'[ \t]+|((;[^\n]*)*\r?\n)[ \t]+', + '_C_NL': r'((;[^\n]*)*\r?\n)(?![ \t])', + + # define terminal for unusable charaters to see nice error messages for common pitfalls + '_UNUSABLE_CHARS': r'[_@!#$&\+:]' +} +_TERMINALS_TO_IGNORE=['C_WSP'] + + +# Name of rules (ie. keys in RULES below) shall consist of lowercase letters and underscores. +RULES = { + 'start': ['_rulelist'], + + # rulelist = 1*( rule / (*c-wsp c-nl) ) + '_rulelist': ['_item', '_rulelist _item'], + '_item': ['rule', '_C_NL' ], + + # There are some assumptions in rule for 'rule' + # + # - Name of the rule definition shall be 'rule' + # - First element in the lefthand side of the rule shall be named as 'RULENAME' + # - '_c-nl' cannot be renamed to 'c-nl', + # otherwise self._unpack_definition() will fail to capture 'elements' + # + 'rule': ['RULENAME _defined_as elements _C_NL'], + + '_defined_as': [ 'EQ', 'EQ_ALT' ], + + # elements = alternation *c-wsp + # alternation = concatenation *(*c-wsp "/" *c-wsp concatenation) + # concatenation = repetition *(1*c-wsp repetition) + # repetition = [repeat] element + # + 'elements': [ 'alternation' ], + 'alternation': [ '_alternation'], + '_alternation': [ 'concatenation', '_alternation _SLASH concatenation'], + 'concatenation': [ '_concatenation'], + '_concatenation':[ 'repetition', '_concatenation repetition'], + + 'repetition': [ 'element', 'repeat element' ], + + # repeat = 1*DIGIT / (*DIGIT "*" *DIGIT) + 'repeat': [ 'repeat_min _STAR repeat_max', + 'repeat_min _STAR', + '_STAR repeat_max', + '_STAR', + 'repeat_n' ], + + 'repeat_n': [ 'NUMBER' ], + 'repeat_min': [ 'NUMBER' ], + 'repeat_max': [ 'NUMBER' ], + + 'element': [ 'RULENAME', '_group', 'option', 'char_val', 'num_val', 'prose_val'], + + # 'group' is inlined intentionally. + # + # grouping will produces nested 'alternation' rule tree. 
    # (e.g. '"a" | ("b")' in ABNF produces 'alternation("a", alternation("b"))' in AST terms.)
    #
    # Such nested and redundant rules will be flattened later
    # by SimplifyRule_Visitor()._flatten().
    '_group': [ '_LPAR alternation _RPAR' ],
    'option': [ '_LBRA alternation _RBRA' ],

    'char_val': [ 'case_insensitive_string', 'case_sensitive_string' ],
    'case_insensitive_string': [ '_IGNORE_CASE QSTRING', 'QSTRING' ],
    'case_sensitive_string': [ '_CASE_SENSITIVE QSTRING' ],

    'num_val': [ 'dec_val', 'bin_val', 'hex_val' ],

    'dec_val': [ 'DEC_VAL' ],
    'hex_val': [ 'HEX_VAL' ],
    'bin_val': [ 'BIN_VAL' ],

    'prose_val': [ 'PROSE_VAL' ],
}


class ABNF_to_BNF(EBNF_to_BNF):
    """Converts rule trees parsed from an ABNF grammar into BNF.
    We reuse super()._add_repeat_rule() etc. from EBNF_to_BNF via inheritance.
    """

    def _add_recurse_rule(self, type_, element, repeat_min):
        assert repeat_min >= 1

        new_name = self._name_rule(type_)
        t = NonTerminal(new_name)
        tree = ST('alternation', [
            ST('concatenation', [element] * repeat_min),
            ST('concatenation', [t, element])
        ])
        return self._add_rule(element, new_name, tree)

    def option(self, items):
        assert len(items) == 1

        # RFC5234 Section 3.8: Optional Sequence: [RULE]
        empty = ST('concatenation', [])
        alternation = items[0]
        return ST('alternation', [alternation, empty])

    def repetition(self, items):
        if len(items) == 1:
            # no repetition
            return items[0]

        repeat = items[0]
        element = items[1]

        rmin = [ x for x in repeat.find_data('repeat_min') ]
        rmax = [ x for x in repeat.find_data('repeat_max') ]
        rnum = [ x for x in repeat.find_data('repeat_n') ]

        rmin = int(rmin[0].children[0].value) if len(rmin) else 0
        rmax = int(rmax[0].children[0].value) if len(rmax) else None
        rnum = int(rnum[0].children[0].value) if len(rnum) else None

        if rnum is not None:
            # Specific Repetition '<n>Rule'
            if rnum == 0:
                empty = ST('concatenation', [])
                return ST('alternation', [empty])

            else:
                rmin = rmax = rnum
        else:
            # Variable Repetition '<a>*<b>Rule', where <a> and <b> are optional
            if rmax is None:
                if rmin == 0:
                    # '*Rule' (or '0*Rule')
                    new_name = self._add_recurse_rule('star', element, 1)
                    empty = ST('concatenation', [])
                    return ST('alternation', [new_name, empty])
                else:
                    # '<a>*Rule'
                    return self._add_recurse_rule('repeat_min', element, rmin)

            else:
                # '*<b>Rule' or '<a>*<b>Rule'
                pass

        if rmax < rmin or rmin < 0:
            raise GrammarError("Bad repetition (%d*%d isn't allowed)" % (rmin, rmax))

        return self._generate_repeats(element, rmin, rmax)

class RenameRule_Visitor(Visitor):
    """Rename ABNF rule names to EBNF ones so that SimplifyRule_Visitor() can be reused.
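
    For illustration, a rule tree like
        Tree('alternation', [Tree('concatenation', [...]), ...])
    is renamed in place to
        Tree('expansions', [Tree('expansion', [...]), ...])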
""" + def concatenation(self, tree): + tree.data = 'expansion' + + def alternation(self, tree): + tree.data = 'expansions' + + +class ABNFRuleTreeToText(Transformer): + + def expansion(self, symbols): + # renamed from 'concatenation' + return symbols + + def expansions(self, x): + # renamed from 'alternation' + return x + + def elements(self, x): + return x[0] + + def prose_val(self, x): + prose = x[0] + raise GrammarError("This ABNF cannot be used to generate parsers " + "since it has prose (informal) descriptions at line %s column %s" + % (prose.line, prose.column)) + + +@inline_args +class PrepareLiterals(Transformer_InPlace): + """ convert literals (char-val and num-val tokens in ABNF) into regexps """ + def char_val(self, char_val): + literal = char_val.children[0].value + text = literal[1:-1] # remove double quotes + if char_val.data == 'case_insensitive_string': + flags = ('i') + else: + flags = () + + return ST('pattern', [PatternStr(text, flags=flags, raw=literal)]) + + def _char_to_pattern(self, num_val_literal, base): + char = int(num_val_literal, base=base) + if char > 0xffffffff: + raise GrammarError("Terminal value characters larger than 0xffffffff is not supported.") + elif char > 0xffff: + regexp = r'\U{:08x}'.format(char) + elif char > 0xff: + regexp = r'\u{:04x}'.format(char) + else: + regexp = r'\x{:02x}'.format(char) + return regexp + + def _value_range_to_pattern(self, num_val, base=10): + literal = num_val.value[2:] + if literal.find('.') > 0: + # '.' concatenation of values + nums = ( self._char_to_pattern(num, base) for num in literal.split('.') ) + regexp = ''.join(nums) + + elif literal.find('-') > 0: + # '-' value range + start, end = ( self._char_to_pattern(num, base) for num in literal.split('-') ) + regexp = r'[%s-%s]' % (start, end) + else: + regexp = self._char_to_pattern(literal, base) + + # list is unpacked in self.num_val() + return [ ST('pattern', [PatternRE(regexp)]) ] + + def hex_val(self, literal): + return self._value_range_to_pattern(literal, base=16) + def dec_val(self, literal): + return self._value_range_to_pattern(literal, base=10) + def bin_val(self, literal): + return self._value_range_to_pattern(literal, base=2) + + def num_val(self, items): + return items[0] + + +class PruneTerminalTreeToPattern(Transformer_NonRecursive): + """ + simplify terminal-tree by converting it into single instance of PatternRE or PatternStr, + which is created by PrepareLiterals().transform(). 
+ """ + def pattern(self, ps): + p ,= ps + return p; + + def elements(self, items): + assert len(items) == 1 + return items[0] + + def alternation(self, items): + assert len(items) == 1 + return items[0] + + def concatenation(self, items): + assert len(items) == 1 + return items[0] + + def repetition(self, items): + assert len(items) == 1 + return items[0] + + def element(self, items): + assert len(items) == 1 + return items[0] + + def num_val(self, items): + assert len(items) == 1 and isinstance(items[0], Pattern) + return items[0] + +class PrepareRuleNames(Transformer_InPlace): + def __init__(self, rule_names): + self.rule_names = rule_names + + def element(self, v): + v ,= v + if isinstance(v, Tree): + return v + + assert isinstance(v, Token) + if v.type == 'RULENAME': + if v.value in self.rule_names: + return NonTerminal(str(v.value)) + + return Terminal(str(v.value)) + + assert False + + +class ABNFGrammar: + def __init__(self, rule_defs, term_defs, ignore): + self.term_defs = term_defs + self.rule_defs = rule_defs + self.ignore = ignore + + def compile(self, start, terminals_to_keep): + # We change the trees in-place (to support huge grammars) + # So deepcopy allows calling compile more than once. + term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] + rule_defs = [(n, nr_deepcopy_tree(t), o) for n, t, o in self.rule_defs] + + # =================== + # Compile Terminals + # =================== + + # This transformer applies PrepareLiterals first. + # It converts literals to regexps and place them in instances of PatternRE or PatternStr. + # + # Next, PruneTerminalTreeToPattern is applied to simplify terminal-tree to + # single instance of PatternRE or PatternStr. + + transformer = PrepareLiterals() * PruneTerminalTreeToPattern() + + terminal_list = [TerminalDef(name, transformer.transform(term_tree), priority) + for name, (term_tree, priority) in term_defs if term_tree] + + # ================= + # Compile Rules + # ================= + + # convert literals in rule_defs to Terminals, rule names to NonTerminals. + rule_names = [n for n, _t, _o in self.rule_defs] + transformer = PrepareLiterals() * PrepareRuleNames(rule_names) + + # convert anonymous terminals (i.e. literals in the right-hand-side of ABNF rules) + # to terminals and add them to terminal_list + + anon_tokens_transf = PrepareAnonTerminals(terminal_list) + transformer *= anon_tokens_transf + + # Convert ABNF to BNF. It will convert as follows: + # - repetitions (e.g. 1*DIGIT) -> recursive rules or repetition of symbols, + # - optional sequences (e.g. [ "word" ] ) -> alternation (e.g. 
' "word" | "" ' ) + + abnf_to_bnf = ABNF_to_BNF() + + rules = [] + for name, rule_tree, options in rule_defs: + rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None + abnf_to_bnf.rule_options = rule_options + abnf_to_bnf.prefix = name + tree = transformer.transform(rule_tree) + res = abnf_to_bnf.transform(tree) + rules.append((name, res, options)) + + # add recursive rules generated in abnf_to_bnf.transform() + rules += abnf_to_bnf.new_rules + + # Compile tree to Rule objects + + # rename ABNF rule names to EBNF ones to reuse SimplifyRule_Visitor() + # ('alternation' in ABNF -> 'expansions', 'concatenation' in ABNF -> 'expansion' ) + rename_rule = RenameRule_Visitor() + + # unpack some rule trees and simplify nested rule tree in expansion and expansions + simplify_rule = SimplifyRule_Visitor() + + # unpack Tree objects to list of symbols + rule_tree_to_text = ABNFRuleTreeToText() + + compiled_rules = [] + for rule_content in rules: + name, tree, options = rule_content + + rename_rule.visit(tree) + simplify_rule.visit(tree) + + expansions = rule_tree_to_text.transform(tree) + + for i, expansion in enumerate(expansions): + + alias = None + exp_options = options + rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) + compiled_rules.append(rule) + + # assertion will fail if there are duplicates of rules + assert len(set(compiled_rules)) == len(compiled_rules) + + # Filter out unused rules + while True: + c = len(compiled_rules) + used_rules = {s for r in compiled_rules + for s in r.expansion + if isinstance(s, NonTerminal) + and s != r.origin} + used_rules |= {NonTerminal(s) for s in start} + compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) + for r in unused: + logger.debug("Unused rule: %s", r) + if len(compiled_rules) == c: + break + + # Filter out unused terminals + if terminals_to_keep != '*': + used_terms = {t.name for r in compiled_rules + for t in r.expansion + if isinstance(t, Terminal)} + terminal_list, unused = classify_bool(terminal_list, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) + if unused: + logger.debug("Unused terminals: %s", [t.name for t in unused]) + + return terminal_list, compiled_rules, self.ignore + + + +def _find_used_symbols(tree): + assert tree.data == 'elements' + return {t for x in tree.find_data('element') + for t in x.scan_values(lambda t: t.type in ('RULENAME'))} + +def _get_abnf_parser(): + try: + return _get_abnf_parser.cache + except AttributeError: + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + + rules = [(rulename, exp, RuleOptions()) for rulename, exp in RULES.items()] + + rules = [Rule(NonTerminal(rulename), symbols_from_strcase(x.split()), i, None, o) + for rulename, elements, o in rules for i, x in enumerate(elements)] + + callback = ParseTreeBuilder(rules, ST).create_callback() + import re + lexer_conf = LexerConf(terminals, re, _TERMINALS_TO_IGNORE) + parser_conf = ParserConf(rules, callback, ['start']) + lexer_conf.lexer_type = 'basic' + parser_conf.parser_type = 'lalr' + _get_abnf_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) + return _get_abnf_parser.cache + +ABNF_GRAMMAR_ERRORS = [ + ('Unclosed parenthesis', ['a = ( \n']), + ('Unclosed bracket', ['a = [ \n']), + ('Incorrect type of value', ['a = 1\n']), + ('Unmatched closing parenthesis', ['a = )\n', 'a = ]\n', 'a = [)\n', 'a = (]\n']), + ('Expecting rule or terminal definition (missing "=" or 
"=/")', + ['a\n', 'a A\n', 'a /= A\n', 'a == A\n']), + ('Unexpected character, which is not usable in ABNF grammar', ['a@rule = "a rule"\n']), +] + +def _translate_parser_exception(parse, e): + error = e.match_examples(parse, ABNF_GRAMMAR_ERRORS, use_accepts=True) + if error: + return error + elif 'STRING' in e.expected: + return "Expecting a value" + +def _parse_abnf_grammar(text, name, start='start'): + try: + tree = _get_abnf_parser().parse(text + '\n', start) + except UnexpectedCharacters as e: + context = e.get_context(text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = _translate_parser_exception(_get_abnf_parser().parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise + + return PrepareGrammar().transform(tree) + +class ABNFGrammarBuilder: + def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): + self.global_keep_all_tokens = global_keep_all_tokens + self.import_paths = import_paths or [] + self.used_files = used_files or {} + + self._definitions = {} + self._ignore_names = [] + + def _is_terminal(self, tree): + if not isinstance(tree, Tree): + # it would be a token (RULENAME). it is non-terminal. + return False + + # It is a terminal if rule reduces to single instance of char-val or num-val. + if len(tree.children) > 1: + return False + elif len(tree.children) == 1: + if tree.data in ('char_val', 'num_val'): + return True + else: + return self._is_terminal(tree.children[0]) + + assert False, tree + + def _define(self, name, oper, exp): + if name in self._definitions: + if oper == '=/': + + assert isinstance(exp.children[0].children[0], Tree) + assert exp.children[0].children[0].data == 'concatenation' + + # unify incremental alternatives into existing alternatives + base_exp = self._definitions[name] + base_exp.children[0].children += exp.children[0].children + return + + raise GrammarError("Rule '%s' defined more than once" % name) + + if name.startswith('__'): + raise GrammarError("Names starting with double-underscore are reserved (Error at '%s'})" % name) + + self._definitions[name] = exp + + def _unpack_definition(self, tree): + assert tree.data == 'rule' + rulename = tree.children[0].value + oper = tree.children[1].value # '=' or '=/' + rule_elements = tree.children[-1] + + assert isinstance(rule_elements, Tree) and rule_elements.data == 'elements' + + return rulename, oper, rule_elements + + + def load_grammar(self, grammar_text, grammar_name=""): + tree = _parse_abnf_grammar(grammar_text, grammar_name) + + for stmt in tree.children: + if stmt.data == 'rule': + self._define(*self._unpack_definition(stmt)) + else: + assert False, stmt + + def validate(self): + for name, elements in self._definitions.items(): + for sym in _find_used_symbols(elements): + if sym not in self._definitions: + raise GrammarError("Rule '%s' used but not defined in %s" % (sym, name)) + + def build(self): + rule_defs = [] + term_defs = [] + prio = TOKEN_DEFAULT_PRIORITY + for name, exp in self._definitions.items(): + if self._is_terminal(exp): + options = prio + term_defs.append((name, (exp, options))) + else: + options = RuleOptions(keep_all_tokens=self.global_keep_all_tokens, + expand1=False, priority=prio, template_source=None) + rule_defs.append((name, exp, options)) + + return ABNFGrammar(rule_defs, term_defs, self._ignore_names) + + +def 
load_abnf_grammar(grammar, source, import_paths, global_keep_all_tokens): + builder = ABNFGrammarBuilder(global_keep_all_tokens, import_paths) + builder.load_grammar(grammar, source) + builder.validate() + return builder.build(), builder.used_files diff --git a/tests/__main__.py b/tests/__main__.py index b8d39712..6c576a4e 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -8,6 +8,7 @@ from .test_tools import TestStandalone from .test_cache import TestCache from .test_grammar import TestGrammar +from .test_grammar_abnf import TestABNFGrammar from .test_reconstructor import TestReconstructor from .test_tree_forest_transformer import TestTreeForestTransformer from .test_lexer import TestLexer diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py new file mode 100644 index 00000000..90d358b8 --- /dev/null +++ b/tests/test_grammar_abnf.py @@ -0,0 +1,209 @@ +from __future__ import absolute_import + +import os +from unittest import TestCase, main + +from lark import Lark, Token, Tree, ParseError, UnexpectedInput, UnexpectedCharacters +from lark.load_grammar import GrammarError +from lark.load_grammar import FromPackageLoader +from lark.load_grammar_abnf import ABNF_GRAMMAR_ERRORS + +class TestABNFGrammar(TestCase): + def setUp(self): + pass + + def test_charval_case_insensitive(self): + p = Lark('rulename = %i"aBc" / "xyz"\n', syntax='abnf', start='rulename') + abcs = ["abc", "Abc", "aBc", "abC", "ABc", "aBC", "AbC", "ABC"] + xyzs = ["xyz", "Xyz", "XYZ" ] + for i in abcs + xyzs: + self.assertEqual(p.parse(i), Tree('rulename', [])) + + def test_charval_case_sensitive(self): + p = Lark('rulename = %s"aBc" / %s"xyZ"\n', syntax='abnf', start='rulename') + for i in ('aBc', 'xyZ'): + self.assertEqual(p.parse(i), Tree('rulename', [])) + + for i in ('abc', 'xYy'): + self.assertRaises(UnexpectedCharacters, p.parse, i) + + def test_inline_numval(self): + # test for anonymous rules generated for inline num-val (%x22) + g = ('cat = %x40 "cat" %x40\n') + l = Lark(g, syntax='abnf', start='cat', keep_all_tokens=True) + self.assertEqual(l.parse('@cat@'), + Tree('cat', [Token('__ANON_0', '@'), Token('CAT', 'cat'), Token('__ANON_0', '@')])) + + def test_basic_abnf(self): + # test for alternatives, concatenation, and grouping + g1 =('beef = %s"bEEf" / beef2 / (BE EF) \n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + # the same rule in multiple lines with comments + g2 =(' ; rules \n' + 'beef = %s"bEEf" \n' + ' / beef2 ; word "beef" in lowercase \n' + ' / (BE EF) ; bytes sequence [0xbe,0xef] \n' + ';terminals \n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + # the same rule using incremental alternatives + g3 = ('beef = %s"bEEf"\n' + 'beef =/ beef2 \n' + 'beef =/ (BE EF)\n' + 'BE = %xBE\n' + 'EF = %xEF\n' + 'beef2 = %s"beef"\n') + + for g in (g1, g2, g3): + l = Lark(g, syntax='abnf', start='beef', keep_all_tokens=True) + self.assertEqual(l.parse(u'beef'), Tree('beef', [Token('beef2', 'beef')])) + self.assertEqual(l.parse(u'bEEf'), Tree('beef', [Token('BEEF', 'bEEf')])) + self.assertEqual(l.parse(u'\xbe\xef'), Tree('beef', [Token('BE', '\xbe'), Token('EF', '\xef')])) + + # undefined rule + g = g3 + 'unused-rule = BE EF beef3\n' + self.assertRaises(GrammarError, Lark, g, syntax='abnf', start='beef') + + def test_optional(self): + g = ('start = [ foo ] bar\n' + 'foo = "foo"\n' + 'bar = "bar"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('foobar'), + Tree('start', [Token('foo', 'foo'), Token('bar', 'bar')])) + 
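+        # ABNF_to_BNF.option() expands "[ foo ]" into "(foo / <empty>)",
+        # so the same grammar must also accept the input without 'foo':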
self.assertEqual(l.parse('bar'), + Tree('start', [Token('bar', 'bar')])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, 'foo') + + + def test_repetition(self): + g = ('start = rep-inf / rep-fixed \n' + 'rep-inf = *"X"\n' + 'rep-fixed = 3"F"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('XXX'), Tree('start', [Tree('rep-inf', [])])) + self.assertEqual(l.parse(''), Tree('start', [Tree('rep-inf', [])])) + self.assertEqual(l.parse('FFF'), Tree('start', [Tree('rep-fixed', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FF') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FFFF') + + def test_repetition_range(self): + g = ('start = rep-range / rep-atleast / rep-atmost\n' + 'rep-range = 2*4%s"R"\n' + 'rep-atleast = 3*"L"\n' + 'rep-atmost = *5"M"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=False) + + self.assertEqual(l.parse('RRR'), Tree('start', [Tree('rep-range', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRRRR') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'R') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRr') # case sensitive + + self.assertEqual(l.parse('LlL'), Tree('start', [Tree('rep-atleast', [])])) # case insensitive + self.assertEqual(l.parse('LLLL'), Tree('start', [Tree('rep-atleast', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'LL') + + self.assertEqual(l.parse('mmm'), Tree('start', [Tree('rep-atmost', [])])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'mmmmmm') + + def test_zero_repetition(self): + g1 = ('start = ("cat" / "dog" / empty) "food" \n' + 'empty = 0\n') + l = Lark(g1, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse("catfood"), Tree('start', [Token('CAT', 'cat'), Token('FOOD', 'food')])) + self.assertEqual(l.parse("dogfood"), Tree('start', [Token('DOG', 'dog'), Token('FOOD', 'food')])) + self.assertEqual(l.parse("food"), Tree('start', [Tree('empty', []), Token('FOOD', 'food')])) + self.assertRaises((UnexpectedInput), l.parse, u"petfood") + + def test_literal_range(self): + + g1 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n') + g2 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %d48-57 \n') + g3 = ('start = LALPHA UALPHA 1*DIGIT\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %b00110000-00111001 \n') + for g in (g1, g2, g3): + l = Lark(g, syntax='abnf') + for i in (0,1,2,3,4,5,6,7,8,9): + self.assertEqual(l.parse('lU%d' % i), + Tree('start', [Token('LALPHA', 'l'), Token('UALPHA', 'U'), + Token('DIGIT', '%d' % i)])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'lU0123456789:') + + + def test_literal_concatenation(self): + g1 = ('start = digits12345\n' + 'digits12345 = %x31.32.33.34.35\n') + g2 = ('start = digits12345\n' + 'digits12345 = %b00110001.00110010.00110011.00110100.00110101\n') + g3 = ('start = digits12345\n' + 'digits12345 = %x49.50.51.52.53\n') + #for g in (g1, g2, g3): + for g in (g1,): + l = Lark(g, syntax='abnf', keep_all_tokens=False) + self.assertEqual(l.parse('12345'), Tree('start', [Token('digits12345', '12345')])) + + def test_operator_precedence(self): + # concatenation has higher precedence than alternation + g = ('start = "a" / "b" "c"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')])) + self.assertEqual(l.parse('a'), Tree('start', 
[Token('A', 'a')])) + + self.assertRaises((ParseError, UnexpectedInput), l.parse, 'ac') + + # grouping + g = ('start = ("a" / "b") "c"\n') + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')])) + self.assertEqual(l.parse('ac'), Tree('start', [Token('A', 'a'), Token('C', 'c')])) + + def test_unicode_match(self): + # test for 16bit unicode character + char_vals = ('%x2227', '%d8743', '%b0010001000100111') + unicode_char = '∧' + + template = ('start = sym1\n' + 'sym1 = %s\n') + grammars = [ template % i for i in char_vals] + for g in grammars: + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse(unicode_char), Tree('start', [Token('sym1', unicode_char)])) + + def test_unicode_match_emoji(self): + # test for 32bit unicode character + char_vals = ('%x1F431', '%d128049', '%b00011111010000110001') + cat_face_in_unicode = '🐱' + + template = ('start = thecat\n' + 'thecat = %s\n') + grammars = [ template % i for i in char_vals] + for g in grammars: + l = Lark(g, syntax='abnf', keep_all_tokens=True) + self.assertEqual(l.parse(cat_face_in_unicode), + Tree('start', [Token('thecat', cat_face_in_unicode)])) + + def test_errors(self): + for msg, examples in ABNF_GRAMMAR_ERRORS: + for example in examples: + try: + p = Lark(example, syntax='abnf') + except GrammarError as e: + assert msg in str(e) + else: + assert False, "example did not raise an error" + +if __name__ == '__main__': + main() From 3211beb49569accc3ea0da96cd6fea61fcc7206a Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 18:14:31 +0900 Subject: [PATCH 2/7] Fix lexer to allow any character in terminal names. This fix is needed for ABNF grammar support. --- lark/lexer.py | 16 ++++++++++++++-- tests/test_grammar_abnf.py | 9 +++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index c30b9dd4..aa33ff5f 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -287,6 +287,12 @@ def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): self.allowed_types = {t.name for t in self.terminals} + self.pattern_name_map = {n:'__%d' % i + for i, n in enumerate({t.name for t in self.terminals + if not t.name.isidentifier()})} + + self.pattern_name_map_reverse = {altname:n for n, altname in self.pattern_name_map.items()} + self._mres = self._build_mres(terminals, len(terminals)) def _build_mres(self, terminals, max_size): @@ -296,7 +302,12 @@ def _build_mres(self, terminals, max_size): postfix = '$' if self.match_whole else '' mres = [] while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + # Replace terminal name with '__%d' if it is not a valid python identifier. + # otherwise pattern will fail to compile. 
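+            # (e.g. a terminal named 'L-ALPHA' cannot appear in '(?P<L-ALPHA>...)',
+            # since re group names must be valid identifiers; such terminals are
+            # matched under aliases like '__0' and mapped back in match().)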
+ pattern = u'|'.join(u'(?P<%s>%s)' % (self.pattern_name_map.get(t.name, t.name), + t.pattern.to_regexp() + postfix) + for t in terminals[:max_size]) + if self.use_bytes: pattern = pattern.encode('latin-1') try: @@ -312,7 +323,8 @@ def match(self, text, pos): for mre, type_from_index in self._mres: m = mre.match(text, pos) if m: - return m.group(0), type_from_index[m.lastindex] + type_ = type_from_index[m.lastindex] + return m.group(0), self.pattern_name_map_reverse.get(type_, type_) def _regexp_has_newline(r: str): diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py index 90d358b8..fc9bced1 100644 --- a/tests/test_grammar_abnf.py +++ b/tests/test_grammar_abnf.py @@ -195,6 +195,15 @@ def test_unicode_match_emoji(self): self.assertEqual(l.parse(cat_face_in_unicode), Tree('start', [Token('thecat', cat_face_in_unicode)])) + def test_terminal_rulename_with_hyphen(self): + # test to make sure that rule names may contain hyphen. + g = ('start = L-ALPHA U-ALPHA 1*DIGIT \n' + 'U-ALPHA = %x41-5A \n' + 'L-ALPHA = %x61-7A \n' + 'DIGIT = %d48-57 \n') + l = Lark(g, syntax='abnf') + self.assertEqual(l.parse(u'aA1'), Tree('start', [Token('L-ALPHA', 'a'), Token('U-ALPHA', 'A'), Token('DIGIT', '1')])) + def test_errors(self): for msg, examples in ABNF_GRAMMAR_ERRORS: for example in examples: From c6fbb0d604a1138dde23e3f5c5f972e438085004 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 22:39:20 +0900 Subject: [PATCH 3/7] Added '%import' extension to ABNF grammar Syntax: %import module %import module (rule1, rule2, ...) Example: %import core-rules ; import rules from lark/grammars/core-rules.abnf %import core-rules (CRLF, DIGITS) ; import specified rules (CRLF and DIGITS) only --- lark/load_grammar_abnf.py | 122 +++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar_abnf.py b/lark/load_grammar_abnf.py index 6fa109ce..3b8f0658 100644 --- a/lark/load_grammar_abnf.py +++ b/lark/load_grammar_abnf.py @@ -1,8 +1,11 @@ """Parses grammar written in ABNF (RFC5234 and 7405) and creates Grammar objects. """ +import hashlib +import os, sys from .load_grammar import PrepareGrammar, PrepareAnonTerminals from .load_grammar import EBNF_to_BNF, SimplifyRule_Visitor from .load_grammar import _get_parser, symbols_from_strcase, nr_deepcopy_tree +from .load_grammar import PackageResource, stdlib_loader from .utils import logger from .lexer import Token, TerminalDef, Pattern, PatternRE, PatternStr @@ -12,12 +15,14 @@ from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .tree import Tree, SlottedTree as ST -from .utils import classify, classify_bool +from .utils import classify, classify_bool, bfs from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .visitors import v_args, Transformer_InPlace, Transformer_NonRecursive, Visitor, Transformer inline_args = v_args(inline=True) +_ALL_RULES = object() +ABNF_EXT = '.abnf' # Terminals (ie. keys in TERMINALS ) shall consist of uppercase letters and underscores. 
TERMINALS = { @@ -51,6 +56,11 @@ 'C_WSP': r'[ \t]+|((;[^\n]*)*\r?\n)[ \t]+', '_C_NL': r'((;[^\n]*)*\r?\n)(?![ \t])', + # terminals for nonstandard extensions + '_IMPORT': r'%import', + '_DOT': r'\.', + '_COMMA': r',', + # define terminal for unusable charaters to see nice error messages for common pitfalls '_UNUSABLE_CHARS': r'[_@!#$&\+:]' } @@ -63,7 +73,7 @@ # rulelist = 1*( rule / (*c-wsp c-nl) ) '_rulelist': ['_item', '_rulelist _item'], - '_item': ['rule', '_C_NL' ], + '_item': ['rule', 'import', '_C_NL' ], # 'import' is nonstandard extension # There are some assumptions in rule for 'rule' # @@ -123,6 +133,18 @@ 'bin_val': [ 'BIN_VAL' ], 'prose_val': [ 'PROSE_VAL' ], + + # nonstandard extensions to ABNF grammar (%import directive) + 'import': ['_IMPORT _import_path _C_NL', + '_IMPORT _import_path _LPAR name_list _RPAR _C_NL', + ], + '_import_path': ['import_from_lib', 'import_relpath'], + 'import_from_lib': ['_import_args'], + 'import_relpath': ['_DOT _import_args'], + '_import_args': ['RULENAME', '_import_args _DOT RULENAME'], + + 'name_list': ['_name_list'], + '_name_list': ['RULENAME', '_name_list _COMMA RULENAME'], } @@ -529,6 +551,17 @@ def _is_terminal(self, tree): assert False, tree + def _remove_unused(self, used): + def rule_dependencies(symbol): + try: + tree = self._definitions[symbol] + except KeyError: + return [] + return _find_used_symbols(tree) + + _used = set(bfs(used, rule_dependencies)) + self._definitions = {k: v for k, v in self._definitions.items() if k in _used} + def _define(self, name, oper, exp): if name in self._definitions: if oper == '=/': @@ -559,12 +592,97 @@ def _unpack_definition(self, tree): return rulename, oper, rule_elements + def _unpack_import(self, stmt, grammar_name): + if len(stmt.children) > 1: + path_node, name_list = stmt.children + rules_to_import = [n.value for n in name_list.children] + else: + path_node, = stmt.children + rules_to_import = _ALL_RULES + + # '%import topdir.subdir.file' --> dotted_path=['topdir','subdir','file'] + dotted_path = tuple(path_node.children) + + if path_node.data == 'import_from_lib': # Import from lark/grammars/ + base_path = None + else: # Relative import + if grammar_name == '': # Import relative to script file path if grammar is coded in script + try: + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None + else: + base_file = grammar_name # Import relative to grammar file path if external grammar file + if base_file: + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + + return dotted_path, base_path, rules_to_import + + def do_import(self, dotted_path, base_path, rules_to_import): + + assert dotted_path + grammar_path = os.path.join(*dotted_path) + ABNF_EXT + + to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: + if callable(source): + joined_path, text = source(base_path, grammar_path) + else: + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + except IOError: + continue + else: + h = hashlib.md5(text.encode('utf8')).hexdigest() + if self.used_files.get(joined_path, h) != h: + raise RuntimeError("Grammar file was changed during importing") + self.used_files[joined_path] = h + + gb = ABNFGrammarBuilder(self.global_keep_all_tokens, 
self.import_paths, self.used_files) + gb.load_grammar(text, joined_path) + if rules_to_import != _ALL_RULES: + gb._remove_unused(rules_to_import) + + for name in gb._definitions: + if name in self._definitions: + raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) + + self._definitions.update(**gb._definitions) + break + else: + # Search failed. Make Python throw a nice error. + open(grammar_path, encoding='utf8') + assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) + def load_grammar(self, grammar_text, grammar_name=""): tree = _parse_abnf_grammar(grammar_text, grammar_name) + imports = {} + for stmt in tree.children: + if stmt.data == 'import': + dotted_path, base_path, rules_to_import = self._unpack_import(stmt, grammar_name) + try: + import_base_path, import_rules = imports[dotted_path] + assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) + import_rules.update(rules_to_import) + except KeyError: + imports[dotted_path] = base_path, rules_to_import + + for dotted_path, (base_path, rules_to_import) in imports.items(): + self.do_import(dotted_path, base_path, rules_to_import) + for stmt in tree.children: if stmt.data == 'rule': self._define(*self._unpack_definition(stmt)) + elif stmt.data == 'import': + pass else: assert False, stmt From d3a925d0fe1ed726bec6d99e2385314d91c6537a Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 23:00:30 +0900 Subject: [PATCH 4/7] added the ABNF core rules, defined in RFC5234 appendix B.1, to the standard library. --- lark/grammars/core-rules.abnf | 39 +++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 lark/grammars/core-rules.abnf diff --git a/lark/grammars/core-rules.abnf b/lark/grammars/core-rules.abnf new file mode 100644 index 00000000..7ccf69f5 --- /dev/null +++ b/lark/grammars/core-rules.abnf @@ -0,0 +1,39 @@ +; ABNF Core Rules (RFC5234 Appendix.B) + +ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +BIT = "0" / "1" +CHAR = %x01-7F + ; any 7-bit US-ASCII character, + ; excluding NUL +CR = %x0D + ; carriage return +CRLF = CR LF + ; Internet standard newline +CTL = %x00-1F / %x7F + ; controls +DIGIT = %x30-39 + ; 0-9 +DQUOTE = %x22 + ; " (Double Quote) +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" +HTAB = %x09 + ; horizontal tab +LF = %x0A + ; linefeed +LWSP = *(WSP / CRLF WSP) + ; Use of this linear-white-space rule + ; permits lines containing only white + ; space that are no longer legal in + ; mail headers and have caused + ; interoperability problems in other + ; contexts. + ; Do not use when defining mail + ; headers and use with caution in + ; other contexts. 
+OCTET = %x00-FF + ; 8 bits of data +SP = %x20 +VCHAR = %x21-7E + ; visible (printing) characters +WSP = SP / HTAB + ; white space From ef29a28b2c920276148cc9de219ded54298a8df0 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Mon, 4 Oct 2021 23:09:10 +0900 Subject: [PATCH 5/7] Added tests for %import directive in ABNF grammar --- tests/grammars/ab.abnf | 8 ++++++++ tests/test_grammar_abnf.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/grammars/ab.abnf diff --git a/tests/grammars/ab.abnf b/tests/grammars/ab.abnf new file mode 100644 index 00000000..9b0f97c4 --- /dev/null +++ b/tests/grammars/ab.abnf @@ -0,0 +1,8 @@ +startab = expr + +expr = A B + / A expr B + +A = "a" +B = "b" + diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py index fc9bced1..b3314113 100644 --- a/tests/test_grammar_abnf.py +++ b/tests/test_grammar_abnf.py @@ -214,5 +214,35 @@ def test_errors(self): else: assert False, "example did not raise an error" + def test_import_from_custom_sources(self): + custom_loader = FromPackageLoader('tests', ('grammars', )) + g1 = ('start = startab \n' + '%import ab\n') + p = Lark(g1, syntax='abnf', start='start', import_paths=[custom_loader]) + self.assertEqual(p.parse('ab'), + Tree('start', [Tree('startab', [Tree('expr', [Token('A', 'a'), Token('B', 'b')])])])) + + def test_import(self): + g1 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n' + '%import core-rules\n') + # grammar error is expected since DIGIT is defined twice in both g1 and core-rules.abnf + self.assertRaises(GrammarError, Lark, g1, syntax='abnf') + + g2 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n' + 'UALPHA = %x41-5A \n' + 'LALPHA = %x61-7A \n' + 'DIGIT = %x30-39\n' + '%import core-rules ( CRLF )\n') + # g2 is okay since only rule 'CRLF' is imported but 'DIGITS' is not + p = Lark(g2, syntax='abnf') + self.assertEqual(p.parse('aA1\r\n'), + Tree('start', [Token('LALPHA', 'a'), Token('UALPHA', 'A'), + Token('DIGIT', '1'), + Tree('CRLF', [Token('CR', '\r'), Token('LF', '\n')])])) + + if __name__ == '__main__': main() From 51ba8a4ebff97e0b52f18ddf7149259a1772ad38 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Tue, 12 Oct 2021 19:16:01 +0900 Subject: [PATCH 6/7] Added 'abnf_alias' decorator for transformer and visitor classes --- lark/__init__.py | 1 + lark/visitors.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/lark/__init__.py b/lark/__init__.py index 609cfc7f..d494c9a1 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,6 +1,7 @@ from .utils import logger from .tree import Tree from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive +from .visitors import abnf_alias from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError) from .lexer import Token diff --git a/lark/visitors.py b/lark/visitors.py index ae7930c0..cc18df66 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -515,6 +515,29 @@ def _visitor_args_dec(obj): return _visitor_args_dec +class abnf_alias: + """ + A decorator to make aliases for public methods such that underscores in their names + changed to hyphens. + (e.g an alias method "self.foo-bar(..)" is created for "self.foo_bar(..)". ) + + This is required to support ABNF grammar since hyphens are allowed in ABNF rules but not in + python method names. 
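+
+    A sketch of intended use (mirroring examples/url_parser_abnf.py; the class
+    name below is illustrative):
+
+        @abnf_alias
+        class PctDecoder(Transformer):
+            def pct_encoded(self, items):  # also reachable as 'pct-encoded'
+                ...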
+ """ + def __init__( self, cls ): + self._cls = cls + + for name, func in getmembers(cls): + if name.startswith('_'): + continue + if callable(getattr(cls, name)) and name.find('_') > 0: + alias = name.replace('_', '-') + setattr(cls, alias, func) + + def __call__( self, *args, **kwargs ): + instance = self._cls( *args, **kwargs ) + return instance + ###} From 18f91cc26add8ff5a6d6908cbc2f33bcc49d9e12 Mon Sep 17 00:00:00 2001 From: Takahide Higuchi Date: Tue, 12 Oct 2021 19:18:11 +0900 Subject: [PATCH 7/7] Added example of creating parser using ABNF grammar --- examples/grammars/rfc3986.abnf | 87 ++++++++++++++++++++++++ examples/url_parser_abnf.py | 120 +++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 examples/grammars/rfc3986.abnf create mode 100644 examples/url_parser_abnf.py diff --git a/examples/grammars/rfc3986.abnf b/examples/grammars/rfc3986.abnf new file mode 100644 index 00000000..d127daf7 --- /dev/null +++ b/examples/grammars/rfc3986.abnf @@ -0,0 +1,87 @@ +; ABNF grammar from RFC3986 +; Uniform Resource Identifier (URI): Generic Syntax +; +; some terminals (e.g. DIGIT, ALPHA, ..) is defined in ABNF core rules in RFC5234. +; + +URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + +hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty + +URI-reference = URI / relative-ref + +absolute-URI = scheme ":" hier-part [ "?" query ] + +relative-ref = relative-part [ "?" query ] [ "#" fragment ] + +relative-part = "//" authority path-abempty + / path-absolute + / path-noscheme + / path-empty + +scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + +authority = [ userinfo "@" ] host [ ":" port ] +userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +host = IP-literal / IPv4address / reg-name +port = *DIGIT + +IP-literal = "[" ( IPv6address / IPvFuture ) "]" +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + +IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + +h16 = 1*4HEXDIG +ls32 = ( h16 ":" h16 ) / IPv4address +IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet + +dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + +reg-name = *( unreserved / pct-encoded / sub-delims ) + +path = path-abempty ; begins with "/" or is empty + / path-absolute ; begins with "/" but not "//" + / path-noscheme ; begins with a non-colon segment + / path-rootless ; begins with a segment + / path-empty ; zero characters + +path-abempty = *( "/" segment ) +path-absolute = "/" [ segment-nz *( "/" segment ) ] +path-noscheme = segment-nz-nc *( "/" segment ) +path-rootless = segment-nz *( "/" segment ) +path-empty = 0 + + +segment = *pchar +segment-nz = 1*pchar +segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) + ; non-zero-length segment without any colon ":" + +pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + +query = *( pchar / "/" / "?" ) +fragment = *( pchar / "/" / "?" ) + +pct-encoded = "%" HEXDIG HEXDIG + +unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +reserved = gen-delims / sub-delims +gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +sub-delims = "!" 
/ "$" / "&" / "'" / "(" / ")" + / "*" / "+" / "," / ";" / "=" + diff --git a/examples/url_parser_abnf.py b/examples/url_parser_abnf.py new file mode 100644 index 00000000..9a515c1a --- /dev/null +++ b/examples/url_parser_abnf.py @@ -0,0 +1,120 @@ +""" +Creating URL Parser from ABNF grammar in internet standards (RFCs) +================================================================== + +Usage: + python3 -m examples.url_parser_abnf https://github.com/lark%2Dparser/lark + python3 -m examples.url_parser_abnf http://user@127.0.0.1:8000/index.html + +It outputs parse tree for an URI passed as first argument. + +""" +import sys + +from lark import Lark, Transformer, v_args, abnf_alias, Token, Visitor, Tree + +grammar_in_abnf =""" +%import .grammars.rfc3986 ; import from ./grammars/rfc3986.abnf +%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf +""" + + +class SimplifyABNFTree_Visitor(Visitor): + def __init__(self, unwrap_children=(), keep=(), *args, **kwargs): + super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs) + self.unwrap = unwrap_children + self.keep = keep + + def visit(self, tree: Tree) -> Tree: + # override self.visit(), since _unwrap_and_flatten() assumes top-down visitor + self.visit_topdown(tree) + + def _unwrap_and_flatten(self, tree, unwrap_recursive=False): + """ a generator to flatten tree into list or tuple """ + do_unwrap = True if tree.data in self.unwrap or unwrap_recursive else False + + for x in tree.children: + if isinstance(x, Tree) and do_unwrap: + if x.data in self.keep: + yield self._concat_tokens(x, unwrap_recursive=True) + else: + for item in list(self._unwrap_and_flatten(x, unwrap_recursive=True)): + yield item + elif isinstance(x, Token): + yield x + else: + yield x + + + def _concat_tokens(self, tree, unwrap_recursive=False): + """ concatenate multiple tokens in tree.children into single token. + leave it as it is if there is a tree in tree.children. + """ + items = [None] + words = [] + children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive)) + + for x in children: + if isinstance(x, Token): + words.append(x.value) + if not isinstance(items[-1], Token): + items.append(x) + else: + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + items.append(x) + words=[] + + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + + tree.children = items[1:] + return tree; + + def __default__(self, tree): + return self._concat_tokens(tree) + + +@abnf_alias +class pct_encoded_conv(Transformer): + def pct_encoded(self, items): # alias for pct-encoded + # items = "%" HEXDIG HEXDIG + + # extract hexadecimal digits, convert it to a character, + # then return modified token + char_in_hex = ''.join((items[1].children[0], items[2].children[0])) + char_ = bytearray.fromhex(char_in_hex).decode() + token = items[0].update(value=char_) + return token + +def main(): + url = sys.argv[1] + + url_parser = Lark(grammar_in_abnf, + # using ABNF grammar + syntax='abnf', + start='URI', + # use earley parser since RFC3986 is too complex for LALR. + parser='earley', + # usually needed to set keep_all_tokens=True when ABNF grammar is used. + keep_all_tokens=True, + ) + tree = url_parser.parse(url) + + # Convert pct-encoded (e.g. 
+    #   '%2D' in the given URL) to ASCII characters
+    transformer = pct_encoded_conv()
+    tree = transformer.transform(tree)
+
+    # We need some post-processing to unwrap unwanted tree nodes and to concatenate
+    # ABNF tokens into the tokens we actually want, since ABNF grammars in RFCs
+    # tend to split the input into very small units, often single characters.
+    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
+              'path-abempty', 'path-absolute', 'path-noscheme', 'path-rootless')
+    simplifier = SimplifyABNFTree_Visitor(unwrap_children=unwrap)
+    simplifier.visit(tree)
+
+    print(tree.pretty())
+
+
+if __name__ == '__main__':
+    main()