Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ABNF grammar support #1017

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions examples/grammars/rfc3986.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; ABNF grammar from RFC3986
; Uniform Resource Identifier (URI): Generic Syntax
;
; some terminals (e.g. DIGIT, ALPHA, ...) are defined in the ABNF core rules in RFC5234.
;

URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

hier-part = "//" authority path-abempty
/ path-absolute
/ path-rootless
/ path-empty

URI-reference = URI / relative-ref

absolute-URI = scheme ":" hier-part [ "?" query ]

relative-ref = relative-part [ "?" query ] [ "#" fragment ]

relative-part = "//" authority path-abempty
/ path-absolute
/ path-noscheme
/ path-empty

scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )

authority = [ userinfo "@" ] host [ ":" port ]
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
host = IP-literal / IPv4address / reg-name
port = *DIGIT

IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )

IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"

h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet

dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255

reg-name = *( unreserved / pct-encoded / sub-delims )

path = path-abempty ; begins with "/" or is empty
/ path-absolute ; begins with "/" but not "//"
/ path-noscheme ; begins with a non-colon segment
/ path-rootless ; begins with a segment
/ path-empty ; zero characters

path-abempty = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty = 0<pchar>


segment = *pchar
segment-nz = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
; non-zero-length segment without any colon ":"

pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )

pct-encoded = "%" HEXDIG HEXDIG

unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="

120 changes: 120 additions & 0 deletions examples/url_parser_abnf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Creating URL Parser from ABNF grammar in internet standards (RFCs)
==================================================================

Usage:
python3 -m examples.url_parser_abnf https://github.com/lark%2Dparser/lark
python3 -m examples.url_parser_abnf http://[email protected]:8000/index.html

It outputs the parse tree for a URI passed as the first argument.

"""
import sys

from lark import Lark, Transformer, v_args, abnf_alias, Token, Visitor, Tree

# ABNF grammar source: imports the RFC 3986 URI grammar shipped next to this
# example, plus the RFC 5234 core rules (DIGIT, ALPHA, HEXDIG, ...) it relies on.
grammar_in_abnf ="""
%import .grammars.rfc3986 ; import from ./grammars/rfc3986.abnf
%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf
"""


class SimplifyABNFTree_Visitor(Visitor):
    """Post-process a parse tree produced from an ABNF grammar.

    ABNF grammars in RFCs split the input into per-character tokens. For each
    rule name listed in *unwrap_children*, this visitor flattens the rule's
    subtrees and concatenates runs of adjacent tokens into a single token, so
    the resulting tree is readable. Rule names listed in *keep* are preserved
    as (concatenated) subtrees instead of being flattened away.
    """

    def __init__(self, unwrap_children=(), keep=(), *args, **kwargs):
        super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs)
        self.unwrap = unwrap_children   # rule names whose children get flattened
        self.keep = keep                # rule names kept as subtrees while flattening

    def visit(self, tree: Tree) -> Tree:
        # Override Visitor.visit(): _unwrap_and_flatten() assumes a top-down
        # traversal order. Fix: return the tree as the annotation promises
        # (the original returned None), so callers may chain on the result.
        self.visit_topdown(tree)
        return tree

    def _unwrap_and_flatten(self, tree, unwrap_recursive=False):
        """Yield ``tree.children`` as a flat sequence of tokens and subtrees.

        Subtrees are flattened recursively once unwrapping has started, except
        those whose rule name is in ``self.keep`` — they are concatenated in
        place and yielded as subtrees.
        """
        do_unwrap = tree.data in self.unwrap or unwrap_recursive

        for child in tree.children:
            if isinstance(child, Tree) and do_unwrap:
                if child.data in self.keep:
                    yield self._concat_tokens(child, unwrap_recursive=True)
                else:
                    yield from self._unwrap_and_flatten(child, unwrap_recursive=True)
            else:
                # Tokens, and subtrees of rules we are not unwrapping,
                # pass through unchanged.
                yield child

    def _concat_tokens(self, tree, unwrap_recursive=False):
        """Concatenate runs of adjacent tokens in ``tree.children`` into one token.

        Subtrees encountered between token runs are left in place; the tokens
        on either side of a subtree are merged separately.
        """
        items = [None]   # sentinel so items[-1] is always valid below
        words = []       # values of the current run of adjacent tokens
        children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive))

        for x in children:
            if isinstance(x, Token):
                words.append(x.value)
                # Keep only the first token of a run; later ones are folded
                # into it when the run ends.
                if not isinstance(items[-1], Token):
                    items.append(x)
            else:
                if len(words) > 1:
                    items[-1] = items[-1].update(value=''.join(words))
                items.append(x)
                words = []

        # Close out a token run that reaches the end of the children.
        if len(words) > 1:
            items[-1] = items[-1].update(value=''.join(words))

        tree.children = items[1:]   # drop the sentinel
        return tree

    def __default__(self, tree):
        return self._concat_tokens(tree)


@abnf_alias
class pct_encoded_conv(Transformer):
    """Decode percent-encoded triplets ("%" HEXDIG HEXDIG) into characters."""

    def pct_encoded(self, items):  # alias for pct-encoded
        # items = "%" HEXDIG HEXDIG
        # Join the two hexadecimal digit tokens, decode the resulting byte,
        # and return a single token carrying the decoded character.
        hex_digits = items[1].children[0] + items[2].children[0]
        decoded_char = bytes.fromhex(hex_digits).decode()
        return items[0].update(value=decoded_char)

def main():
    """Parse the URI given on the command line and print its parse tree."""
    url = sys.argv[1]

    url_parser = Lark(
        grammar_in_abnf,
        syntax='abnf',          # the grammar above is written in ABNF
        start='URI',
        parser='earley',        # RFC3986 is too complex for LALR
        keep_all_tokens=True,   # usually needed when an ABNF grammar is used
    )
    tree = url_parser.parse(url)

    # Convert pct-encoded sequences (e.g. '%2D' in the given URL) back to
    # ASCII characters.
    tree = pct_encoded_conv().transform(tree)

    # Post-processing: ABNF grammars in RFCs split every input into very small
    # units (often single characters), so unwrap the listed rules and
    # concatenate their tokens into the tokens we actually want.
    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
              'path-abempty', 'path-absolute', 'path-noscheme', 'path-rootless')
    SimplifyABNFTree_Visitor(unwrap_children=unwrap).visit(tree)

    print(tree.pretty())


if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions lark/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .utils import logger
from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
from .visitors import abnf_alias
from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)
from .lexer import Token
Expand Down
39 changes: 39 additions & 0 deletions lark/grammars/core-rules.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; ABNF Core Rules (RFC5234 Appendix.B)

ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
BIT = "0" / "1"
CHAR = %x01-7F
; any 7-bit US-ASCII character,
; excluding NUL
CR = %x0D
; carriage return
CRLF = CR LF
; Internet standard newline
CTL = %x00-1F / %x7F
; controls
DIGIT = %x30-39
; 0-9
DQUOTE = %x22
; " (Double Quote)
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
HTAB = %x09
; horizontal tab
LF = %x0A
; linefeed
LWSP = *(WSP / CRLF WSP)
; Use of this linear-white-space rule
; permits lines containing only white
; space that are no longer legal in
; mail headers and have caused
; interoperability problems in other
; contexts.
; Do not use when defining mail
; headers and use with caution in
; other contexts.
OCTET = %x00-FF
; 8 bits of data
SP = %x20
VCHAR = %x21-7E
; visible (printing) characters
WSP = SP / HTAB
; white space
17 changes: 15 additions & 2 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource
from .load_grammar_abnf import load_abnf_grammar, ABNFGrammar
from .tree import Tree
from .common import LexerConf, ParserConf

Expand Down Expand Up @@ -98,6 +99,11 @@ class LarkOptions(Serialize):
Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
tree_class
Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
syntax
Syntax for grammar specification.

- "lark" (default): Lark's EBNF based syntax
    - "abnf" : ABNF syntax, described in RFC5234. Various extensions in Lark's EBNF syntax are not supported.

**=== Algorithm Options ===**

Expand Down Expand Up @@ -169,6 +175,7 @@ class LarkOptions(Serialize):
'use_bytes': False,
'import_paths': [],
'source_path': None,
'syntax': 'lark',
}

def __init__(self, options_dict):
Expand Down Expand Up @@ -326,11 +333,17 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
# In practice the only relevant thing that might have been overriden should be `options`
self.options = old_options

assert_config(self.options.syntax, ('lark', 'abnf'))

# Parse the grammar file and compose the grammars
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
if self.options.syntax == 'lark':
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
elif self.options.syntax == 'abnf':
self.grammar, used_files = load_abnf_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert False, self.options.syntax
else:
assert isinstance(grammar, Grammar)
assert isinstance(grammar, (Grammar, ABNFGrammar))
self.grammar = grammar


Expand Down
16 changes: 14 additions & 2 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,12 @@ def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):

self.allowed_types = {t.name for t in self.terminals}

self.pattern_name_map = {n:'__%d' % i
for i, n in enumerate({t.name for t in self.terminals
if not t.name.isidentifier()})}

self.pattern_name_map_reverse = {altname:n for n, altname in self.pattern_name_map.items()}

self._mres = self._build_mres(terminals, len(terminals))

def _build_mres(self, terminals, max_size):
Expand All @@ -296,7 +302,12 @@ def _build_mres(self, terminals, max_size):
postfix = '$' if self.match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
# Replace terminal name with '__%d' if it is not a valid python identifier.
# otherwise pattern will fail to compile.
pattern = u'|'.join(u'(?P<%s>%s)' % (self.pattern_name_map.get(t.name, t.name),
t.pattern.to_regexp() + postfix)
for t in terminals[:max_size])

if self.use_bytes:
pattern = pattern.encode('latin-1')
try:
Expand All @@ -312,7 +323,8 @@ def match(self, text, pos):
for mre, type_from_index in self._mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]
type_ = type_from_index[m.lastindex]
return m.group(0), self.pattern_name_map_reverse.get(type_, type_)


def _regexp_has_newline(r: str):
Expand Down
Loading