Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ABNF grammar support #1017

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions examples/grammars/rfc3986.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; ABNF grammar from RFC3986
; Uniform Resource Identifier (URI): Generic Syntax
;
; some terminals (e.g. DIGIT, ALPHA, ...) are defined in the ABNF core rules in RFC5234.
;

URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

hier-part = "//" authority path-abempty
/ path-absolute
/ path-rootless
/ path-empty

URI-reference = URI / relative-ref

absolute-URI = scheme ":" hier-part [ "?" query ]

relative-ref = relative-part [ "?" query ] [ "#" fragment ]

relative-part = "//" authority path-abempty
/ path-absolute
/ path-noscheme
/ path-empty

scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )

authority = [ userinfo "@" ] host [ ":" port ]
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
host = IP-literal / IPv4address / reg-name
port = *DIGIT

IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )

IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"

h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet

dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255

reg-name = *( unreserved / pct-encoded / sub-delims )

path = path-abempty ; begins with "/" or is empty
/ path-absolute ; begins with "/" but not "//"
/ path-noscheme ; begins with a non-colon segment
/ path-rootless ; begins with a segment
/ path-empty ; zero characters

path-abempty = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty = 0<pchar>


segment = *pchar
segment-nz = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
; non-zero-length segment without any colon ":"

pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )

pct-encoded = "%" HEXDIG HEXDIG

unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="

120 changes: 120 additions & 0 deletions examples/url_parser_abnf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Creating URL Parser from ABNF grammar in internet standards (RFCs)
==================================================================

Usage:
python3 -m examples.url_parser_abnf https://github.com/lark%2Dparser/lark
python3 -m examples.url_parser_abnf http://[email protected]:8000/index.html

It outputs the parse tree for a URI passed as the first argument.

"""
import sys

from lark import Lark, Transformer, v_args, abnf_alias, Token, Visitor, Tree

# ABNF grammar source: imports the RFC 3986 URI grammar shipped next to this
# example, plus the RFC 5234 core rules (DIGIT, ALPHA, HEXDIG, ...) it relies on.
grammar_in_abnf ="""
%import .grammars.rfc3986 ; import from ./grammars/rfc3986.abnf
%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf
"""


class SimplifyABNFTree_Visitor(Visitor):
    """Post-process a parse tree produced from an ABNF grammar.

    ABNF grammars in RFCs split the input into per-character tokens. For each
    rule name listed in *unwrap_children*, this visitor flattens the rule's
    subtrees and concatenates runs of adjacent tokens into a single token, so
    the resulting tree is readable. Rule names listed in *keep* are preserved
    as (concatenated) subtrees instead of being flattened away.
    """

    def __init__(self, unwrap_children=(), keep=(), *args, **kwargs):
        super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs)
        self.unwrap = unwrap_children   # rule names whose children get flattened
        self.keep = keep                # rule names kept as subtrees while flattening

    def visit(self, tree: Tree) -> Tree:
        # Override Visitor.visit(): _unwrap_and_flatten() assumes a top-down
        # traversal order. Fix: return the tree as the annotation promises
        # (the original returned None), so callers may chain on the result.
        self.visit_topdown(tree)
        return tree

    def _unwrap_and_flatten(self, tree, unwrap_recursive=False):
        """Yield ``tree.children`` as a flat sequence of tokens and subtrees.

        Subtrees are flattened recursively once unwrapping has started, except
        those whose rule name is in ``self.keep`` — they are concatenated in
        place and yielded as subtrees.
        """
        do_unwrap = tree.data in self.unwrap or unwrap_recursive

        for child in tree.children:
            if isinstance(child, Tree) and do_unwrap:
                if child.data in self.keep:
                    yield self._concat_tokens(child, unwrap_recursive=True)
                else:
                    yield from self._unwrap_and_flatten(child, unwrap_recursive=True)
            else:
                # Tokens, and subtrees of rules we are not unwrapping,
                # pass through unchanged.
                yield child

    def _concat_tokens(self, tree, unwrap_recursive=False):
        """Concatenate runs of adjacent tokens in ``tree.children`` into one token.

        Subtrees encountered between token runs are left in place; the tokens
        on either side of a subtree are merged separately.
        """
        items = [None]   # sentinel so items[-1] is always valid below
        words = []       # values of the current run of adjacent tokens
        children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive))

        for x in children:
            if isinstance(x, Token):
                words.append(x.value)
                # Keep only the first token of a run; later ones are folded
                # into it when the run ends.
                if not isinstance(items[-1], Token):
                    items.append(x)
            else:
                if len(words) > 1:
                    items[-1] = items[-1].update(value=''.join(words))
                items.append(x)
                words = []

        # Close out a token run that reaches the end of the children.
        if len(words) > 1:
            items[-1] = items[-1].update(value=''.join(words))

        tree.children = items[1:]   # drop the sentinel
        return tree

    def __default__(self, tree):
        return self._concat_tokens(tree)


@abnf_alias
class pct_encoded_conv(Transformer):
    """Decode percent-encoded triplets ("%" HEXDIG HEXDIG) into characters."""

    def pct_encoded(self, items):  # alias for pct-encoded
        # items = "%" HEXDIG HEXDIG
        # Join the two hexadecimal digit tokens, decode the resulting byte,
        # and return a single token carrying the decoded character.
        hex_digits = items[1].children[0] + items[2].children[0]
        decoded_char = bytes.fromhex(hex_digits).decode()
        return items[0].update(value=decoded_char)

def main():
    """Parse the URI given on the command line and print its parse tree."""
    url = sys.argv[1]

    url_parser = Lark(
        grammar_in_abnf,
        syntax='abnf',          # the grammar above is written in ABNF
        start='URI',
        parser='earley',        # RFC3986 is too complex for LALR
        keep_all_tokens=True,   # usually needed when an ABNF grammar is used
    )
    tree = url_parser.parse(url)

    # Convert pct-encoded sequences (e.g. '%2D' in the given URL) back to
    # ASCII characters.
    tree = pct_encoded_conv().transform(tree)

    # Post-processing: ABNF grammars in RFCs split every input into very small
    # units (often single characters), so unwrap the listed rules and
    # concatenate their tokens into the tokens we actually want.
    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
              'path-abempty', 'path-absolute', 'path-noscheme', 'path-rootless')
    SimplifyABNFTree_Visitor(unwrap_children=unwrap).visit(tree)

    print(tree.pretty())


if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions lark/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .utils import logger
from .tree import Tree
from .visitors import Transformer, Visitor, v_args, Discard, Transformer_NonRecursive
from .visitors import abnf_alias
from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
UnexpectedInput, UnexpectedCharacters, UnexpectedEOF, LarkError)
from .lexer import Token
Expand Down
39 changes: 39 additions & 0 deletions lark/grammars/core-rules.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; ABNF Core Rules (RFC5234 Appendix.B)

ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
BIT = "0" / "1"
CHAR = %x01-7F
; any 7-bit US-ASCII character,
; excluding NUL
CR = %x0D
; carriage return
CRLF = CR LF
; Internet standard newline
CTL = %x00-1F / %x7F
; controls
DIGIT = %x30-39
; 0-9
DQUOTE = %x22
; " (Double Quote)
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
HTAB = %x09
; horizontal tab
LF = %x0A
; linefeed
LWSP = *(WSP / CRLF WSP)
; Use of this linear-white-space rule
; permits lines containing only white
; space that are no longer legal in
; mail headers and have caused
; interoperability problems in other
; contexts.
; Do not use when defining mail
; headers and use with caution in
; other contexts.
OCTET = %x00-FF
; 8 bits of data
SP = %x20
VCHAR = %x21-7E
; visible (printing) characters
WSP = SP / HTAB
; white space
17 changes: 15 additions & 2 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files, PackageResource
from .load_grammar_abnf import load_abnf_grammar, ABNFGrammar
from .tree import Tree
from .common import LexerConf, ParserConf

Expand Down Expand Up @@ -98,6 +99,11 @@ class LarkOptions(Serialize):
Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
tree_class
Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
syntax
Syntax for grammar specification.

- "lark" (default): Lark's EBNF based syntax
    - "abnf" : ABNF syntax, described in RFC5234. Various extensions in Lark's EBNF syntax are not supported.

**=== Algorithm Options ===**

Expand Down Expand Up @@ -169,6 +175,7 @@ class LarkOptions(Serialize):
'use_bytes': False,
'import_paths': [],
'source_path': None,
'syntax': 'lark',
}

def __init__(self, options_dict):
Expand Down Expand Up @@ -326,11 +333,17 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
# In practice the only relevant thing that might have been overriden should be `options`
self.options = old_options

assert_config(self.options.syntax, ('lark', 'abnf'))

# Parse the grammar file and compose the grammars
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
if self.options.syntax == 'lark':
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
elif self.options.syntax == 'abnf':
self.grammar, used_files = load_abnf_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert False, self.options.syntax
else:
assert isinstance(grammar, Grammar)
assert isinstance(grammar, (Grammar, ABNFGrammar))
self.grammar = grammar


Expand Down
16 changes: 14 additions & 2 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,12 @@ def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):

self.allowed_types = {t.name for t in self.terminals}

self.pattern_name_map = {n:'__%d' % i
for i, n in enumerate({t.name for t in self.terminals
if not t.name.isidentifier()})}

self.pattern_name_map_reverse = {altname:n for n, altname in self.pattern_name_map.items()}

self._mres = self._build_mres(terminals, len(terminals))

def _build_mres(self, terminals, max_size):
Expand All @@ -296,7 +302,12 @@ def _build_mres(self, terminals, max_size):
postfix = '$' if self.match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
# Replace terminal name with '__%d' if it is not a valid python identifier.
# otherwise pattern will fail to compile.
pattern = u'|'.join(u'(?P<%s>%s)' % (self.pattern_name_map.get(t.name, t.name),
t.pattern.to_regexp() + postfix)
for t in terminals[:max_size])

if self.use_bytes:
pattern = pattern.encode('latin-1')
try:
Expand All @@ -312,7 +323,8 @@ def match(self, text, pos):
for mre, type_from_index in self._mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]
type_ = type_from_index[m.lastindex]
return m.group(0), self.pattern_name_map_reverse.get(type_, type_)


def _regexp_has_newline(r: str):
Expand Down
Loading