-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimple_recursive_descent_parser.py
280 lines (201 loc) · 7.76 KB
/
simple_recursive_descent_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
"""
A simple recursive descent parser for basic arithmetic using
s-expressions (as in lisp).
e.g.
> (+ 1 2)
3
> (/ (* 2 2) 2)
2.0
> (- 4 3 2)
-1
Note that a language like this uses a prefix (aka Polish) notation
where the operator precedes its operands.
For example while the infix (conventional) notation would be represented as
> 3 + 4
7
prefix notation would be represented as
> (+ 3 4)
7
The operator (binary in this scope) is followed by 2 or more operands.
For example
> (+ 3 4 5)
12
is equivalent to the following in infix notation
> (3 + 4) + 5
12
Any additional operand after the first two implies applying the
operation on the accumulated result from the previous operands with the
additional operand as seen above, until all the operands are exhausted.
We begin with a formal definition of our grammar.
expr ::= NUM
| form
form ::= ( operator operands )
operands ::= expr expr optional_operands
optional_operands ::= expr optional_operands | None
operator ::= +
| -
| *
| /
The definitions on the left are expanded to those on the right.
This reads as:
- An expression is either a number or a form.
- A form is composed of an operator followed by operands enclosed in
parentheses
- Operands are a pair of expressions followed by an optional operand
- An optional operand is an expression followed by another optional
operand, or None. This allows us to accept any additional number of
optional operands
- An operator is one of +, -, /, *
NUM and the operators above are terminal symbols, meaning that they
cannot be expanded any further. The goal is to expand an expression
(recursively if necessary) until we have terminal symbols
that can be evaluated.
More on BNF here: https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form
We then proceed with the implementation below.
"""
import re
import collections
# One named-group regular expression per token type. The group name is
# what later shows up as the match's `lastgroup`, i.e. the token type.
NUM = r"(?P<NUM>\d+)"
PLUS = r"(?P<PLUS>\+)"
MINUS = r"(?P<MINUS>-)"
TIMES = r"(?P<TIMES>\*)"
DIVIDE = r"(?P<DIVIDE>/)"
LEFT_PAREN = r"(?P<LEFT_PAREN>\()"
RIGHT_PAREN = r"(?P<RIGHT_PAREN>\))"
WHITESPACE = r"(?P<WHITESPACE>\s+)"
# Catch-all for any single character none of the patterns above accept.
MISMATCH = r"(?P<MISMATCH>.)"

# A single alternation of all token patterns. Alternatives are tried left
# to right, so MISMATCH must stay last to act only as a fallback.
_TOKEN_ALTERNATION = "|".join((
    NUM,
    PLUS,
    MINUS,
    TIMES,
    DIVIDE,
    LEFT_PAREN,
    RIGHT_PAREN,
    WHITESPACE,
    MISMATCH,
))
master_pattern = re.compile(_TOKEN_ALTERNATION)

# Lightweight record for a token: `type` is the pattern's group name,
# `value` is the matched text.
Token = collections.namedtuple("Token", "type value")
def generate_tokens(text):
    """Yield a Token for every non-whitespace lexeme found in `text`.

    The input is scanned with `master_pattern`. Each match becomes a
    Token whose type is the name of the regex group that matched and
    whose value is the matched text. Whitespace matches are dropped.

    Raises RuntimeError when a match falls through to the MISMATCH
    catch-all. Since this is a generator, the error surfaces when that
    token is requested, not when the function is called.
    """
    for match in master_pattern.finditer(text):
        kind, lexeme = match.lastgroup, match.group()
        if kind == "MISMATCH":
            raise RuntimeError(f"Unexpected token {lexeme}")
        if kind == "WHITESPACE":
            continue
        yield Token(kind, lexeme)
class Evaluator:
    """Recursive descent parser that evaluates prefix arithmetic.

    Provides a method `parse` which, when given an input string,
    tokenizes it with `generate_tokens`, parses it, evaluates it while
    parsing and returns the result.

    >>> evaluator = Evaluator()
    >>> evaluator.parse("(+ 1 2)")
    3

    There is one method per non-terminal of the grammar:
        expr              ::= NUM | form
        form              ::= ( operator operands )
        operands          ::= expr expr optional_operands
        optional_operands ::= expr optional_operands | None
        operator          ::= + | - | * | /

    Key helper methods that are used:
        _advance - Move forward one token
        _accept  - If the next token is of the given type, advance and
                   return True
        _expect  - Advance expecting the next token to be of the given
                   type. If it isn't, raise a syntax error

    Malformed input raises SyntaxError. This includes input that ends
    before the expression is complete and input with tokens left over
    after a complete expression.
    """

    # FIRST(expr): token types that can start an expression.
    _FIRST_EXPR = ("NUM", "LEFT_PAREN")
    # Token types accepted by the `operator` rule, in the order tried.
    _OPERATOR_TYPES = ("PLUS", "MINUS", "TIMES", "DIVIDE")

    def parse(self, input_string):
        """Evaluate `input_string` and return the resulting number.

        Raises SyntaxError if the input is not exactly one well-formed
        expression.
        """
        self.tokens = generate_tokens(input_string)
        # Keep a reference to both the current and the next token so the
        # parser can look ahead one token.
        self.token, self.next_token = None, None
        self._advance()
        # The root of our grammar is expr.
        result = self.expr()
        # A complete expression must consume all of the input; anything
        # left over (e.g. "1 2" or "(+ 1 2))") is an error.
        if self.next_token is not None:
            raise SyntaxError(
                f"Unexpected token {self.next_token.value} "
                "after end of expression"
            )
        return result

    def expr(self):
        """expr ::= NUM | form

        An expression is either a number or a form. Returns its value.
        """
        if self._accept("NUM"):
            return int(self.token.value)
        # Not a number, so descend into form.
        return self.form()

    def form(self):
        """form ::= ( operator operands )

        A form is an operator followed by operands, enclosed in
        parentheses. Returns the operator folded left to right over the
        operands, e.g. (- 4 3 2) is (4 - 3) - 2.
        """
        self._expect("LEFT_PAREN")
        operator = self.operator()
        operands = self.operands()
        # Check the closing parenthesis before evaluating, so a malformed
        # form is reported as a syntax error rather than an arithmetic one.
        self._expect("RIGHT_PAREN")
        # `operands` always holds at least two values.
        first, second, *rest = operands
        result = self._apply(operator, first, second)
        for operand in rest:
            result = self._apply(operator, result, operand)
        return result

    def operator(self):
        """operator ::= + | - | * | /

        Consumes the operator token and returns its text.
        Raises SyntaxError if the next token is not an operator.
        """
        if any(self._accept(kind) for kind in self._OPERATOR_TYPES):
            return self.token.value
        raise SyntaxError("Expected PLUS or MINUS or TIMES or DIVIDE")

    def operands(self):
        """operands ::= expr expr optional_operands

        Two mandatory expressions followed by any number of optional
        ones. Returns the list of their values.
        """
        first = self.expr()
        second = self.expr()
        return [first, second] + self.optional_operands()

    def optional_operands(self):
        """optional_operands ::= expr optional_operands | None

        Returns the values of zero or more further expressions.
        """
        operands = []
        # Keep collecting while the next token is in FIRST(expr). The
        # grammar rule is right-recursive; a loop accepts the same input
        # without one Python stack frame per operand. Reaching the end of
        # input (next_token is None) is the terminal condition too.
        while (
            self.next_token is not None
            and self.next_token.type in self._FIRST_EXPR
        ):
            operands.append(self.expr())
        return operands

    def _apply(self, operator, left, right):
        """Applies operator on the left and right operands.

        "/" is true division, so its result is a float.
        Raises ValueError for an operator other than +, -, * or /.
        """
        if operator == "+":
            return left + right
        if operator == "-":
            return left - right
        if operator == "*":
            return left * right
        if operator == "/":
            return left / right
        raise ValueError(f"Unknown operator {operator}")

    def _advance(self):
        """Advances one token ahead in the input stream.

        `next_token` becomes None once the token stream is exhausted.
        """
        self.token, self.next_token = self.next_token, next(self.tokens, None)

    def _accept(self, token_type):
        """Advances to the next token if it matches the given type.

        Returns True if the token was consumed, False otherwise
        (including when there is no next token).
        """
        upcoming = self.next_token
        if upcoming is not None and upcoming.type == token_type:
            self._advance()
            return True
        return False

    def _expect(self, token_type):
        """Advances to the next token expecting it to be of the given type.

        Raises SyntaxError if the next token is missing or of another type.
        """
        if not self._accept(token_type):
            raise SyntaxError(f"Expected {token_type}")
# Module-level self-check: each (input, expected result) pair below must
# hold for a shared Evaluator instance.
evaluator = Evaluator()
assert all(
    evaluator.parse(text) == expected
    for text, expected in (
        ("1", 1),
        ("(+ 1 2)", 3),
        ("(- 1 2)", -1),
        ("(* 5 2 3)", 30),
        ("(/ 10 2)", 5),
        ("(+ (- 3 5) 2 (- 3 5) 2)", 0),
    )
)