-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexical_analysis.py
417 lines (372 loc) · 15.3 KB
/
lexical_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#-*- coding: utf-8 -*-
from collections import defaultdict
from graphviz import Digraph
star = '*'
line = '|'
dot = '·'
leftBracket, rightBracket = '(', ')'
alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)] + \
[chr(i) for i in range(ord('a'), ord('z') + 1)] + \
[chr(i) for i in range(ord('0'), ord('9') + 1)]
epsilon = 'ε'
class FA:
def __init__(self, symbol = set([])):
self.states = set()
self.symbol = symbol # input symbol 输入符号表
self.transitions = defaultdict(defaultdict)
self.startstate = None
self.finalstates = []
def setStart(self, state):
self.startstate = state
self.states.add(state)
def addFinal(self, state):
if isinstance(state, int):
state = [state]
for s in state:
if s not in self.finalstates:
self.finalstates.append(s)
def addTransition(self, fromstate, tostate, inputch): # add only one 仅添加一条映射关系
if isinstance(inputch, str):
inputch = set([inputch])
self.states.add(fromstate)
self.states.add(tostate)
if fromstate in self.transitions and tostate in self.transitions[fromstate]:
self.transitions[fromstate][tostate] = \
self.transitions[fromstate][tostate].union(inputch)
else:
self.transitions[fromstate][tostate] = inputch
def addTransition_dict(self, transitions): # add dict to dict 将一个字典的内容添加到另一个字典
for fromstate, tostates in transitions.items():
for state in tostates:
self.addTransition(fromstate, state, tostates[state])
def newBuildFromNumber(self, startnum):
# change the states' representing number to start with the given startnum
# 改变各状态的表示数字,使之从 startnum 开始
translations = {}
for i in self.states:
translations[i] = startnum
startnum += 1
rebuild = FA(self.symbol)
rebuild.setStart(translations[self.startstate])
rebuild.addFinal(translations[self.finalstates[0]])
# 多个终结状态不方便合并操作, 同时提供的合并操作可以保证只产生一个终结状态
for fromstate, tostates in self.transitions.items():
for state in tostates:
rebuild.addTransition(translations[fromstate], translations[state], tostates[state])
return [rebuild, startnum]
def newBuildFromEqualStates(self, equivalent, pos):
# change states' number after merging
# 在最小化合并状态后修改状态的表示数字
rebuild = FA(self.symbol)
for fromstate, tostates in self.transitions.items():
for state in tostates:
rebuild.addTransition(pos[fromstate], pos[state], tostates[state])
rebuild.setStart(pos[self.startstate])
for s in self.finalstates:
rebuild.addFinal(pos[s])
return rebuild
def getEpsilonClosure(self, findstate):
allstates = set()
states = [findstate]
while len(states):
state = states.pop()
allstates.add(state)
if state in self.transitions:
for tos in self.transitions[state]:
if epsilon in self.transitions[state][tos] and \
tos not in allstates:
states.append(tos)
return allstates
def getMove(self, state, skey):
if isinstance(state, int):
state = [state]
trstates = set()
for st in state:
if st in self.transitions:
for tns in self.transitions[st]:
if skey in self.transitions[st][tns]:
trstates.add(tns)
return trstates
def display(self, fname, pname):
fa = Digraph(pname, filename = fname, format = 'png')
fa.attr(rankdir='LR')
fa.attr('node', shape = 'doublecircle')
for fst in self.finalstates:
fa.node('s' + str(fst))
fa.attr('node', shape = 'circle')
for fromstate, tostates in self.transitions.items():
for state in tostates:
tmp = ''
for s in tostates[state]:
tmp += s + '|'
fa.edge('s' + str(fromstate), 's' + str(state), label = tmp[:-1])
fa.attr('node', shape = 'point')
fa.edge('', 's' + str(self.startstate))
fa.view()
class Regex2NFA:
def __init__(self, regex):
self.regex = regex
self.buildNFA()
def displayNFA(self):
self.nfa.display('nfa.gv', 'nondeterministic_finite_state_machine')
@staticmethod
def getPriority(op):
if op == line:
return 1
elif op == dot:
return 2
elif op == star:
return 3
else: # left bracket 左括号
return 0
@staticmethod
def basicstruct(inputch): # Regex = a -> NFA
state1 = 1
state2 = 2
basic = FA(set([inputch]))
basic.setStart(state1)
basic.addFinal(state2)
basic.addTransition(state1, state2, inputch)
return basic
@staticmethod
def linestruct(a, b): # Regex = a | b -> NFA
[a, m1] = a.newBuildFromNumber(2)
[b, m2] = b.newBuildFromNumber(m1)
state1 = 1
state2 = m2
lineFA = FA(a.symbol.union(b.symbol))
lineFA.setStart(state1)
lineFA.addFinal(state2)
lineFA.addTransition(lineFA.startstate, a.startstate, epsilon)
lineFA.addTransition(lineFA.startstate, b.startstate, epsilon)
lineFA.addTransition(a.finalstates[0], lineFA.finalstates[0], epsilon)
lineFA.addTransition(b.finalstates[0], lineFA.finalstates[0], epsilon)
lineFA.addTransition_dict(a.transitions)
lineFA.addTransition_dict(b.transitions)
return lineFA
@staticmethod
def dotstruct(a, b): # Regex = a · b -> NFA
[a, m1] = a.newBuildFromNumber(1)
[b, m2] = b.newBuildFromNumber(m1)
state1 = 1
state2 = m2 - 1
dotFA = FA(a.symbol.union(b.symbol))
dotFA.setStart(state1)
dotFA.addFinal(state2)
dotFA.addTransition(a.finalstates[0], b.startstate, epsilon)
dotFA.addTransition_dict(a.transitions)
dotFA.addTransition_dict(b.transitions)
return dotFA
@staticmethod
def starstruct(a): # Regex = a* -> NFA
[a, m1] = a.newBuildFromNumber(2)
state1 = 1
state2 = m1
starFA = FA(a.symbol)
starFA.setStart(state1)
starFA.addFinal(state2)
starFA.addTransition(starFA.startstate, a.startstate, epsilon)
starFA.addTransition(starFA.startstate, starFA.finalstates[0], epsilon)
starFA.addTransition(a.finalstates[0], starFA.finalstates[0], epsilon)
starFA.addTransition(a.finalstates[0], a.startstate, epsilon)
starFA.addTransition_dict(a.transitions)
return starFA
def buildNFA(self):
tword = ''
pre = ''
symbol = set()
# explicitly add dot to the expression 显式地为正则式添加连接符
for ch in self.regex:
if ch in alphabet:
symbol.add(ch)
if ch in alphabet or ch == leftBracket:
if pre != dot and (pre in alphabet or pre in [star, rightBracket]):
tword += dot
tword += ch
pre = ch
self.regex = tword
# convert infix expression to postfix expression 将中缀表达式转换为后缀表达式
tword = ''
stack = []
for ch in self.regex:
if ch in alphabet:
tword += ch
elif ch == leftBracket:
stack.append(ch)
elif ch == rightBracket:
while(stack[-1] != leftBracket):
tword += stack[-1]
stack.pop()
stack.pop() # pop left bracket 弹出左括号
else:
while(len(stack) and Regex2NFA.getPriority(stack[-1]) >= Regex2NFA.getPriority(ch)):
tword += stack[-1]
stack.pop()
stack.append(ch)
while(len(stack) > 0):
tword += stack.pop()
self.regex = tword
# build ε-NFA from postfix expression 由后缀表达式构建ε-NFA
self.automata = []
for ch in self.regex:
if ch in alphabet:
self.automata.append(Regex2NFA.basicstruct(ch))
elif ch == line:
b = self.automata.pop()
a = self.automata.pop()
self.automata.append(Regex2NFA.linestruct(a, b))
elif ch == dot:
b = self.automata.pop()
a = self.automata.pop()
self.automata.append(Regex2NFA.dotstruct(a, b))
elif ch == star:
a = self.automata.pop()
self.automata.append(Regex2NFA.starstruct(a))
self.nfa = self.automata.pop()
self.nfa.symbol = symbol
class NFA2DFA:
def __init__(self, nfa):
self.buildDFA(nfa)
def displayDFA(self):
self.dfa.display('dfa.gv', 'deterministic_finite_state_machine')
def displayminDFA(self):
self.minDFA.display('mindfa.gv', 'min_deterministic_finite_state_machine')
def buildDFA(self, nfa): # subset method 子集法
allstates = dict() # visited subset
eclosure = dict() # every state's ε-closure
state1 = nfa.getEpsilonClosure(nfa.startstate)
eclosure[nfa.startstate] = state1
cnt = 1 # the number of subset, dfa state id
dfa = FA(nfa.symbol)
dfa.setStart(cnt)
states = [[state1, dfa.startstate]] # unvisit
allstates[cnt] = state1
cnt += 1
while len(states):
[state, fromindex] = states.pop()
for ch in dfa.symbol:
trstates = nfa.getMove(state, ch)
for s in list(trstates): # 转化为list, 相当于使用了一个临时变量
if s not in eclosure:
eclosure[s] = nfa.getEpsilonClosure(s)
trstates = trstates.union(eclosure[s])
if len(trstates):
if trstates not in allstates.values():
states.append([trstates, cnt])
allstates[cnt] = trstates
toindex = cnt
cnt += 1
else:
toindex = [k for k, v in allstates.items() if v == trstates][0]
dfa.addTransition(fromindex, toindex, ch)
for value, state in allstates.items():
if nfa.finalstates[0] in state:
dfa.addFinal(value)
self.dfa = dfa
@staticmethod
def reNumber(states, pos): # renumber the sets' number begin from 1
cnt = 1
change = dict()
for st in states:
if pos[st] not in change:
change[pos[st]] = cnt
cnt += 1
pos[st] = change[pos[st]]
def minimise(self): # segmentation 分割法, 生成新的状态集合
states = list(self.dfa.states)
tostate = dict(set()) # Move(every state, every symbol)
# initialization 预处理出每个状态经一个输入符号可以到达的下一个状态
for st in states:
for sy in self.dfa.symbol:
if st in tostate:
if sy in tostate[st]:
tostate[st][sy] = tostate[st][sy].union(self.dfa.getMove(st, sy))
else:
tostate[st][sy] = self.dfa.getMove(st, sy)
else:
tostate[st] = {sy : self.dfa.getMove(st, sy)}
if len(tostate[st][sy]):
tostate[st][sy] = tostate[st][sy].pop()
else:
tostate[st][sy] = 0
equal = dict() # state sets 不同分组的状态集合
pos = dict() # record the set which state belongs to 记录状态对应的分组
# initialization 2 sets, nonterminal states and final states
equal = {1: set(), 2: set()}
for st in states:
if st not in self.dfa.finalstates:
equal[1] = equal[1].union(set([st]))
pos[st] = 1
for fst in self.dfa.finalstates:
equal[2] = equal[2].union(set([fst]))
pos[fst] = 2
unchecked = []
cnt = 3 # the number of sets
unchecked.extend([[equal[1], 1], [equal[2], 2]])
while len(unchecked):
[equalst, id] = unchecked.pop()
for sy in self.dfa.symbol:
diff = dict()
for st in equalst:
if tostate[st][sy] == 0:
if 0 in diff:
diff[0].add(st)
else:
diff[0] = set([st])
else:
if pos[tostate[st][sy]] in diff:
diff[pos[tostate[st][sy]]].add(st)
else:
diff[pos[tostate[st][sy]]] = set([st])
if len(diff) > 1:
for k, v in diff.items():
if k:
for i in v:
equal[id].remove(i)
if cnt in equal:
equal[cnt] = equal[cnt].union(set([i]))
else:
equal[cnt] = set([i])
if len(equal[id]) == 0:
equal.pop(id)
for i in v:
pos[i] = cnt
unchecked.append([equal[cnt], cnt])
cnt += 1
break
if len(equal) == len(states):
self.minDFA = self.dfa
else:
NFA2DFA.reNumber(states, pos)
self.minDFA = self.dfa.newBuildFromEqualStates(equal, pos)
def Analysis(self, string):
string = string.replace('@', epsilon)
curst = self.dfa.startstate
for ch in string:
if ch == epsilon:
continue
st = list(self.dfa.getMove(curst, ch))
if len(st) == 0:
return False
curst = st[0]
if curst in self.dfa.finalstates:
return True
return False
if __name__ == '__main__':
# using example
regex = input('Please input the regex: ')
a = Regex2NFA(regex)
a.displayNFA()
b = NFA2DFA(a.nfa)
b.displayDFA()
b.minimise()
b.displayminDFA()
while True:
try:
s = input('Please input a word to analysis (take @ as ε): ')
if b.Analysis(s):
print('Accepted')
else:
print('Unaccepted')
except EOFError:
break