-
Notifications
You must be signed in to change notification settings - Fork 177
/
Copy pathboolean_worlds_td.py
232 lines (207 loc) · 9.57 KB
/
boolean_worlds_td.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python
#MIT License
#Copyright (c) 2017 Massimiliano Patacchiola
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#In this script the TD(0) tabular algorithm is used to estimate the utilities
#of the boolean worlds.
import numpy as np
from gridworld import GridWorld
def init_and():
    '''Init the AND boolean environment

    The world is a 5x5 grid. The four corner cells are flagged 1.0 in the
    state matrix (presumably the terminal states of GridWorld - confirm
    against gridworld.py). As the matrices are written below, only the
    top-right corner carries reward +1.0; the other three corners carry
    reward -1.0.

    @return a tuple (env, utility_matrix) with the configured GridWorld
        object and a 5x5 utility matrix filled with zeros
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix (rows are listed from index 4 down to 0)
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[-1.0, 0.0, 0.0, 0.0, 1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, -1.0]])
    #Define the transition matrix. It has to be a plain 4x4 array like in
    #the other boolean worlds of this file: every row sums to 1.0 and no
    #trailing comma may follow the call (that would wrap the array in a
    #one-element tuple).
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env, np.zeros((5,5))
def init_nand():
    '''Build the NAND boolean world on a 5x5 grid

    The four corners are flagged 1.0 in the state matrix (presumably the
    terminal states of GridWorld - confirm against gridworld.py). In array
    coordinates the corner at row 0, column 4 carries reward -1.0 and the
    remaining three corners carry reward +1.0.

    @return a tuple (env, utility_matrix) with the configured GridWorld
        object and a 5x5 utility matrix filled with zeros
    '''
    size = 5
    last = size - 1
    world = GridWorld(size, size)

    #State matrix: 1.0 on the four corners, 0.0 everywhere else
    states = np.zeros((size, size))
    for row, col in [(0, 0), (0, last), (last, 0), (last, last)]:
        states[row, col] = 1.0

    #Index matrix: (row, col) pairs, with the row index running from
    #4 at the top of the array down to 0 at the bottom
    indices = np.array([[(row, col) for col in range(size)]
                        for row in range(last, -1, -1)])

    #Reward matrix: non-zero only on the corners
    rewards = np.zeros((size, size))
    corner_rewards = {(0, 0): 1.0, (0, last): -1.0,
                      (last, 0): 1.0, (last, last): 1.0}
    for (row, col), value in corner_rewards.items():
        rewards[row, col] = value

    #Transition matrix: 4x4, every row sums to 1.0
    transitions = np.array([
        [0.8, 0.1, 0.0, 0.1],
        [0.1, 0.8, 0.1, 0.0],
        [0.0, 0.1, 0.8, 0.1],
        [0.1, 0.0, 0.1, 0.8],
    ])

    world.setStateMatrix(states)
    world.setIndexMatrix(indices)
    world.setRewardMatrix(rewards)
    world.setTransitionMatrix(transitions)
    return world, np.zeros((size, size))
def init_or():
    '''Build the OR boolean world on a 5x5 grid

    The four corners are flagged 1.0 in the state matrix (presumably the
    terminal states of GridWorld - confirm against gridworld.py). In array
    coordinates the corner at row 4, column 0 carries reward -1.0 and the
    remaining three corners carry reward +1.0.

    @return a tuple (env, utility_matrix) with the configured GridWorld
        object and a 5x5 utility matrix filled with zeros
    '''
    side = 5
    edge = side - 1
    grid = GridWorld(side, side)

    #State matrix: start from zeros and flag the four corners with 1.0
    state_grid = np.zeros((side, side))
    state_grid[0, 0] = 1.0
    state_grid[0, edge] = 1.0
    state_grid[edge, 0] = 1.0
    state_grid[edge, edge] = 1.0

    #Index matrix: (row, col) pairs, first array row holds row index 4
    #and the last array row holds row index 0
    index_rows = []
    for row in range(edge, -1, -1):
        index_rows.append([(row, col) for col in range(side)])
    index_grid = np.array(index_rows)

    #Reward matrix: zero everywhere except on the corners
    reward_grid = np.zeros((side, side))
    reward_grid[0, 0] = 1.0
    reward_grid[0, edge] = 1.0
    reward_grid[edge, 0] = -1.0
    reward_grid[edge, edge] = 1.0

    #Transition matrix: 4x4, every row sums to 1.0
    transition_rows = [[0.8, 0.1, 0.0, 0.1],
                       [0.1, 0.8, 0.1, 0.0],
                       [0.0, 0.1, 0.8, 0.1],
                       [0.1, 0.0, 0.1, 0.8]]
    transition_grid = np.array(transition_rows)

    grid.setStateMatrix(state_grid)
    grid.setIndexMatrix(index_grid)
    grid.setRewardMatrix(reward_grid)
    grid.setTransitionMatrix(transition_grid)
    return grid, np.zeros((side, side))
def init_xor():
    '''Init the XOR boolean environment

    The world is a 5x5 grid. The four corner cells are flagged 1.0 in the
    state matrix (presumably the terminal states of GridWorld - confirm
    against gridworld.py).

    Corner convention, as used by the AND and OR worlds of this file: the
    top-right corner of the arrays stands for the input (1,1) (it is the
    only +1 of the AND world) and the bottom-left corner stands for (0,0)
    (it is the only -1 of the OR world). XOR is false for both of these
    inputs and true for the two mixed ones, hence reward -1.0 on the
    top-right and bottom-left corners and +1.0 on the top-left and
    bottom-right corners.

    @return a tuple (env, utility_matrix) with the configured GridWorld
        object and a 5x5 utility matrix filled with zeros
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix (rows are listed from index 4 down to 0)
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix: +1 on the mixed-input corners, -1 on the
    #(0,0) and (1,1) corners (the opposite signs would encode XNOR)
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix (4x4, every row sums to 1.0)
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env, np.zeros((5,5))
def update_utility(utility_matrix, observation, new_observation,
                   reward, alpha, gamma, done):
    '''Apply one TD(0) update to the utility of the state visited at t

    The matrix is modified in place and also returned.

    @param utility_matrix the matrix before the update
    @param observation the state observed at t, indexable as (row, col)
    @param new_observation the state observed at t+1, indexable as
        (row, col); it is not read when done is true
    @param reward the reward observed after the action
    @param alpha the step size (learning rate)
    @param gamma the discount factor
    @param done when true the target is the reward alone, i.e. the
        utility of the next state is left out of the update
    @return the updated utility matrix (same object as utility_matrix)
    '''
    row, col = observation[0], observation[1]
    current = utility_matrix[row, col]
    if done:
        #No bootstrapping from the next state on the last transition
        td_error = reward - current
    else:
        successor = utility_matrix[new_observation[0], new_observation[1]]
        td_error = reward + gamma * successor - current
    utility_matrix[row, col] += alpha * td_error
    return utility_matrix
def main():
    '''Estimate the AND world utilities with TD(0) and a random policy

    All four boolean worlds are created, but only the AND environment and
    its utility matrix are used by the loop. The current estimate is
    printed, flipped upside-down, on every 1000th epoch and once more
    when the loop is over.
    '''
    env, utility_matrix_and = init_and()
    #The remaining worlds are built here but not used further below
    env_nand, utility_matrix_nand = init_nand()
    env_or, utility_matrix_or = init_or()
    env_xor, utility_matrix_xor = init_xor()

    gamma = 0.999
    alpha = 0.1 #constant step size
    total_epochs = 300000
    report_every = 1000
    max_steps = 1000 #upper bound on the length of a single episode

    for epoch in range(total_epochs):
        #A new episode starts from the observation returned by reset
        observation = env.reset(exploring_starts=False)
        for _ in range(max_steps):
            #Random policy: draw one of the actions 0, 1, 2, 3
            action = np.random.randint(0,4)
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            utility_matrix_and = update_utility(
                utility_matrix_and, observation, new_observation,
                reward, alpha, gamma, done)
            observation = new_observation
            if done:
                break

        if epoch % report_every == 0:
            header = "Utility matrix after " + str(epoch+1) + " iterations:"
            print("")
            print(header)
            print(np.flipud(utility_matrix_and))

    #Time to check the utility matrix obtained
    footer = "Utility matrix after " + str(total_epochs) + " iterations:"
    print(footer)
    print(np.flipud(utility_matrix_and))
#Run the experiment only when the file is executed as a script,
#not when it is imported as a module
if __name__ == "__main__":
    main()