q_approximation.lua
local validator = require 'validator'
local Algebra = require 'algebra'
local Matrix = Algebra.Matrix
local Vector = Algebra.Vector
local CSV = require 'csv_helper'
------------------------------------------------------
------------------------------------------------------
-- Q(λ) with Linear Function Approximation Algorithm
------------------------------------------------------
------------------------------------------------------
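-- For reference, the functions below implement the usual linear-approximation
-- Q(lambda) update; this summary only restates what the code does:
--
--   Q(s, a)   = theta[a] . phi(s)                                (q_value)
--   delta     = r + gamma * max_a' Q(s', a') - Q(s, a)           (learn)
--   e[a][i]  += 1  for every feature i active in s               (accumulating_eligibility_traces)
--   e         = gamma * lambda * e   after a greedy action       (exploitation_eligibility_traces)
--   e         = 0                    after an exploratory action (null_eligibility_traces)
--   theta     = theta + alpha * delta * e                        (theta_update)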
local alpha = 0
local gamma = 0
local lambda = 0
local epsilon = 0
local theta = { }
local actions = 0
local choosed_action = 0
local state_features = { }
local eligibility_traces = { }
local active_state_features = { }
local first_step = true
----------------------------------------------------------------------
-- Determine state-action features from state-features for a state S
----------------------------------------------------------------------
local observe_active_state_features = function()
    local fn = function(state_feature) return state_feature() end
    return Vector.map(state_features, fn)
end
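-- Each entry of state_features is expected to be a zero-argument function
-- returning the current activation of one feature (the trace update below
-- treats any value > 0 as "active"). A hypothetical example, not part of
-- this module and assuming a robot with a proximity sensor:
--
--   local obstacle_ahead = function()
--       return robot.proximity[1].value > 0.5 and 1 or 0
--   end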
local print_vector = function(vector)
    local stringify = "[ "
    for i=1, #vector do
        stringify = stringify .. i .. "=" .. vector[i] .. ", "
    end
    print(stringify .. " ]")
end

local print_matrix = function(matrix)
    for r=1, #matrix do
        print_vector(matrix[r])
    end
end
---------------------------------------
-- Update rules of eligibility traces
---------------------------------------
local null_eligibility_traces = function()
    eligibility_traces = Matrix.create(actions, #state_features)
end

local accumulating_eligibility_traces = function(action, active_state_features)
    for feature_index=1, #eligibility_traces[action] do
        if active_state_features[feature_index] > 0 then
            local previous_eligibility_value = eligibility_traces[action][feature_index]
            eligibility_traces[action][feature_index] = previous_eligibility_value + 1
        end
    end
end

local exploitation_eligibility_traces = function()
    local exploitation_fn = function(eligibility_value)
        return gamma*lambda*eligibility_value
    end
    eligibility_traces = Matrix.map(eligibility_traces, exploitation_fn)
end
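-- Both theta and the eligibility traces are actions x #state_features matrices,
-- so every weight theta[a][i] has a matching trace e[a][i].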
------------------------------
-- All operations on actions
------------------------------
-- Single action with max Q-value
local max_action_and_q = function(theta, active_state_features)
    local q_values = Matrix.vector_product(theta, active_state_features)
    -- Vector.max is expected to return the indices of all maximal Q-values;
    -- ties are broken uniformly at random
    local max_qs = Vector.max(q_values)
    local sample = robot.random.uniform_int(1, #max_qs)
    local action = max_qs[sample]
    return action, q_values[action]
end
-------------------------------
-- All operations on Q-values
-------------------------------
-- Q-value for a given action: based only on state-feature present in s,a
local q_value = function(action, active_state_features)
    return Vector.scalar_product(active_state_features, theta[action])
end
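-- Only theta[action] is involved here: each action keeps its own weight vector,
-- and Q(s, a) is the scalar product between that vector and the active features of s.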
-------------------------------------
-- All operations on weights - theta
-------------------------------------
local theta_update = function(delta)
    local e_fn = function(eligibility_value)
        return alpha*delta*eligibility_value
    end
    local eligibility_contribution = Matrix.map(eligibility_traces, e_fn)
    theta = Matrix.addition(eligibility_contribution, theta)
end
local random_weights = function()
    local random_generator = function(r, c)
        return robot.random.uniform()
    end
    return Matrix.create(actions, #state_features, random_generator)
end
-----------------------------
-- The kernel of the LFA-Q(λ)
-----------------------------
local config = function(state_action_space, weights, hyperparameters, initial_action)
    validator.check_state_action_space(state_action_space)
    validator.check_weights(weights, state_action_space)
    validator.check_hyperparameters(hyperparameters)
    validator.check_initial_action(initial_action, state_action_space)
    alpha = hyperparameters.alpha
    gamma = hyperparameters.gamma
    lambda = hyperparameters.lambda
    epsilon = hyperparameters.epsilon
    actions = state_action_space.actions
    state_features = state_action_space.state_features
    theta = next(weights) and weights or random_weights()
    -- fall back to a random integer action index when no initial action is given
    choosed_action = initial_action or robot.random.uniform_int(1, actions)
    CSV.create_csv("action_choosed.csv", { "phase", "action" })
    CSV.append("action_choosed.csv", { "designer", initial_action })
    CSV.create_csv("cumulative_return.csv", { "cumulative return", "max Q", "max action", "action performed" })
end
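-- A minimal configuration sketch, matching the fields config reads above; the
-- concrete numbers and feature functions are illustrative, not prescribed:
--
--   local q_learning = require 'q_approximation'
--   q_learning.config(
--       { actions = 4, state_features = { feature_1, feature_2 } }, -- state_action_space
--       { },                                                        -- weights: empty table -> random initialisation
--       { alpha = 0.1, gamma = 0.9, lambda = 0.8, epsilon = 0.1 },  -- hyperparameters
--       1                                                           -- initial_action
--   )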
local start_episode = function()
    null_eligibility_traces()
    active_state_features = observe_active_state_features()
end
local learn = function(reward)
    -- Q-value of the previously chosen action, using the features observed before this step
    local q_a = q_value(choosed_action, active_state_features)
    active_state_features = observe_active_state_features()
    local max_action, max_q = max_action_and_q(theta, active_state_features)
    -- Q-learning target: r + gamma * max_a Q(s', a)
    local cumulative_return = reward + gamma * max_q
    CSV.append("cumulative_return.csv", { cumulative_return, max_q, max_action, choosed_action })
    -- TD error, propagated to the weights through the eligibility traces
    local learning_error = cumulative_return - q_a
    theta_update(learning_error)
end
local epsilon_greedy_strategy = function()
    local action = robot.random.uniform_int(1, actions)
    local sample = robot.random.uniform()
    if sample < 1 - epsilon then
        -- exploit: take the greedy action and decay the traces by gamma * lambda
        action = max_action_and_q(theta, active_state_features)
        exploitation_eligibility_traces()
        CSV.append("action_choosed.csv", { "exploitation", action })
    else
        -- explore: keep the random action and reset the traces (Watkins-style Q(lambda))
        null_eligibility_traces()
        CSV.append("action_choosed.csv", { "exploration", action })
    end
    return action
end
local stop_episode = function()
    return theta
end
-----------------------------------
-- Adapter for ARGoS3 control loop
-----------------------------------
local q_step_argos = function(reward, goal_state)
    -- on the very first step there is no previous transition to learn from
    if first_step then
        first_step = false
        return false, choosed_action
    end
    accumulating_eligibility_traces(choosed_action, active_state_features)
    learn(reward)
    if goal_state() then
        return true, 0
    end
    choosed_action = epsilon_greedy_strategy()
    return false, choosed_action
end
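-- q_step_argos returns two values: whether the episode has terminated
-- (goal_state() returned true) and the index of the action to perform next
-- (0 once the episode has terminated).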
return {
    config = config,
    start_episode = start_episode,
    q_step_argos = q_step_argos,
    stop_episode = stop_episode
}
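--[[
One possible wiring inside an ARGoS3 Lua controller (a sketch only: the reward
function, the goal predicate, the actuation helper and the configuration tables
are application-specific and not part of this module):

    local q_learning = require 'q_approximation'

    function init()
        q_learning.config(state_action_space, weights, hyperparameters, initial_action)
        q_learning.start_episode()
    end

    function step()
        local episode_finished, action = q_learning.q_step_argos(compute_reward(), is_goal_state)
        if not episode_finished then
            perform(action)
        end
    end

    function reset()
        local learned_weights = q_learning.stop_episode()
        q_learning.start_episode()
    end
--]]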