# neural_Qtrain_gen.py
import os
import sys
import gym
import tensorflow as tf
import numpy as np
import random
import datetime
"""
Hyper Parameters
"""
GAMMA = 0.99 # discount factor for target Q
INITIAL_EPSILON = 0.6 # starting value of epsilon
FINAL_EPSILON = 0.0 # final value of epsilon
EPSILON_DECAY_STEPS = 400
REPLAY_SIZE = 10000 # experience replay buffer size
BATCH_SIZE = 256 # size of minibatch
TEST_FREQUENCY = 10 # How many episodes to run before visualizing test accuracy
SAVE_FREQUENCY = 1000 # How many episodes to run before saving model (unused)
NUM_EPISODES = 3000 # Episode limitation
EP_MAX_STEPS = 800 # Step limitation in an episode
# The number of test iters (with epsilon set to 0) to run every TEST_FREQUENCY episodes
NUM_TEST_EPS = 3
HIDDEN_NODES = 100
RENDER = False
env_space_min = None
env_space_max = None
# target_network
target_network = None
network_vars = None  # populated in setup()

def init(env, env_name):
    """
    Initialise any globals, e.g. the replay_buffer, epsilon, etc.
    return:
        state_dim: The length of the state vector for the env
        action_dim: The length of the action space, i.e. the number of actions

    NB: for discrete action envs such as the cartpole and mountain car, this
    function can be left unchanged.

    Hints for envs with continuous action spaces, e.g. "Pendulum-v0"
    1) you'll need to modify this function to discretise the action space and
    create a global dictionary mapping from action index to action (which you
    can use in `get_env_action()`)
    2) for Pendulum-v0 `env.action_space.low[0]` and `env.action_space.high[0]`
    are the limits of the action space.
    3) setting a global flag iscontinuous which you can use in `get_env_action()`
    might help in using the same code for discrete and (discretised) continuous
    action spaces
    """
    global replay_buffer, epsilon, is_continuous, env_space_max, env_space_min
    replay_buffer = []
    epsilon = INITIAL_EPSILON
    state_dim = env.observation_space.shape[0]
    if env_name == 'Pendulum-v0':
        env_space_min = env.action_space.low[0]
        env_space_max = env.action_space.high[0]
        action_dim = 100
        is_continuous = True
    else:
        is_continuous = False
        action_dim = env.action_space.n
    return state_dim, action_dim

def get_network(state_dim, action_dim, hidden_nodes=HIDDEN_NODES):
    """
    Define the neural network used to approximate the q-function

    The suggested structure is to have each output node represent a Q value for
    one action. e.g. for cartpole there will be two output nodes.

    Hints:
    1) Given how q-values are used within RL, is it necessary to have output
    activation functions?
    2) You will set `target_in` in `get_train_batch` further down. Probably best
    to implement that before implementing the loss (there are further hints there)
    """
    state_in = tf.placeholder("float", [None, state_dim])
    action_in = tf.placeholder("float", [None, action_dim])  # one hot
    # q value for the target network for the state, action taken
    target_in = tf.placeholder("float", [None])

    # Q network: input is state_in, and there are action_dim outputs, which are
    # the network's estimation of the Q values for those actions and the input
    # state. The final layer is assigned to the variable q_values.
    w1 = weight_variable([state_dim, hidden_nodes])
    b1 = bias_variable([hidden_nodes])
    y1 = tf.nn.relu(tf.matmul(state_in, w1) + b1)
    w2 = weight_variable([int(y1.get_shape()[1]), hidden_nodes])
    b2 = bias_variable([hidden_nodes])
    y2 = tf.nn.tanh(tf.matmul(y1, w2) + b2)
    w3 = weight_variable([int(y2.get_shape()[1]), action_dim])
    b3 = bias_variable([action_dim])
    q_values = tf.matmul(y2, w3) + b3

    q_selected_action = tf.reduce_sum(
        tf.multiply(q_values, action_in), reduction_indices=1)

    # Loss: mean squared error between the (fixed) target and the Q value the
    # network assigns to the action that was actually taken.
    loss = tf.reduce_mean(tf.square(target_in - q_selected_action))
    optimise_step = tf.train.AdamOptimizer().minimize(loss)

    train_loss_summary_op = tf.summary.scalar("TrainingLoss", loss)
    return (state_in, action_in, target_in, q_values, q_selected_action, loss,
            optimise_step, train_loss_summary_op, w1, b1, w2, b2, w3, b3)

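# Note on the architecture above (describing what is built, not a prescribed
# design): the Q-network is a two-hidden-layer MLP, state_dim -> HIDDEN_NODES
# (ReLU) -> HIDDEN_NODES (tanh) -> action_dim, with a linear output layer so the
# Q values are unbounded, as hint (1) in the docstring suggests.
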
def weight_variable(shape):
    initial = tf.truncated_normal(shape)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial)

def init_session():
    global session, writer
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    # Setup Logging
    logdir = "tensorboard/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, session.graph)

def get_action(state, state_in, q_values, epsilon, test_mode, action_dim):
    Q_estimates = q_values.eval(feed_dict={state_in: [state]})[0]
    epsilon_to_use = 0.0 if test_mode else epsilon
    if random.random() < epsilon_to_use:
        action = random.randint(0, action_dim - 1)
    else:
        action = np.random.choice(
            np.flatnonzero(Q_estimates == Q_estimates.max()))
    return action

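# Note: in the epsilon-greedy policy above, ties between equally-valued actions
# are broken at random (np.random.choice over np.flatnonzero) rather than with a
# plain np.argmax; early in training, when many Q estimates are still equal, a
# plain argmax would always return the first action.
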
def get_env_action(action):
    """
    Modify for continuous action spaces that you have discretised, see hints in
    `init()`
    """
    global is_continuous
    if is_continuous:
        # Map the discrete action index 0..99 back onto the continuous range
        # [env_space_min, env_space_max]; 99 == action_dim - 1 for Pendulum.
        discrete = env_space_min + (action / 99.0) * (env_space_max - env_space_min)
        return [discrete]
    else:
        return action

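# Worked example of the discretisation above (a sketch; it assumes Pendulum-v0's
# usual torque limits of low = -2.0 and high = +2.0):
#   action 0  -> [-2.0]         (env_space_min)
#   action 50 -> [ 0.0202...]   (-2.0 + 50/99 * 4.0)
#   action 99 -> [ 2.0]         (env_space_max)
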
def update_replay_buffer(replay_buffer, state, action, reward, next_state, done,
                         action_dim):
    """
    Update the replay buffer with provided input in the form:
    (state, one_hot_action, reward, next_state, done)

    Hint: the minibatch passed to do_train_step is one entry (randomly sampled)
    from the replay_buffer
    """
    # encode the action as a one-hot vector
    one_hot_action = [0] * action_dim
    one_hot_action[action] = 1
    # append to buffer
    replay_buffer.append((state, one_hot_action, reward, next_state, done))
    # Ensure replay_buffer doesn't grow larger than REPLAY_SIZE
    if len(replay_buffer) > REPLAY_SIZE:
        replay_buffer.pop(0)
    return None

def do_train_step(replay_buffer, state_in, action_in, target_in,
                  q_values, q_selected_action, loss, optimise_step,
                  train_loss_summary_op, batch_presentations_count):
    target_batch, state_batch, action_batch = \
        get_train_batch(q_values, state_in, replay_buffer)

    summary, _ = session.run([train_loss_summary_op, optimise_step], feed_dict={
        target_in: target_batch,
        state_in: state_batch,
        action_in: action_batch
    })
    writer.add_summary(summary, batch_presentations_count)

def get_train_batch(q_values, state_in, replay_buffer):
    """
    Generate batch samples for training by sampling the replay buffer.
    Batch values are suggested to be the following:
        state_batch: Batch of state values
        action_batch: Batch of action values
        target_batch: Target batch for (s,a) pair i.e. one application
            of the bellman update rule.

    return:
        target_batch, state_batch, action_batch

    Hints:
    1) To calculate the target batch values, you will need to use the
    q_values for the next_state for each entry in the batch.
    2) The target value, combined with your loss defined in `get_network()` should
    reflect the equation in the middle of slide 12 of Deep RL 1 Lecture
    notes here: https://webcms3.cse.unsw.edu.au/COMP9444/17s2/resources/12494
    """
    minibatch = random.sample(replay_buffer, BATCH_SIZE)
    state_batch = [data[0] for data in minibatch]
    action_batch = [data[1] for data in minibatch]
    reward_batch = [data[2] for data in minibatch]
    next_state_batch = [data[3] for data in minibatch]

    eval_q_values = q_values
    eval_state = state_in
    global target_network
    target_q_values = target_network[3]
    target_state = target_network[0]

    # evaluate the next states under both the online and the target network
    target_batch = []
    Q_value_batch = eval_q_values.eval(feed_dict={
        eval_state: next_state_batch
    })
    Q_target_batch = target_q_values.eval(feed_dict={
        target_state: next_state_batch
    })

    for i in range(0, BATCH_SIZE):
        sample_is_done = minibatch[i][4]
        if sample_is_done:
            target_batch.append(reward_batch[i])
        else:
            # the online network selects the best next action, and the target
            # network supplies its Q value
            max_index = np.argmax(Q_value_batch[i])
            selected_q_next = Q_target_batch[i][max_index]
            target_val = reward_batch[i] + GAMMA * selected_q_next
            target_batch.append(target_val)
    return target_batch, state_batch, action_batch

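# For reference, the target computed above is:
#   target = r                                                   if s' is terminal
#   target = r + GAMMA * Q_target(s', argmax_a Q_online(s', a))  otherwise
# i.e. a Double-DQN-flavoured variant of the one-step Bellman backup, with the
# next action chosen by the online network and evaluated by the target network.
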
def qtrain(env, state_dim, action_dim,
           state_in, action_in, target_in, q_values, q_selected_action,
           loss, optimise_step, train_loss_summary_op, w1, b1, w2, b2, w3, b3,
           num_episodes=NUM_EPISODES, ep_max_steps=EP_MAX_STEPS,
           test_frequency=TEST_FREQUENCY, num_test_eps=NUM_TEST_EPS,
           final_epsilon=FINAL_EPSILON, epsilon_decay_steps=EPSILON_DECAY_STEPS,
           force_test_mode=False, render=True):
    global epsilon, target_network
    target_network = get_network(state_dim, action_dim)

    # Build the ops that copy the online network's weights into the target
    # network once, outside the episode loop, so the graph does not grow with
    # every update.
    copy_to_target_ops = [
        tf.assign(target_network[8], w1), tf.assign(target_network[9], b1),
        tf.assign(target_network[10], w2), tf.assign(target_network[11], b2),
        tf.assign(target_network[12], w3), tf.assign(target_network[13], b3)]

    # Record the number of times we do a training batch, take a step
    batch_presentations_count = total_steps = total_reward = 0

    for episode in range(num_episodes):
        # initialize task
        state = env.reset()
        if render: env.render()

        # Update epsilon once per episode - exponential decay towards final_epsilon
        epsilon -= (epsilon - final_epsilon) / epsilon_decay_steps

        # in test mode we set epsilon to 0
        test_mode = force_test_mode or \
            ((episode % test_frequency) < num_test_eps and
             episode > num_test_eps)

        # update the target network every 5 episodes
        if episode % 5 == 0:
            session.run(copy_to_target_ops)

        if episode + 100 >= num_episodes:
            test_mode = True

        ep_reward = 0
        for step in range(ep_max_steps):
            total_steps += 1

            # get an action and take a step in the environment
            action = get_action(state, state_in, q_values, epsilon, test_mode,
                                action_dim)
            env_action = get_env_action(action)
            next_state, reward, done, _ = env.step(env_action)
            ep_reward += reward

            # display the updated environment
            if render: env.render()  # comment this line to possibly reduce training time

            # add the s,a,r,s' samples to the replay_buffer
            update_replay_buffer(replay_buffer, state, action, reward,
                                 next_state, done, action_dim)
            state = next_state

            # perform a training step if the replay_buffer has a batch worth of samples
            if len(replay_buffer) > BATCH_SIZE:
                do_train_step(replay_buffer, state_in, action_in, target_in,
                              q_values, q_selected_action, loss, optimise_step,
                              train_loss_summary_op, batch_presentations_count)
                batch_presentations_count += 1

            if done:
                break

        total_reward += ep_reward
        test_or_train = "test" if test_mode else "train"
        print("end {0} episode {1}, ep reward: {2}, ave reward: {3}, "
              "Batch presentations: {4}, epsilon: {5}".format(
                  test_or_train, episode, ep_reward, total_reward / (episode + 1),
                  batch_presentations_count, epsilon))

def setup():
    default_env_name = 'CartPole-v0'
    # default_env_name = 'MountainCar-v0'
    # default_env_name = 'Pendulum-v0'
    # if env_name provided as cmd line arg, then use that
    env_name = sys.argv[1] if len(sys.argv) > 1 else default_env_name
    env = gym.make(env_name)
    state_dim, action_dim = init(env, env_name)
    network_vars = get_network(state_dim, action_dim)
    init_session()
    return env, state_dim, action_dim, network_vars

def main():
    env, state_dim, action_dim, network_vars = setup()
    qtrain(env, state_dim, action_dim, *network_vars, render=False)


if __name__ == "__main__":
    main()
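
# Example invocation (a sketch; assumes gym's classic-control environments and
# TensorFlow 1.x are installed, matching the tf.placeholder/Session API used above):
#   python neural_Qtrain_gen.py                  # CartPole-v0 (the default)
#   python neural_Qtrain_gen.py MountainCar-v0
#   python neural_Qtrain_gen.py Pendulum-v0      # uses the discretised action space
# TensorBoard logs are written under tensorboard/<timestamp>/.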