I am trying to train a DDQN agent (a self-driving car in GTAV) with some state observations and discrete actions. I have done an initial training run of 1 million steps, and the agent is now able to gather around 3,500 reward averaged over 5 episodes. I now want to continue training it with lowered epsilon and learning rate parameters. I reload the model from the checkpoint I saved, and I can see that my q_net weights are updated: when I check agent.collect_policy.trainable_variables[0] before and after reloading via "train_checkpointer.initialize_or_restore()", the values change.
However, once I continue training, the initial assessment of the model (greedy policy over 5 episodes) shows the correct behaviour - the one I saved last. But when the training loop starts, I see the collect_policy taking actions that are not even close to the greedy policy. I expected it to follow the greedy policy and take random actions along the way with a 10% chance, since epsilon is 0.1.
Am I reloading this agent incorrectly?
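To illustrate the kind of before/after check described above, here is a minimal sketch (illustrative only; it assumes the agent and train_checkpointer defined in the full script below):
# Illustrative check only: compare the first Q-network weight tensor
# before and after restoring the checkpoint.
w_before = tf.identity(agent.collect_policy.trainable_variables[0])
train_checkpointer.initialize_or_restore()
w_after = agent.collect_policy.trainable_variables[0]
print("max abs weight change after restore:",
      tf.reduce_max(tf.abs(w_after - w_before)).numpy())
The full script I use to restore the checkpoint and continue training is below.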
import gym
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import policy_saver
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
import os
import matplotlib
import matplotlib.pyplot as plt
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4"
num_iterations = 2000000 # @param {type:"integer"}
initial_collect_steps = 2000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_max_length = 1000000 # @param {type:"integer"}
batch_size = 64 # @param {type:"integer"}
learning_rate = 1e-4 # @param {type:"number"}
log_interval = 100 # @param {type:"integer"}
num_eval_episodes = 5 # @param {type:"integer"}
eval_interval = 20000 # @param {type:"integer"}
n_step_update = 2
epsilon = 0.1
gamma = 0.95
target_tau = 0.9
target_period = 1000
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

def dense_layer(num_units):
    return tf.keras.layers.Dense(
        num_units,
        activation=tf.keras.activations.relu,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2.0, mode='fan_in', distribution='truncated_normal'))

def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
## CREATE ENVIRONMENT ##
env_name = 'GtaEnv-v0'
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
## CREATE ENVIRONMENT ##
## CREATE MODEL ##
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
## CREATE MODEL ##
## CREATE AGENT ##
train_step_counter = tf.Variable(0)
agent = dqn_agent.DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    epsilon_greedy=epsilon,
    gamma=gamma,
    target_update_tau=target_tau,
    target_update_period=target_period)
agent.initialize()
## CREATE AGENT ##
# print(q_net.trainable_weights[0])
## START TRAINING STUFF ##
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)
## restore checkpointer ##
checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)
train_checkpointer.initialize_or_restore()
# print(q_net.trainable_weights[0])
eval_policy = agent.policy
collect_policy = agent.collect_policy
step = agent.train_step_counter.numpy()
print("continue from step: ")
print(step)
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)
iterator = iter(dataset)
print("Initial evaluation of greedy policy: .")
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
train_step_counter = tf.Variable(step)
print("Continue training.")
for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy)
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience)
    step = agent.train_step_counter.numpy()
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))
    if step % eval_interval == 0:
        print("Evaluating greedy policy over 5 episodes.")
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
        returns.append(avg_return)
        print(returns)
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=10000)
plt.show()
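To quantify the mismatch I see between the collect policy and the greedy policy, here is a short illustrative check (it reuses train_env and agent from the script above; with epsilon = 0.1 I would expect roughly 90% agreement):
# Illustrative only: measure how often the collect policy picks the
# same action as the greedy policy on the same time steps.
matches = 0
num_checks = 200
time_step = train_env.reset()
for _ in range(num_checks):
    greedy_action = agent.policy.action(time_step).action
    collect_action = agent.collect_policy.action(time_step).action
    matches += int(tf.reduce_all(tf.equal(greedy_action, collect_action)).numpy())
    time_step = train_env.step(collect_action)
print("collect/greedy agreement: {:.1%}".format(matches / num_checks))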
In the first training run I saved the model as follows:
checkpoint_dir = ('D:\\ReinforcementLearning\\models\\TF_v15_ddqn')
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)
....
for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, agent.collect_policy)
    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience)
    step = agent.train_step_counter.numpy()
    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))
    if step % eval_interval == 0:
        print("Evaluating greedy policy over 5 episodes.")
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
        if avg_return > max(returns):
            # Save the best policy if its return is greater than any return in the list.
            tf_best_policy_saver.save(best_policy_dir)
        returns.append(avg_return)
        print(returns)
        train_checkpointer.save(step)
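(tf_best_policy_saver and best_policy_dir are defined in the elided part of the script; roughly, they are set up along these lines using the policy_saver import above — the directory path here is only a placeholder:)
# Assumed setup for the best-policy saver referenced above; the export
# directory is a placeholder, not the one actually used.
best_policy_dir = '<path to best-policy export dir>'
tf_best_policy_saver = policy_saver.PolicySaver(agent.policy)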