
Why does the collect_policy of a DDQN agent seem unrelated to the policy when I reload from a checkpoint? #828

aonurgiray opened this issue Mar 11, 2023 · 0 comments

I am trying to train a DDQN agent (a self-driving car in GTAV) with some state observations and discrete actions. I have done an initial training run of 1 million steps, and the agent now gathers an average return of around 3,500 over 5 episodes. I now want to continue training it with lowered epsilon and learning rate parameters. I reload the model from the checkpoint I saved, and the q_net weights do appear to be restored: checking agent.collect_policy.trainable_variables[0] before and after train_checkpointer.initialize_or_restore() shows that the values change.
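
Concretely, that before/after check is roughly the following (a minimal sketch; agent and train_checkpointer are the objects created in the full script below):

# Minimal sketch of the before/after weight check; `agent` and
# `train_checkpointer` are the objects from the full script below.
before = agent.collect_policy.trainable_variables[0].numpy().copy()
train_checkpointer.initialize_or_restore()
after = agent.collect_policy.trainable_variables[0].numpy()
# The values clearly change after the restore, so the weights look loaded.
print('max abs diff after restore:', abs(after - before).max())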

However, once I continue training, the initial assessment of the model (greedy policy over 5 episodes) shows the correct behaviour, i.e. the one I saved last. But when the training loop starts, the collect_policy takes actions that are not even close to the greedy policy. I expected it to follow the greedy policy and take random actions along the way with a 10% chance, since epsilon is 0.1.
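
To quantify what I expect, a quick sanity check like the sketch below (agent and eval_env as in the script; action_mismatch_rate is just an illustrative helper, not part of my actual script) should report a disagreement rate of roughly 10% between agent.policy and agent.collect_policy, yet what I see during collection looks far more random than that.

import tensorflow as tf

# Illustrative helper only (not in my script): how often does the
# epsilon-greedy collect policy pick a different action than the greedy policy?
def action_mismatch_rate(environment, agent, num_steps=500):
  mismatches = 0
  time_step = environment.reset()
  for _ in range(num_steps):
    greedy_action = agent.policy.action(time_step).action
    collect_action = agent.collect_policy.action(time_step).action
    if not tf.reduce_all(tf.equal(greedy_action, collect_action)):
      mismatches += 1
    time_step = environment.step(greedy_action)
    if time_step.is_last():
      time_step = environment.reset()
  # With epsilon = 0.1 this should stay near (slightly below) 10%,
  # since a random action can still coincide with the greedy one.
  return mismatches / num_steps

print('collect vs greedy mismatch rate:', action_mismatch_rate(eval_env, agent))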

Am I reloading this agent incorrectly?

import gym
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import policy_saver
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
import os
import matplotlib
import matplotlib.pyplot as plt

os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4"
num_iterations = 2000000 # @param {type:"integer"}

initial_collect_steps = 2000  # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_max_length = 1000000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-4  # @param {type:"number"}
log_interval = 100  # @param {type:"integer"}

num_eval_episodes = 5  # @param {type:"integer"}
eval_interval = 20000  # @param {type:"integer"}
n_step_update = 2
epsilon = 0.1
gamma = 0.95
target_tau = 0.9
target_period = 1000

def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]


def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
          
def collect_step(environment, policy):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add the trajectory to the replay buffer.
  replay_buffer.add_batch(traj)


## CREATE ENVIRONMENT ##
env_name = 'GtaEnv-v0'
env = suite_gym.load(env_name)
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
## CREATE ENVIRONMENT ##


## CREATE MODEL ##
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]

q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))

q_net = sequential.Sequential(dense_layers + [q_values_layer])

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
## CREATE MODEL ##

## CREATE AGENT ##
train_step_counter = tf.Variable(0)

agent = dqn_agent.DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
    epsilon_greedy=epsilon,
    gamma=gamma,
    target_update_tau=target_tau,
    target_update_period=target_period)

agent.initialize()
## CREATE AGENT ##
# print(q_net.trainable_weights[0])

## START TRAINING STUFF ##


replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

## restore checkpointer ##
checkpoint_dir = 'D:\\ReinforcementLearning\\models\\TF_v15_ddqn'
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)


train_checkpointer.initialize_or_restore()
# print(q_net.trainable_weights[0])

eval_policy = agent.policy
collect_policy = agent.collect_policy

step = agent.train_step_counter.numpy()
print("continue from step: ")
print(step)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=batch_size,
    num_steps=n_step_update + 1).prefetch(3)

iterator = iter(dataset)

print("Initial evaluation of greedy policy: .")
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

train_step_counter = tf.Variable(step)


print("Continue training.")
for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    print("Evaluation greedy policy over 5 episodes. ")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))

    returns.append(avg_return)
    print(returns)

iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=10000)
plt.show()

In the first training run, I saved the model as:

checkpoint_dir = 'D:\\ReinforcementLearning\\models\\TF_v15_ddqn'
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step_counter
)


....


for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  for _ in range(collect_steps_per_iteration):
    collect_step(train_env, agent.collect_policy)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience)

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss.loss))

  if step % eval_interval == 0:
    print("Evaluation greedy policy over 5 episodes. ")
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:.2f}'.format(step, avg_return))
    if avg_return > max(returns):
      # Save the best policy when its return exceeds every return recorded so far.
      tf_best_policy_saver.save(best_policy_dir)

    returns.append(avg_return)
    print(returns)
    train_checkpointer.save(step)
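
tf_best_policy_saver and best_policy_dir are defined earlier in that script and omitted here; a typical setup using the policy_saver module imported at the top would look roughly like this (the directory path is only a placeholder):

# Sketch only: the directory path below is a placeholder, not my real one.
best_policy_dir = 'D:\\ReinforcementLearning\\models\\best_policy_placeholder'
tf_best_policy_saver = policy_saver.PolicySaver(agent.policy)
# Calling tf_best_policy_saver.save(best_policy_dir) exports the greedy policy as a SavedModel.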