from tf_agents.environments import suite_gym, tf_py_environment
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent, ppo_clip_agent
from networks import create_networks
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
import matplotlib.pyplot as plt
import numpy as np
env_name = 'LunarLander-v2'
# env_name = 'CartPole-v0'
# env_name = 'Pendulum-v0'
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
learning_rate = 1e-3
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
actor_net, value_net = create_networks(train_env, lstm=False, custom_nn=False)
train_step_counter = tf.Variable(0)
tf_agent = ppo_clip_agent.PPOClipAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    gradient_clipping=0.5,
    entropy_regularization=0.0,
    importance_ratio_clipping=0.2,
    normalize_observations=False,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=1,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    train_step_counter=train_step_counter)
tf_agent.initialize()
replay_buffer_capacity = 1001
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,  # num_parallel_environments
    max_length=replay_buffer_capacity)
# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
total_episodes = 5000
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy
collect_episodes_per_iteration = 5
collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    train_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=collect_episodes_per_iteration)
for episode in range(total_episodes):
    # Collect a few full episodes with the collect policy, then train on them
    collect_driver.run()
    trajectories = replay_buffer.gather_all()
    train_loss, extra = tf_agent.train(experience=trajectories)
    print(f'Episode = {episode}; Train loss = {train_loss}')
    step = tf_agent.train_step_counter.numpy()
    replay_buffer.clear()

    # Run one evaluation episode with the greedy policy and render it
    prev_time_step = eval_env.reset()
    # prev_time_step = train_env.reset()  # For collect policy visualization
    episodic_reward = 0
    while True:
        eval_env.render()
        action_step = eval_policy.action(prev_time_step, eval_policy.get_initial_state(batch_size=1))
        # action_step = collect_policy.action(prev_time_step, collect_policy.get_initial_state(batch_size=1))  # Run collect policy
        time_step = eval_env.step(action_step.action)
        episodic_reward += time_step.reward[0]
        # End this episode if last step
        if time_step.is_last():
            break
        prev_time_step = time_step

    ep_reward_list.append(episodic_reward)
    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(episode, avg_reward))
    avg_reward_list.append(avg_reward)
# Plotting graph
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()
I have tried changing the network layer parameters, the learning rate, a prioritized replay buffer, other environments, PPOAgent vs. PPOClipAgent (sketched after the logs below), other agent hyperparameters, other replay buffer capacities, and other drivers and their parameters, but unlike the agent in the tutorial (github repo), my PPO agent doesn't train correctly for any number of steps:
Episode = 3000; Train loss = 2271.637939453125
Episode * 3000 * Avg Reward is ==> -398.26611328125
Episode = 3001; Train loss = 9885.1669921875
Episode * 3001 * Avg Reward is ==> -428.7572326660156
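For reference, the non-clip PPOAgent variant I also tried was wired up roughly like the sketch below (same networks and optimizer as above; the hyperparameters shown are illustrative, not my exact values):
from tf_agents.agents.ppo import ppo_agent

# Sketch only: adaptive-KL PPOAgent swapped in for PPOClipAgent,
# reusing the same actor_net, value_net and optimizer as above.
tf_agent = ppo_agent.PPOAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    num_epochs=1,
    use_gae=True,
    normalize_observations=False,
    normalize_rewards=False,
    gradient_clipping=0.5,
    train_step_counter=train_step_counter)
tf_agent.initialize()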
I'm trying to implement a PPO agent to play LunarLander-v2 with the tf_agents library, the way it is done in this tutorial (github repo). The main.py above is my full training script, and create_networks is imported from my networks.py.
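networks.py isn't pasted here; with lstm=False and custom_nn=False, create_networks builds a plain actor/value network pair along the lines of this sketch (the fc_layer_params values are placeholders, not my exact layer sizes):
from tf_agents.networks import actor_distribution_network, value_network

def create_networks(train_env, lstm=False, custom_nn=False):
    # Sketch of the non-LSTM, non-custom branch; layer sizes are placeholders.
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=(64, 64))
    value_net = value_network.ValueNetwork(
        train_env.observation_spec(),
        fc_layer_params=(64, 64))
    return actor_net, value_net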
The Jupyter notebook is here: https://colab.research.google.com/drive/1hOPP4uG7izcLrO9prbUEilo5U_Kz1DWr?usp=sharing
Where did I go wrong?