From c963be29efcf1239df831f48e7878d5b273c2c75 Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Mon, 5 Apr 2021 12:47:28 +0200
Subject: [PATCH 01/62] Remove HER

---
 utils/exp_manager.py | 7 -------
 utils/utils.py       | 4 ++--
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/utils/exp_manager.py b/utils/exp_manager.py
index ce4e702bf..8b24a96e0 100644
--- a/utils/exp_manager.py
+++ b/utils/exp_manager.py
@@ -21,7 +21,6 @@
 from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike  # noqa: F401
 from stable_baselines3.common.utils import constant_fn
 from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecFrameStack, VecNormalize, VecTransposeImage
-from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper

 # For custom activation fn
 from torch import nn as nn  # noqa: F401
@@ -522,12 +521,6 @@ def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False)
                 print("Wrapping into a VecTransposeImage")
             env = VecTransposeImage(env)

-        # check if wrapper for dict support is needed
-        if self.algo == "her":
-            if self.verbose > 0:
-                print("Wrapping into a ObsDictWrapper")
-            env = ObsDictWrapper(env)
-
         return env

     def _load_pretrained_agent(self, hyperparams: Dict[str, Any], env: VecEnv) -> BaseAlgorithm:
diff --git a/utils/utils.py b/utils/utils.py
index b2751e1d1..03af1427f 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -9,7 +9,7 @@
 import torch as th  # noqa: F401
 import yaml
 from sb3_contrib import QRDQN, TQC
-from stable_baselines3 import A2C, DDPG, DQN, HER, PPO, SAC, TD3
+from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
 from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.env_util import make_vec_env
 from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike  # noqa: F401
@@ -23,7 +23,7 @@
     "ddpg": DDPG,
     "dqn": DQN,
     "ppo": PPO,
-    "her": HER,
+    # "her": HER,
     "sac": SAC,
     "td3": TD3,
     # SB3 Contrib,

From 40ce4c549e8237345a6e70a81675f279610382d7 Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Mon, 5 Apr 2021 16:31:26 +0200
Subject: [PATCH 02/62] Add basic support for refactored HER

---
 hyperparams/sac.yml  | 22 +++++++++++++++++++++-
 utils/exp_manager.py | 12 +++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml
index ddb609450..d3a60de33 100644
--- a/hyperparams/sac.yml
+++ b/hyperparams/sac.yml
@@ -102,7 +102,7 @@ HalfCheetahBulletEnv-v0:
   #       delay: 10
   #   - utils.wrappers.HistoryWrapper:
   #       horizon: 10
-  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
+  # env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
   n_timesteps: !!float 1e6
   policy: 'MlpPolicy'
   learning_rate: !!float 7.3e-4
@@ -114,6 +114,7 @@ HalfCheetahBulletEnv-v0:
   train_freq: 64
   gradient_steps: 64
   learning_starts: 10000
+  replay_buffer_kwargs: "dict(handle_timeout_termination=True)"
   use_sde: True
   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

@@ -285,6 +286,25 @@ CarRacing-v0:
   use_sde_at_warmup: True
   policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"

+# === HER Robotics GoalEnvs ===
+
+FetchReach-v1:
+  n_timesteps: !!float 20000
+  policy: 'MultiInputPolicy'
+  buffer_size: 1000000
+  ent_coef: 'auto'
+  batch_size: 256
+  gamma: 0.95
+  learning_rate: 0.001
+  learning_starts: 1000
+  normalize: True
+  replay_buffer_class: HerReplayBuffer
+  replay_buffer_kwargs: "dict(
+    online_sampling=True,
+    goal_selection_strategy='future',
+    n_sampled_goal=4
+  )"
+
 # ==== Custom Envs ===
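
The FetchReach-v1 entry above illustrates the refactored API: HER is no longer a standalone algorithm but a HerReplayBuffer class handed to an off-policy algorithm. A minimal sketch of the equivalent plain Stable-Baselines3 (>= 1.1.0) code follows, not part of the patch: it assumes the gym robotics (MuJoCo) envs are installed, "normalize: True" (VecNormalize) is handled by the zoo's ExperimentManager and therefore omitted, and verbose/total_timesteps are only illustrative.

import gym

from stable_baselines3 import SAC, HerReplayBuffer

# FetchReach-v1 is a GoalEnv with a dict observation space
# (observation / achieved_goal / desired_goal), hence MultiInputPolicy.
env = gym.make("FetchReach-v1")  # requires the gym robotics (MuJoCo) envs

model = SAC(
    "MultiInputPolicy",
    env,
    buffer_size=1_000_000,
    batch_size=256,
    gamma=0.95,
    learning_rate=1e-3,
    learning_starts=1000,
    ent_coef="auto",
    # HER is now configured through the replay buffer:
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        online_sampling=True,
        goal_selection_strategy="future",
        n_sampled_goal=4,
    ),
    verbose=1,
)
model.learn(total_timesteps=20_000)
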
donkey-generated-track-v0: diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 8b24a96e0..f3fa25a6a 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -13,6 +13,8 @@ from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler +# For using HER with GoalEnv +from stable_baselines3 import HerReplayBuffer # noqa: F401 from stable_baselines3.common.base_class import BaseAlgorithm from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback, EvalCallback from stable_baselines3.common.env_util import make_vec_env @@ -322,11 +324,11 @@ def _preprocess_hyperparams( # Pre-process normalize config hyperparams = self._preprocess_normalization(hyperparams) - # Pre-process policy keyword arguments - if "policy_kwargs" in hyperparams.keys(): - # Convert to python object if needed - if isinstance(hyperparams["policy_kwargs"], str): - hyperparams["policy_kwargs"] = eval(hyperparams["policy_kwargs"]) + # Pre-process policy/buffer keyword arguments + # Convert to python object if needed + for kwargs_key in {"policy_kwargs", "replay_buffer_class", "replay_buffer_kwargs"}: + if kwargs_key in hyperparams.keys() and isinstance(hyperparams[kwargs_key], str): + hyperparams[kwargs_key] = eval(hyperparams[kwargs_key]) # Delete keys so the dict can be pass to the model constructor if "n_envs" in hyperparams.keys(): From 7fad10456d070d8389f8dee49a3cedf1ae3a1df6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 7 Apr 2021 19:49:47 +0200 Subject: [PATCH 03/62] Add TQC --- hyperparams/sac.yml | 1 + hyperparams/tqc.yml | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index d3a60de33..5cee64f1a 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -304,6 +304,7 @@ FetchReach-v1: goal_selection_strategy='future', n_sampled_goal=4 )" + policy_kwargs: "dict(net_arch=[64, 64])" # ==== Custom Envs === diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index c12bf6fc9..39fd07113 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -206,3 +206,23 @@ InvertedPendulumSwingupBulletEnv-v0: learning_starts: 10000 use_sde: True policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" + +# === HER Robotics GoalEnvs === + +FetchReach-v1: + n_timesteps: !!float 20000 + policy: 'MultiInputPolicy' + buffer_size: 1000000 + ent_coef: 'auto' + batch_size: 256 + gamma: 0.95 + learning_rate: 0.001 + learning_starts: 1000 + normalize: True + replay_buffer_class: HerReplayBuffer + replay_buffer_kwargs: "dict( + online_sampling=True, + goal_selection_strategy='future', + n_sampled_goal=4 + )" + policy_kwargs: "dict(net_arch=[64, 64], n_critics=1)" From b4d26c9e9d5306f88163add9000b40fa9e6f9cb7 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 27 Apr 2021 16:21:32 +0200 Subject: [PATCH 04/62] Add space engineer env --- hyperparams/tqc.yml | 26 ++++++++++++++++++++++++++ utils/import_envs.py | 5 +++++ 2 files changed, 31 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index c12bf6fc9..66acf6bad 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -206,3 +206,29 @@ InvertedPendulumSwingupBulletEnv-v0: learning_starts: 10000 use_sde: True policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" + +# Space Engineers envs + +SpaceEngineers-WalkingRobot-IK-v0: + env_wrapper: + - utils.wrappers.HistoryWrapper: + horizon: 2 + - 
utils.wrappers.TimeFeatureWrapper: + test_mode: False + n_timesteps: !!float 2e6 + policy: 'MlpPolicy' + learning_rate: !!float 7.3e-4 + buffer_size: 100000 + batch_size: 256 + ent_coef: 'auto' + gamma: 0.99 + tau: 0.01 + train_freq: [1, "episode"] + gradient_steps: -1 + learning_starts: 1000 + use_sde_at_warmup: True + use_sde: True + sde_sample_freq: 4 + # TODO: try with different value: + # top_quantiles_to_drop_per_net: 5 + policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], n_critics=2)" diff --git a/utils/import_envs.py b/utils/import_envs.py index faf26f8cb..6628a093c 100644 --- a/utils/import_envs.py +++ b/utils/import_envs.py @@ -27,3 +27,8 @@ import gym_donkeycar # pytype: disable=import-error except ImportError: gym_donkeycar = None + +try: + import gym_space_engineers # pytype: disable=import-error +except ImportError: + gym_space_engineers = None From e7e9ee6dee1c9114bc0a1d7276eb5589762dc50b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 29 Apr 2021 10:09:09 +0200 Subject: [PATCH 05/62] Update hyperparams --- hyperparams/tqc.yml | 65 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 39fd07113..30bfcf340 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -226,3 +226,68 @@ FetchReach-v1: n_sampled_goal=4 )" policy_kwargs: "dict(net_arch=[64, 64], n_critics=1)" + +FetchPush-v1: + n_timesteps: !!float 1e6 + policy: 'MultiInputPolicy' + buffer_size: 1000000 + batch_size: 2048 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + replay_buffer_class: HerReplayBuffer + replay_buffer_kwargs: "dict( + online_sampling=True, + goal_selection_strategy='future', + n_sampled_goal=4 + )" + policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" + +FetchSlide-v1: + n_timesteps: !!float 3e6 + policy: 'MultiInputPolicy' + buffer_size: 1000000 + batch_size: 2048 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + replay_buffer_class: HerReplayBuffer + replay_buffer_kwargs: "dict( + online_sampling=True, + goal_selection_strategy='future', + n_sampled_goal=4 + )" + policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" + +FetchPickAndPlace-v1: + n_timesteps: !!float 1e6 + policy: 'MultiInputPolicy' + buffer_size: 1000000 + batch_size: 1024 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + replay_buffer_class: HerReplayBuffer + replay_buffer_kwargs: "dict( + online_sampling=True, + goal_selection_strategy='future', + n_sampled_goal=4 + )" + policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" + +parking-v0: + n_timesteps: !!float 1e5 + policy: 'MultiInputPolicy' + max_episode_length: 100 + buffer_size: 1000000 + batch_size: 1024 + gamma: 0.95 + learning_rate: !!float 1e-3 + tau: 0.05 + replay_buffer_class: HerReplayBuffer + replay_buffer_kwargs: "dict( + online_sampling=True, + goal_selection_strategy='future', + n_sampled_goal=4 + )" + policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" From ae706ebab4934bee86670df1421a3ce11fb145f6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 29 Apr 2021 11:52:54 +0200 Subject: [PATCH 06/62] Fix hyperparam --- hyperparams/tqc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 30bfcf340..42825524a 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -278,7 +278,6 @@ FetchPickAndPlace-v1: parking-v0: n_timesteps: !!float 1e5 policy: 'MultiInputPolicy' - max_episode_length: 100 buffer_size: 1000000 batch_size: 1024 
gamma: 0.95 @@ -288,6 +287,7 @@ parking-v0: replay_buffer_kwargs: "dict( online_sampling=True, goal_selection_strategy='future', - n_sampled_goal=4 + n_sampled_goal=4, + max_episode_length=100 )" policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" From 73bea6e6306932af14886862433a0e65f24dcdcb Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 4 May 2021 17:51:26 +0200 Subject: [PATCH 07/62] Removed unused callback --- CHANGELOG.md | 4 +++- hyperparams/a2c.yml | 1 - hyperparams/sac.yml | 22 ----------------- utils/callbacks.py | 55 ------------------------------------------ utils/wrappers.py | 58 --------------------------------------------- version.txt | 2 +- 6 files changed, 4 insertions(+), 138 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc50b79e..2d9937c6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ -## Release 1.1.0a5 (WIP) +## Release 1.1.0a6 (WIP) ### Breaking Changes +- Removed `PlotNoiseRatioCallback` +- Removed `PlotActionWrapper` ### New Features - Add support for recording videos of best models and checkpoints (@mcres) diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml index e2beecc74..8f08ba340 100644 --- a/hyperparams/a2c.yml +++ b/hyperparams/a2c.yml @@ -76,7 +76,6 @@ LunarLanderContinuous-v2: # Tuned MountainCarContinuous-v0: - # env_wrapper: utils.wrappers.PlotActionWrapper normalize: true n_envs: 4 n_steps: 100 diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index ddb609450..f7d386f3f 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -1,6 +1,5 @@ # Tuned MountainCarContinuous-v0: - # env_wrapper: utils.wrappers.PlotActionWrapper n_timesteps: !!float 50000 policy: 'MlpPolicy' learning_rate: !!float 3e-4 @@ -14,27 +13,6 @@ MountainCarContinuous-v0: learning_starts: 0 use_sde: True policy_kwargs: "dict(log_std_init=-3.67, net_arch=[64, 64])" -# Note: the hyperparams for HalfCheetah also work: -# MountainCarContinuous-v0: -# # env_wrapper: -# # - utils.wrappers.PlotActionWrapper: -# # plot_freq: 1 # Every 1 episode -# callback: -# - utils.callbacks.PlotNoiseRatioCallback: -# display_freq: 500 -# n_timesteps: !!float 50000 -# policy: 'MlpPolicy' -# learning_rate: !!float 7.3e-4 -# buffer_size: 300000 -# batch_size: 256 -# ent_coef: 'auto' -# gamma: 0.98 -# tau: 0.02 -# train_freq: 64 -# gradient_steps: 64 -# learning_starts: 0 -# use_sde: True -# policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" Pendulum-v0: n_timesteps: 20000 diff --git a/utils/callbacks.py b/utils/callbacks.py index cac30d294..60e8f0efa 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -80,58 +80,3 @@ def _on_step(self) -> bool: if self.verbose > 1: print(f"Saving VecNormalize to {path}") return True - - -class PlotNoiseRatioCallback(BaseCallback): - """ - Callback for plotting noise contribution to the exploration. - Warning: it only works with 1D action space env for now (like MountainCarContinuous) - - :param display_freq: (int) Display the plot every ``display_freq`` steps. 
- :param verbose: (int) - """ - - def __init__(self, display_freq: int = 1000, verbose: int = 0): - super(PlotNoiseRatioCallback, self).__init__(verbose) - self.display_freq = display_freq - # Action buffers - self.deterministic_actions = [] - self.noisy_actions = [] - self.noises = [] - - def _on_step(self) -> bool: - # We assume this is a DummyVecEnv - assert isinstance(self.training_env, DummyVecEnv) - # Retrieve last observation - obs = self.training_env._obs_from_buf() - # Retrieve stochastic and deterministic action - # we can extract the noise contribution from those two - noisy_action = self.model.predict(obs, deterministic=False)[0].flatten() - deterministic_action = self.model.predict(obs, deterministic=True)[0].flatten() - noise = noisy_action - deterministic_action - - self.deterministic_actions.append(deterministic_action) - self.noisy_actions.append(noisy_action) - self.noises.append(noise) - - if self.n_calls % self.display_freq == 0: - x = np.arange(len(self.noisy_actions)) - - self.deterministic_actions = np.array(self.deterministic_actions) - self.noises = np.array(self.noises) - - plt.figure("Deterministic action and noise during exploration", figsize=(6.4, 4.8)) - # plt.title('Deterministic action and noise during exploration', fontsize=14) - plt.xlabel("Timesteps", fontsize=14) - plt.xticks(fontsize=13) - plt.ylabel("Action", fontsize=14) - plt.plot(x, self.deterministic_actions, label="deterministic action", linewidth=2) - plt.plot(x, self.noises, label="exploration noise", linewidth=2) - plt.plot(x, self.noisy_actions, label="noisy action", linewidth=2) - plt.legend(fontsize=13) - plt.show() - # Reset - self.noisy_actions = [] - self.deterministic_actions = [] - self.noises = [] - return True diff --git a/utils/wrappers.py b/utils/wrappers.py index 4b3aa77eb..2379aa266 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -306,61 +306,3 @@ def step(self, action): obs_dict["observation"] = self._create_obs_from_history() return obs_dict, reward, done, info - - -class PlotActionWrapper(gym.Wrapper): - """ - Wrapper for plotting the taken actions. - Only works with 1D actions for now. - Optionally, it can be used to plot the observations too. 
- - :param env: (gym.Env) - :param plot_freq: (int) Plot every `plot_freq` episodes - """ - - def __init__(self, env, plot_freq=5): - super(PlotActionWrapper, self).__init__(env) - self.plot_freq = plot_freq - self.current_episode = 0 - # Observation buffer (Optional) - # self.observations = [] - # Action buffer - self.actions = [] - - def reset(self): - self.current_episode += 1 - if self.current_episode % self.plot_freq == 0: - self.plot() - # Reset - self.actions = [] - obs = self.env.reset() - self.actions.append([]) - # self.observations.append(obs) - return obs - - def step(self, action): - obs, reward, done, info = self.env.step(action) - - self.actions[-1].append(action) - # self.observations.append(obs) - - return obs, reward, done, info - - def plot(self): - actions = self.actions - x = np.arange(sum([len(episode) for episode in actions])) - plt.figure("Actions") - plt.title("Actions during exploration", fontsize=14) - plt.xlabel("Timesteps", fontsize=14) - plt.ylabel("Action", fontsize=14) - - start = 0 - for i in range(len(self.actions)): - end = start + len(self.actions[i]) - plt.plot(x[start:end], self.actions[i]) - # Clipped actions: real behavior, note that it is between [-2, 2] for the Pendulum - # plt.scatter(x[start:end], np.clip(self.actions[i], -1, 1), s=1) - # plt.scatter(x[start:end], self.actions[i], s=1) - start = end - - plt.show() diff --git a/version.txt b/version.txt index c84ce1899..1406d2fc7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.1.0a5 +1.1.0a6 From 7e96e108cfcf0fb516816f0534d4e9b44027d020 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 4 May 2021 19:00:14 +0200 Subject: [PATCH 08/62] Update CI --- .github/workflows/ci.yml | 4 ++-- .github/workflows/trained_agents.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2564a4725..245058ce2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7] # 3.8 not supported yet by pytype + python-version: [3.6, 3.7, 3.8] steps: - uses: actions/checkout@v2 @@ -30,7 +30,7 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html # temp fix: use pybullet 3.0.8 (issue with numpy for 3.0.9) pip install pybullet==3.0.8 pip install -r requirements.txt diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml index 98cdced99..55d127281 100644 --- a/.github/workflows/trained_agents.yml +++ b/.github/workflows/trained_agents.yml @@ -29,7 +29,7 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html # temp fix: use pybullet 3.0.8 (issue with numpy for 3.0.9) pip install pybullet==3.0.8 pip install -r requirements.txt From 7c4f1bc4093c17765d1ef8f1c8b34c5453ccdb39 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 4 May 2021 19:01:16 +0200 Subject: [PATCH 09/62] Add partial support for parallel training --- hyperparams/sac.yml | 2 ++ utils/callbacks.py | 63 ++++++++++++++++++++++++++++++++++++++++++--- utils/wrappers.py | 1 - 3 files changed, 62 insertions(+), 4 deletions(-) 
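
The callback added below runs the gradient updates in a background thread, so that rollout collection and training happen in parallel; this matters when stepping the simulator is slow (e.g. the Space Engineers env used later in this series). A minimal usage sketch outside the zoo, mirroring the "callback:" entry added to Pendulum-v0 in this patch; the utils.callbacks import assumes the script is run from the repository root, and the hyperparameters are illustrative.

import gym

from stable_baselines3 import SAC

from utils.callbacks import ParallelTrainCallback  # added by this patch

env = gym.make("Pendulum-v0")
# Collect one full episode per rollout; gradient_steps=-1 on the model is
# effectively overridden by the callback, which performs the real updates
# asynchronously on its own copy of the model and syncs the weights back
# at the end of every rollout (see _on_rollout_end() in the diff below).
model = SAC("MlpPolicy", env, train_freq=(1, "episode"), gradient_steps=-1, verbose=1)

model.learn(total_timesteps=20_000, callback=ParallelTrainCallback(gradient_steps=100))
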
diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index f7d386f3f..aa4afd497 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -15,6 +15,8 @@ MountainCarContinuous-v0: policy_kwargs: "dict(log_std_init=-3.67, net_arch=[64, 64])" Pendulum-v0: + callback: + - utils.callbacks.ParallelTrainCallback n_timesteps: 20000 policy: 'MlpPolicy' learning_rate: !!float 1e-3 diff --git a/utils/callbacks.py b/utils/callbacks.py index 60e8f0efa..693ecbad1 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -1,11 +1,14 @@ import os +import tempfile +from copy import deepcopy +from threading import Thread from typing import Optional -import numpy as np import optuna -from matplotlib import pyplot as plt +from sb3_contrib import TQC +from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback, EvalCallback -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv +from stable_baselines3.common.vec_env import VecEnv class TrialEvalCallback(EvalCallback): @@ -80,3 +83,57 @@ def _on_step(self) -> bool: if self.verbose > 1: print(f"Saving VecNormalize to {path}") return True + + +class ParallelTrainCallback(BaseCallback): + def __init__(self, gradient_steps: int = 100, verbose: int = 0): + super(ParallelTrainCallback, self).__init__(verbose) + self.gradient_steps = 0 + self.batch_size = 0 + self._model_ready = True + self._model = None + self.gradient_steps = gradient_steps + self.process = None + self.model_class = None + + def _init_callback(self) -> None: + temp_file = tempfile.TemporaryFile() + self.model.save(temp_file) + # TODO: add support for other algorithms + for model_class in [SAC, TQC]: + if isinstance(self.model, model_class): + self.model_class = model_class + break + + assert self.model_class is not None + self._model = self.model_class.load(temp_file) + + self.batch_size = self._model.batch_size + # TODO: update SB3 and check train freq instead + # of gradient_steps > 0 + self.model.gradient_steps = 1 + self.model.tau = 0.0 + self.model.learning_rate = 0.0 + self.model.batch_size = 1 + + def train(self) -> None: + self._model_ready = False + self.process = Thread(target=self._train_thread, daemon=True) + self.process.start() + + def _train_thread(self) -> None: + self._model.train(gradient_steps=self.gradient_steps, batch_size=self.batch_size) + self._model_ready = True + self.logger.record("train/n_updates_real", self._model._n_updates, exclude="tensorboard") + + def _on_step(self) -> bool: + return True + + def _on_rollout_end(self) -> None: + if self._model_ready: + self._model.replay_buffer = deepcopy(self.model.replay_buffer) + self.model.set_parameters(self._model.get_parameters()) + self.model.actor = self.model.policy.actor + self.train() + # Do not wait for the training loop to finish + # self.process.join() diff --git a/utils/wrappers.py b/utils/wrappers.py index 2379aa266..9cdaf783f 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -1,6 +1,5 @@ import gym import numpy as np -from matplotlib import pyplot as plt from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility) from scipy.signal import iirfilter, sosfilt, zpk2sos From b635157463181d9ceb5bbacd0aa820f4ead17b7c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 5 May 2021 10:07:13 +0200 Subject: [PATCH 10/62] Cleanup --- utils/callbacks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/callbacks.py b/utils/callbacks.py index 693ecbad1..d025d0ffe 100644 --- a/utils/callbacks.py +++ 
b/utils/callbacks.py @@ -88,7 +88,6 @@ def _on_step(self) -> bool: class ParallelTrainCallback(BaseCallback): def __init__(self, gradient_steps: int = 100, verbose: int = 0): super(ParallelTrainCallback, self).__init__(verbose) - self.gradient_steps = 0 self.batch_size = 0 self._model_ready = True self._model = None @@ -105,7 +104,7 @@ def _init_callback(self) -> None: self.model_class = model_class break - assert self.model_class is not None + assert self.model_class is not None, f"{self.model} is not supported for parallel training" self._model = self.model_class.load(temp_file) self.batch_size = self._model.batch_size From 296668950627b15cfe5320d4be39a82c0affb22c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 5 May 2021 19:18:58 +0200 Subject: [PATCH 11/62] Avoid modify by reference + add sleep time --- utils/callbacks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/utils/callbacks.py b/utils/callbacks.py index d025d0ffe..08a869d3c 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -1,5 +1,6 @@ import os import tempfile +import time from copy import deepcopy from threading import Thread from typing import Optional @@ -86,7 +87,7 @@ def _on_step(self) -> bool: class ParallelTrainCallback(BaseCallback): - def __init__(self, gradient_steps: int = 100, verbose: int = 0): + def __init__(self, gradient_steps: int = 100, verbose: int = 0, sleep_time: float = 0.0): super(ParallelTrainCallback, self).__init__(verbose) self.batch_size = 0 self._model_ready = True @@ -94,6 +95,7 @@ def __init__(self, gradient_steps: int = 100, verbose: int = 0): self.gradient_steps = gradient_steps self.process = None self.model_class = None + self.sleep_time = sleep_time def _init_callback(self) -> None: temp_file = tempfile.TemporaryFile() @@ -126,12 +128,14 @@ def _train_thread(self) -> None: self.logger.record("train/n_updates_real", self._model._n_updates, exclude="tensorboard") def _on_step(self) -> bool: + if self.sleep_time > 0: + time.sleep(self.sleep_time) return True def _on_rollout_end(self) -> None: if self._model_ready: self._model.replay_buffer = deepcopy(self.model.replay_buffer) - self.model.set_parameters(self._model.get_parameters()) + self.model.set_parameters(deepcopy(self._model.get_parameters())) self.model.actor = self.model.policy.actor self.train() # Do not wait for the training loop to finish From aa6d934a42fbd9ff613a2a0b8338f940a5c4af32 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 6 May 2021 14:40:55 +0200 Subject: [PATCH 12/62] Take learning starts into account --- utils/callbacks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/callbacks.py b/utils/callbacks.py index 08a869d3c..f49123a4c 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -137,6 +137,7 @@ def _on_rollout_end(self) -> None: self._model.replay_buffer = deepcopy(self.model.replay_buffer) self.model.set_parameters(deepcopy(self._model.get_parameters())) self.model.actor = self.model.policy.actor - self.train() + if self.num_timesteps >= self._model.learning_starts: + self.train() # Do not wait for the training loop to finish # self.process.join() From 563d853026b357153696e9d99cb28ec8a44f1b59 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 11 May 2021 13:28:33 +0200 Subject: [PATCH 13/62] Update hyperparams --- hyperparams/tqc.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 66acf6bad..44c05759f 100644 --- a/hyperparams/tqc.yml +++ 
b/hyperparams/tqc.yml @@ -208,13 +208,13 @@ InvertedPendulumSwingupBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" # Space Engineers envs - SpaceEngineers-WalkingRobot-IK-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 - - utils.wrappers.TimeFeatureWrapper: - test_mode: False + callback: + - utils.callbacks.ParallelTrainCallback: + gradient_steps: 256 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -230,5 +230,5 @@ SpaceEngineers-WalkingRobot-IK-v0: use_sde: True sde_sample_freq: 4 # TODO: try with different value: - # top_quantiles_to_drop_per_net: 5 - policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], n_critics=2)" + top_quantiles_to_drop_per_net: 5 + policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=1)" From 779c4c6ba974e2800f24c604c868c98390385f25 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 11 May 2021 15:36:52 +0200 Subject: [PATCH 14/62] Add dict obs support --- CHANGELOG.md | 7 ++++++- hyperparams/ddpg.yml | 8 -------- hyperparams/sac.yml | 9 --------- hyperparams/td3.yml | 9 --------- hyperparams/tqc.yml | 8 -------- requirements.txt | 4 ++-- scripts/build_docker.sh | 2 +- setup.cfg | 2 ++ tests/test_enjoy.py | 4 ++++ tests/test_hyperparams_opt.py | 2 +- utils/exp_manager.py | 1 + utils/utils.py | 1 - version.txt | 2 +- 13 files changed, 18 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc50b79e..6c6a988f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,15 @@ -## Release 1.1.0a5 (WIP) +## Release 1.1.0a6 (WIP) ### Breaking Changes +- Upgrade to SB3 >= 1.1.0a6 +- Upgrade to sb3-contrib >= 1.1.0a6 +- Add timeout handling (cf SB3 doc) +- `HER` is now a replay buffer class and no more an algorithm ### New Features - Add support for recording videos of best models and checkpoints (@mcres) - Add support for recording videos of training experiments (@mcres) +- Add support for dictionary observations ### Bug fixes - Fixed video rendering for PyBullet envs on Linux diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml index 90cab3d00..a0b111af3 100644 --- a/hyperparams/ddpg.yml +++ b/hyperparams/ddpg.yml @@ -60,7 +60,6 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -75,7 +74,6 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -90,7 +88,6 @@ AntBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -106,7 +103,6 @@ HopperBulletEnv-v0: # Tuned Walker2DBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -123,7 +119,6 @@ Walker2DBulletEnv-v0: # TO BE tested HumanoidBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 2e6 policy: 'MlpPolicy' gamma: 0.98 @@ -138,7 +133,6 @@ HumanoidBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -152,7 +146,6 @@ ReacherBulletEnv-v0: # To be tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -167,7 
+160,6 @@ InvertedDoublePendulumBulletEnv-v0: # To be tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 5cee64f1a..05a4d54a3 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -120,7 +120,6 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -136,7 +135,6 @@ AntBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" HopperBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -152,7 +150,6 @@ HopperBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" Walker2DBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -170,7 +167,6 @@ Walker2DBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -199,7 +195,6 @@ HumanoidBulletEnv-v0: # Tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 5e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -216,7 +211,6 @@ InvertedDoublePendulumBulletEnv-v0: # Tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -314,7 +308,6 @@ donkey-generated-track-v0: max_episode_steps: 500 - utils.wrappers.HistoryWrapper: horizon: 5 - - sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -338,8 +331,6 @@ NeckEnvRelative-v2: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 - - sb3_contrib.common.wrappers.TimeFeatureWrapper: - test_mode: False # - utils.wrappers.LowPassFilterWrapper: # freq: 2.0 # df: 25.0 diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml index 1c067a162..23802f6e1 100644 --- a/hyperparams/td3.yml +++ b/hyperparams/td3.yml @@ -60,7 +60,6 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -74,7 +73,6 @@ HalfCheetahBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" AntBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -88,7 +86,6 @@ AntBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" HopperBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -102,7 +99,6 @@ HopperBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" Walker2DBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -118,7 +114,6 @@ Walker2DBulletEnv-v0: # TO BE tested HumanoidBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 2e6 policy: 'MlpPolicy' gamma: 0.98 @@ -132,7 +127,6 @@ HumanoidBulletEnv-v0: # Tuned ReacherBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: 
!!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -147,7 +141,6 @@ ReacherBulletEnv-v0: # Tuned InvertedDoublePendulumBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.98 @@ -162,7 +155,6 @@ InvertedDoublePendulumBulletEnv-v0: # Tuned InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' gamma: 0.98 @@ -176,7 +168,6 @@ InvertedPendulumSwingupBulletEnv-v0: policy_kwargs: "dict(net_arch=[400, 300])" MinitaurBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' gamma: 0.99 diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 42825524a..80135b84e 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -72,7 +72,6 @@ BipedalWalkerHardcore-v3: # Tuned HalfCheetahBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -89,7 +88,6 @@ HalfCheetahBulletEnv-v0: # Tuned AntBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -106,7 +104,6 @@ AntBulletEnv-v0: # Tuned HopperBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -124,7 +121,6 @@ HopperBulletEnv-v0: # Tuned Walker2DBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e6 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -141,7 +137,6 @@ Walker2DBulletEnv-v0: ReacherBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -159,7 +154,6 @@ ReacherBulletEnv-v0: # Almost tuned HumanoidBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 1e7 policy: 'MlpPolicy' learning_rate: lin_7.3e-4 @@ -176,7 +170,6 @@ HumanoidBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" InvertedDoublePendulumBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 5e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -192,7 +185,6 @@ InvertedDoublePendulumBulletEnv-v0: policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])" InvertedPendulumSwingupBulletEnv-v0: - env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper n_timesteps: !!float 3e5 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 diff --git a/requirements.txt b/requirements.txt index 5f21bab68..9200a569c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -stable-baselines3[extra,tests,docs]>=1.0 +stable-baselines3[extra,tests,docs]>=1.1.0a6 box2d-py==2.3.8 pybullet gym-minigrid @@ -7,4 +7,4 @@ optuna pytablewriter seaborn pyyaml>=5.1 -sb3-contrib>=1.0 +sb3-contrib>=1.1.0a6 diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index ee5e9a234..fb1e7d07e 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -3,7 +3,7 @@ PARENT=stablebaselines/stable-baselines3 TAG=stablebaselines/rl-baselines3-zoo -VERSION=1.1.0a5 +VERSION=1.1.0a6 if [[ ${USE_GPU} == "True" ]]; then PARENT="${PARENT}:${VERSION}" diff --git a/setup.cfg b/setup.cfg index 09b9d88e0..c74b8320e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,7 @@ [tool:pytest] filterwarnings = + # Tensorboard warnings + 
ignore::DeprecationWarning:tensorboard # Gym warnings ignore::UserWarning:gym markers = diff --git a/tests/test_enjoy.py b/tests/test_enjoy.py index 48de16374..658f5d52a 100644 --- a/tests/test_enjoy.py +++ b/tests/test_enjoy.py @@ -22,6 +22,10 @@ def test_trained_agents(trained_model): algo, env_id = trained_models[trained_model] args = ["-n", str(N_STEPS), "-f", FOLDER, "--algo", algo, "--env", env_id, "--no-render"] + # Since SB3 >= 1.1.0, HER is no more an algorithm but a replay buffer class + if algo == "her": + return + # Skip mujoco envs if "Fetch" in trained_model: return diff --git a/tests/test_hyperparams_opt.py b/tests/test_hyperparams_opt.py index 6c5f0b016..57230d740 100644 --- a/tests/test_hyperparams_opt.py +++ b/tests/test_hyperparams_opt.py @@ -25,7 +25,7 @@ def _assert_eq(left, right): # Test for TD3 experiments["td3-Pendulum-v0"] = ("td3", "Pendulum-v0") # Test for HER -experiments["her-parking-v0"] = ("her", "parking-v0") +experiments["tqc-parking-v0"] = ("tqc", "parking-v0") # Test for TQC experiments["tqc-Pendulum-v0"] = ("tqc", "Pendulum-v0") diff --git a/utils/exp_manager.py b/utils/exp_manager.py index f3fa25a6a..aafa59010 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -13,6 +13,7 @@ from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler + # For using HER with GoalEnv from stable_baselines3 import HerReplayBuffer # noqa: F401 from stable_baselines3.common.base_class import BaseAlgorithm diff --git a/utils/utils.py b/utils/utils.py index 1a2f7f10e..c25cd72fb 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -23,7 +23,6 @@ "ddpg": DDPG, "dqn": DQN, "ppo": PPO, - # "her": HER, "sac": SAC, "td3": TD3, # SB3 Contrib, diff --git a/version.txt b/version.txt index c84ce1899..1406d2fc7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.1.0a5 +1.1.0a6 From 43188ad0db4b009fe722b535efebcf10c4822ef9 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 May 2021 10:04:37 +0200 Subject: [PATCH 15/62] Update test env --- tests/test_wrappers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 3eb651ecf..5ec00c0f4 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -8,7 +8,7 @@ def test_wrappers(): - env = gym.make("HalfCheetahBulletEnv-v0") + env = gym.make("AntBulletEnv-v0") env = DelayedRewardWrapper(env) env = ActionNoiseWrapper(env) env = HistoryWrapper(env) @@ -25,7 +25,7 @@ def test_wrappers(): ], ) def test_get_wrapper(env_wrapper): - env = gym.make("HalfCheetahBulletEnv-v0") + env = gym.make("AntBulletEnv-v0") hyperparams = {"env_wrapper": env_wrapper} wrapper_class = get_wrapper_class(hyperparams) if env_wrapper is not None: From b461048e280659909b535f4ba7b0d6438d297324 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 12 May 2021 11:50:29 +0200 Subject: [PATCH 16/62] Version bump --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9200a569c..969f7c07e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -stable-baselines3[extra,tests,docs]>=1.1.0a6 +stable-baselines3[extra,tests,docs]>=1.1.0a7 box2d-py==2.3.8 pybullet gym-minigrid @@ -7,4 +7,4 @@ optuna pytablewriter seaborn pyyaml>=5.1 -sb3-contrib>=1.1.0a6 +sb3-contrib>=1.1.0a7 From f73a65e5743f6848f0a86a66dec9fc00cd75ee2d Mon Sep 17 00:00:00 2001 From: Antonin 
Raffin Date: Fri, 14 May 2021 16:57:01 +0200 Subject: [PATCH 17/62] Update for symmetric control + catch zmq error --- hyperparams/tqc.yml | 16 ++++++++-------- utils/exp_manager.py | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 4fd678785..3046d412f 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -291,21 +291,21 @@ SpaceEngineers-WalkingRobot-IK-v0: horizon: 2 callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 256 + gradient_steps: 50 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 - buffer_size: 100000 + buffer_size: 300000 batch_size: 256 ent_coef: 'auto' - gamma: 0.99 + gamma: 0.98 tau: 0.01 train_freq: [1, "episode"] gradient_steps: -1 - learning_starts: 1000 - use_sde_at_warmup: True - use_sde: True - sde_sample_freq: 4 + learning_starts: 500 + # use_sde_at_warmup: True + use_sde: False + # sde_sample_freq: 4 # TODO: try with different value: - top_quantiles_to_drop_per_net: 5 + # top_quantiles_to_drop_per_net: 5 policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=1)" diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 7a11778ca..b23fe826e 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -10,6 +10,7 @@ import numpy as np import optuna import yaml +import zmq from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler @@ -183,7 +184,7 @@ def learn(self, model: BaseAlgorithm) -> None: try: model.learn(self.n_timesteps, **kwargs) - except KeyboardInterrupt: + except (KeyboardInterrupt, zmq.error.ZMQError): # this allows to save the model when interrupting training pass finally: From da441b83247e6bbf16740b27c213f389d99a1bbe Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sat, 15 May 2021 23:59:50 +0200 Subject: [PATCH 18/62] Save best model --- hyperparams/tqc.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 3046d412f..4e457028e 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -291,7 +291,7 @@ SpaceEngineers-WalkingRobot-IK-v0: horizon: 2 callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 50 + gradient_steps: 100 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -303,9 +303,9 @@ SpaceEngineers-WalkingRobot-IK-v0: train_freq: [1, "episode"] gradient_steps: -1 learning_starts: 500 - # use_sde_at_warmup: True - use_sde: False - # sde_sample_freq: 4 + use_sde_at_warmup: True + use_sde: True + sde_sample_freq: 4 # TODO: try with different value: # top_quantiles_to_drop_per_net: 5 - policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=1)" + policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" From bf05e92551a59eee815027823992e08fad4c3d50 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sun, 16 May 2021 15:57:25 +0200 Subject: [PATCH 19/62] Fix parallel save (maybe issue with optimizer) --- utils/callbacks.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/utils/callbacks.py b/utils/callbacks.py index 793ec5ae6..ca5914542 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -2,6 +2,7 @@ import tempfile import time from copy import deepcopy +from functools import wraps from threading import Thread from typing import Optional @@ -127,15 +128,28 @@ def _init_callback(self) -> None: self._model 
= self.model_class.load(temp_file) self.batch_size = self._model.batch_size + # TODO: update SB3 and check train freq instead # of gradient_steps > 0 self.model.gradient_steps = 1 self.model.tau = 0.0 self.model.learning_rate = 0.0 + self.model.lr_schedule = lambda _: 0.0 self.model.batch_size = 1 + # Hack: Re-add correct values at save time + def patch_save(function): + @wraps(function) + def wrapper(*args, **kwargs): + self._model.save(*args, **kwargs) + + return wrapper + + self.model.save = patch_save(self.model.save) + def train(self) -> None: self._model_ready = False + self.process = Thread(target=self._train_thread, daemon=True) self.process.start() From a9d63db830de24c6df45bd438b056cbc36ab2dc0 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sun, 16 May 2021 19:12:47 +0200 Subject: [PATCH 20/62] Update hyperparams --- hyperparams/sac.yml | 1 + hyperparams/tqc.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 312b9afc7..a67aca9d7 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -21,6 +21,7 @@ Pendulum-v0: policy: 'MlpPolicy' learning_rate: !!float 1e-3 use_sde: True + sde_sample_freq: 4 train_freq: [1, "episode"] gradient_steps: -1 policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])" diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 4e457028e..974421cfd 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -291,7 +291,7 @@ SpaceEngineers-WalkingRobot-IK-v0: horizon: 2 callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 100 + gradient_steps: 200 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -302,7 +302,7 @@ SpaceEngineers-WalkingRobot-IK-v0: tau: 0.01 train_freq: [1, "episode"] gradient_steps: -1 - learning_starts: 500 + learning_starts: 400 use_sde_at_warmup: True use_sde: True sde_sample_freq: 4 From d3951205e47afb52ccebb12f288a935e4750e44e Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 17 May 2021 19:38:13 +0200 Subject: [PATCH 21/62] Update best params --- hyperparams/tqc.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 974421cfd..490add50e 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -299,13 +299,12 @@ SpaceEngineers-WalkingRobot-IK-v0: batch_size: 256 ent_coef: 'auto' gamma: 0.98 - tau: 0.01 + tau: 0.02 train_freq: [1, "episode"] gradient_steps: -1 learning_starts: 400 use_sde_at_warmup: True use_sde: True sde_sample_freq: 4 - # TODO: try with different value: - # top_quantiles_to_drop_per_net: 5 + top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" From 563412826ade58bac2bed981403d16f6bfac47c8 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 19 May 2021 13:11:14 +0200 Subject: [PATCH 22/62] Update hyperparams --- hyperparams/tqc.yml | 2 +- utils/callbacks.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 490add50e..ca9265802 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -291,7 +291,7 @@ SpaceEngineers-WalkingRobot-IK-v0: horizon: 2 callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 200 + gradient_steps: 400 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 diff --git a/utils/callbacks.py b/utils/callbacks.py index ca5914542..b4a664ee0 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -136,6 +136,9 @@ def 
_init_callback(self) -> None: self.model.learning_rate = 0.0 self.model.lr_schedule = lambda _: 0.0 self.model.batch_size = 1 + # Deactivate logger (TODO(toni): fix this when configuring logger works) + self.model.log_interval = 1000000 + # TODO: change learning starts when using gSDE # Hack: Re-add correct values at save time def patch_save(function): From c5b7f55ae12cf7070208ea44381a0f26f450c91f Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 19 May 2021 14:20:20 +0200 Subject: [PATCH 23/62] Prepare big network experiment --- hyperparams/tqc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index ca9265802..d9532a992 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -302,9 +302,9 @@ SpaceEngineers-WalkingRobot-IK-v0: tau: 0.02 train_freq: [1, "episode"] gradient_steps: -1 - learning_starts: 400 + learning_starts: 1200 use_sde_at_warmup: True use_sde: True sde_sample_freq: 4 top_quantiles_to_drop_per_net: 2 - policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" + policy_kwargs: "dict(log_std_init=-3, net_arch=[1024, 1024], n_critics=2)" From 976833dd419b4d27cf1d290021745f2a2a8667e6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 19 May 2021 18:04:14 +0200 Subject: [PATCH 24/62] Revert to normal net --- hyperparams/tqc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index d9532a992..84dd8989b 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -307,4 +307,4 @@ SpaceEngineers-WalkingRobot-IK-v0: use_sde: True sde_sample_freq: 4 top_quantiles_to_drop_per_net: 2 - policy_kwargs: "dict(log_std_init=-3, net_arch=[1024, 1024], n_critics=2)" + policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" From c6b2dce2086ea0e95283af7ed45cb8fd2c432b6c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 19 May 2021 18:19:07 +0200 Subject: [PATCH 25/62] Add exception for windows --- utils/callbacks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utils/callbacks.py b/utils/callbacks.py index b4a664ee0..0865711e0 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -117,7 +117,14 @@ def __init__(self, gradient_steps: int = 100, verbose: int = 0, sleep_time: floa def _init_callback(self) -> None: temp_file = tempfile.TemporaryFile() + + # Windows TemporaryFile is not a io Buffer + # we save the model in the logs/ folder + if os.name == "nt": + temp_file = os.path.join("logs", "model_tmp.zip") + self.model.save(temp_file) + # TODO: add support for other algorithms for model_class in [SAC, TQC]: if isinstance(self.model, model_class): From 67033907c68be85c713eed3fd0776cf5e9631394 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 19 May 2021 18:43:19 +0200 Subject: [PATCH 26/62] Update plot script: allow multiple envs --- scripts/plot_train.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/plot_train.py b/scripts/plot_train.py index 3dea2b20d..ca4093242 100644 --- a/scripts/plot_train.py +++ b/scripts/plot_train.py @@ -15,7 +15,7 @@ parser = argparse.ArgumentParser("Gather results, plot training reward/success") parser.add_argument("-a", "--algo", help="Algorithm to include", type=str, required=True) -parser.add_argument("-e", "--env", help="Environment to include", type=str, required=True) +parser.add_argument("-e", "--env", help="Environment(s) to include", nargs="+", type=str, required=True) parser.add_argument("-f", 
"--exp-folder", help="Folders to include", type=str, required=True) parser.add_argument("--figsize", help="Figure size, width, height in inches.", nargs=2, type=int, default=[6.4, 4.8]) parser.add_argument("--fontsize", help="Font size", type=int, default=14) @@ -28,7 +28,7 @@ algo = args.algo -env = args.env +envs = args.env log_path = os.path.join(args.exp_folder, algo) x_axis = {"steps": X_TIMESTEPS, "episodes": X_EPISODES, "time": X_WALLTIME}[args.x_axis] @@ -37,11 +37,16 @@ y_axis = {"success": "is_success", "reward": "r"}[args.y_axis] y_label = {"success": "Training Success Rate", "reward": "Training Episodic Reward"}[args.y_axis] -dirs = [ - os.path.join(log_path, folder) - for folder in os.listdir(log_path) - if (env in folder and os.path.isdir(os.path.join(log_path, folder))) -] +dirs = [] + +for env in envs: + dirs.extend( + [ + os.path.join(log_path, folder) + for folder in os.listdir(log_path) + if (env in folder and os.path.isdir(os.path.join(log_path, folder))) + ] + ) plt.figure(y_label, figsize=args.figsize) plt.title(y_label, fontsize=args.fontsize) From a771249e8182bb586f6cf5ea2ee7f350dc7fee00 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 21 May 2021 11:37:19 +0200 Subject: [PATCH 27/62] Add bert params --- hyperparams/tqc.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 84dd8989b..864be6a9c 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -308,3 +308,29 @@ SpaceEngineers-WalkingRobot-IK-v0: sde_sample_freq: 4 top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" + +# ======== Real Robot envs ============ + +WalkingBertSim-v1: + env_wrapper: + - utils.wrappers.HistoryWrapper: + horizon: 2 + callback: + - utils.callbacks.ParallelTrainCallback: + gradient_steps: 400 + n_timesteps: !!float 2e6 + policy: 'MlpPolicy' + learning_rate: !!float 7.3e-4 + buffer_size: 300000 + batch_size: 256 + ent_coef: 'auto' + gamma: 0.98 + tau: 0.02 + train_freq: [1, "episode"] + gradient_steps: -1 + learning_starts: 1200 + use_sde_at_warmup: True + use_sde: True + sde_sample_freq: 4 + top_quantiles_to_drop_per_net: 2 + policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" From c9663716c8d1829cc225a200ce69da28e46db49b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 26 May 2021 13:42:40 +0200 Subject: [PATCH 28/62] Save multirobot hyperparams --- hyperparams/ppo.yml | 27 +++++++++++++++++++++++++++ hyperparams/tqc.yml | 10 ++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index c85a9825e..23acff97c 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -383,3 +383,30 @@ CarRacing-v0: policy_kwargs: "dict(log_std_init=-2, ortho_init=False, )" + +SpaceEngineers-WalkingRobot-IK-v0: + env_wrapper: + - utils.wrappers.HistoryWrapper: + horizon: 2 + - sb3_contrib.common.wrappers.TimeFeatureWrapper + normalize: true + n_envs: 4 + n_timesteps: !!float 2e6 + policy: 'MlpPolicy' + batch_size: 128 + n_steps: 256 + gamma: 0.99 + gae_lambda: 0.9 + n_epochs: 20 + ent_coef: 0.0 + sde_sample_freq: 4 + max_grad_norm: 0.5 + vf_coef: 0.5 + learning_rate: !!float 3e-5 + use_sde: True + clip_range: 0.4 + policy_kwargs: "dict(log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 864be6a9c..938d50609 100644 --- a/hyperparams/tqc.yml +++ 
b/hyperparams/tqc.yml @@ -291,18 +291,20 @@ SpaceEngineers-WalkingRobot-IK-v0: horizon: 2 callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 400 + gradient_steps: 200 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 - buffer_size: 300000 + buffer_size: 40000 batch_size: 256 ent_coef: 'auto' gamma: 0.98 tau: 0.02 - train_freq: [1, "episode"] + # train_freq: [1, "episode"] + train_freq: 100 + n_envs: 4 gradient_steps: -1 - learning_starts: 1200 + learning_starts: 800 use_sde_at_warmup: True use_sde: True sde_sample_freq: 4 From a99f6916d2e5b0793154f8836e3c28cdf6de1271 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sun, 30 May 2021 13:10:23 +0200 Subject: [PATCH 29/62] Add POC for VecEnvWrapper --- utils/exp_manager.py | 14 +++++++++++--- utils/utils.py | 11 ++++++++--- utils/wrappers.py | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index b23fe826e..1218c204d 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -87,6 +87,7 @@ def __init__( self.env_wrapper = None self.frame_stack = None self.seed = seed + self.vec_env_wrapper = None self.vec_env_class = {"dummy": DummyVecEnv, "subproc": SubprocVecEnv}[vec_env_type] @@ -144,7 +145,7 @@ def setup_experiment(self) -> Optional[BaseAlgorithm]: :return: the initialized RL model """ hyperparams, saved_hyperparams = self.read_hyperparameters() - hyperparams, self.env_wrapper, self.callbacks = self._preprocess_hyperparams(hyperparams) + hyperparams, self.env_wrapper, self.callbacks, self.vec_env_wrapper = self._preprocess_hyperparams(hyperparams) self.create_log_folder() self.create_callbacks() @@ -293,7 +294,7 @@ def _preprocess_normalization(self, hyperparams: Dict[str, Any]) -> Dict[str, An def _preprocess_hyperparams( self, hyperparams: Dict[str, Any] - ) -> Tuple[Dict[str, Any], Optional[Callable], List[BaseCallback]]: + ) -> Tuple[Dict[str, Any], Optional[Callable], List[BaseCallback], Optional[Callable]]: self.n_envs = hyperparams.get("n_envs", 1) if self.verbose > 0: @@ -337,11 +338,15 @@ def _preprocess_hyperparams( if "env_wrapper" in hyperparams.keys(): del hyperparams["env_wrapper"] + vec_env_wrapper = get_wrapper_class(hyperparams, "vec_env_wrapper") + if "vec_env_wrapper" in hyperparams.keys(): + del hyperparams["vec_env_wrapper"] + callbacks = get_callback_list(hyperparams) if "callback" in hyperparams.keys(): del hyperparams["callback"] - return hyperparams, env_wrapper, callbacks + return hyperparams, env_wrapper, callbacks, vec_env_wrapper def _preprocess_action_noise( self, hyperparams: Dict[str, Any], saved_hyperparams: Dict[str, Any], env: VecEnv @@ -495,6 +500,9 @@ def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) monitor_kwargs=monitor_kwargs, ) + if self.vec_env_wrapper is not None: + env = self.vec_env_wrapper(env) + # Wrap the env into a VecNormalize wrapper if needed # and load saved statistics when present env = self._maybe_normalize(env, eval_env) diff --git a/utils/utils.py b/utils/utils.py index c25cd72fb..ac151f2da 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -40,7 +40,7 @@ def flatten_dict_observations(env: gym.Env) -> gym.Env: return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys)) -def get_wrapper_class(hyperparams: Dict[str, Any]) -> Optional[Callable[[gym.Env], gym.Env]]: +def get_wrapper_class(hyperparams: Dict[str, Any], key: str = "env_wrapper") -> Optional[Callable[[gym.Env], gym.Env]]: """ Get one 
or more Gym environment wrapper class specified as a hyper parameter "env_wrapper". @@ -65,8 +65,8 @@ def get_module_name(wrapper_name): def get_class_name(wrapper_name): return wrapper_name.split(".")[-1] - if "env_wrapper" in hyperparams.keys(): - wrapper_name = hyperparams.get("env_wrapper") + if key in hyperparams.keys(): + wrapper_name = hyperparams.get(key) if wrapper_name is None: return None @@ -202,6 +202,11 @@ def create_test_env( if "env_wrapper" in hyperparams.keys(): del hyperparams["env_wrapper"] + # Ignore for now + # TODO: handle it properly + if "vec_env_wrapper" in hyperparams.keys(): + del hyperparams["vec_env_wrapper"] + vec_env_kwargs = {} vec_env_cls = DummyVecEnv if n_envs > 1 or (ExperimentManager.is_bullet(env_id) and should_render): diff --git a/utils/wrappers.py b/utils/wrappers.py index 9cdaf783f..884923924 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -1,7 +1,48 @@ +from copy import deepcopy + import gym import numpy as np from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility) from scipy.signal import iirfilter, sosfilt, zpk2sos +from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn, VecEnvWrapper + + +class VecForceResetWrapper(VecEnvWrapper): + """ + For all environments to reset at once, + and tell the agent the trajectory was truncated. + + :param venv: The vectorized environment + """ + + def __init__(self, venv: VecEnv): + super().__init__(venv=venv) + + def reset(self) -> VecEnvObs: + return self.venv.reset() + + def step_wait(self) -> VecEnvStepReturn: + for env_idx in range(self.num_envs): + obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] = self.envs[env_idx].step( + self.actions[env_idx] + ) + self._save_obs(env_idx, obs) + + if self.buf_dones.any(): + for env_idx in range(self.num_envs): + self.buf_infos[env_idx]["terminal_observation"] = self.buf_obs[None][env_idx] + if not self.buf_dones[env_idx]: + self.buf_infos[env_idx]["TimeLimit.truncated"] = True + self.buf_dones[env_idx] = True + obs = self.envs[env_idx].reset() + self._save_obs(env_idx, obs) + + return ( + self._obs_from_buf(), + np.copy(self.buf_rews), + np.copy(self.buf_dones), + deepcopy(self.buf_infos), + ) class DoneOnSuccessWrapper(gym.Wrapper): From c7bd1b934ab2e662190cec4cb9e5405f91774bed Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sun, 30 May 2021 15:14:59 +0200 Subject: [PATCH 30/62] Add support for SubProc --- utils/wrappers.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/utils/wrappers.py b/utils/wrappers.py index 884923924..23f231497 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -5,6 +5,7 @@ from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility) from scipy.signal import iirfilter, sosfilt, zpk2sos from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn, VecEnvWrapper +from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv, _flatten_obs class VecForceResetWrapper(VecEnvWrapper): @@ -17,11 +18,15 @@ class VecForceResetWrapper(VecEnvWrapper): def __init__(self, venv: VecEnv): super().__init__(venv=venv) + self.use_subproc = isinstance(venv, SubprocVecEnv) def reset(self) -> VecEnvObs: return self.venv.reset() def step_wait(self) -> VecEnvStepReturn: + if self.use_subproc: + return self._subproc_step_wait() + for env_idx in range(self.num_envs): obs, self.buf_rews[env_idx], self.buf_dones[env_idx], 
self.buf_infos[env_idx] = self.envs[env_idx].step( self.actions[env_idx] @@ -44,6 +49,26 @@ def step_wait(self) -> VecEnvStepReturn: deepcopy(self.buf_infos), ) + def _subproc_step_wait(self) -> VecEnvStepReturn: + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rewards, dones, infos = zip(*results) + dones = np.stack(dones) + obs = list(obs) + updated_remotes = [] + if np.array(dones).any(): + for idx, remote in enumerate(self.remotes): + if not dones[idx]: + infos[idx]["terminal_observation"] = obs[idx] + infos[idx]["TimeLimit.truncated"] = True + dones[idx] = True + remote.send(("reset", None)) + updated_remotes.append((idx, remote)) + + for idx, remote in updated_remotes: + obs[idx] = remote.recv() + return _flatten_obs(obs, self.observation_space), np.stack(rewards), dones, infos + class DoneOnSuccessWrapper(gym.Wrapper): """ From f9f97d9e581b1aa5a245ede6626381aceb6e3cf5 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 2 Jun 2021 21:24:50 +0200 Subject: [PATCH 31/62] Update params --- hyperparams/tqc.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 938d50609..0acef8f26 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -289,13 +289,15 @@ SpaceEngineers-WalkingRobot-IK-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 + vec_env_wrapper: + - utils.wrappers.VecForceResetWrapper callback: - utils.callbacks.ParallelTrainCallback: - gradient_steps: 200 + gradient_steps: 400 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 - buffer_size: 40000 + buffer_size: 100000 batch_size: 256 ent_coef: 'auto' gamma: 0.98 @@ -305,11 +307,9 @@ SpaceEngineers-WalkingRobot-IK-v0: n_envs: 4 gradient_steps: -1 learning_starts: 800 - use_sde_at_warmup: True - use_sde: True - sde_sample_freq: 4 + use_sde: False top_quantiles_to_drop_per_net: 2 - policy_kwargs: "dict(log_std_init=-3, net_arch=[256, 256], n_critics=2)" + policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], n_critics=2)" # ======== Real Robot envs ============ From 00d7cd9bb2c722f7b05e551f196a51e62cae584f Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 4 Jun 2021 11:47:35 +0200 Subject: [PATCH 32/62] Add Phase Feature --- hyperparams/tqc.yml | 3 +++ utils/wrappers.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 0acef8f26..d0ded5691 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -289,6 +289,9 @@ SpaceEngineers-WalkingRobot-IK-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 + - utils.wrappers.PhaseWrapper: + period: 40 + n_components: 4 vec_env_wrapper: - utils.wrappers.VecForceResetWrapper callback: diff --git a/utils/wrappers.py b/utils/wrappers.py index 23f231497..b59c88706 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -1,9 +1,11 @@ from copy import deepcopy +from typing import Union import gym import numpy as np from sb3_contrib.common.wrappers import TimeFeatureWrapper # noqa: F401 (backward compatibility) from scipy.signal import iirfilter, sosfilt, zpk2sos +from stable_baselines3.common.type_aliases import GymObs, GymStepReturn from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn, VecEnvWrapper from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv, _flatten_obs @@ -371,3 +373,43 @@ def step(self, action): obs_dict["observation"] = self._create_obs_from_history() 
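The PhaseWrapper added just below appends a small clock signal to each observation: for period T and components i = 1..n, it concatenates cos(i * 2*pi*t / T) and sin(i * 2*pi*t / T) at the current step t. A standalone sketch of that feature computation, NumPy only (the period, component count and dummy observation are arbitrary illustration values, not taken from the patch):

import numpy as np

def phase_features(t: int, period: int = 40, n_components: int = 4) -> np.ndarray:
    # One (cos, sin) pair per harmonic of the base frequency 2*pi/period
    k = 2 * np.pi / period
    feats = []
    for i in range(1, n_components + 1):
        feats += [np.cos(i * k * t), np.sin(i * k * t)]
    return np.array(feats)

obs = np.zeros(10)                               # dummy observation
augmented = np.append(obs, phase_features(t=7))  # 10 + 2 * 4 = 18 entries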
return obs_dict, reward, done, info + + +class PhaseWrapper(gym.Wrapper): + """Add phase as input""" + + def __init__(self, env: gym.Env, period: int = 40, n_components: int = 4): + obs_space = env.observation_space + + assert len(obs_space.shape) == 1, "Only 1D observation spaces are supported" + + low, high = obs_space.low, obs_space.high + low, high = np.concatenate((low, [-1.0] * 2 * n_components)), np.concatenate((high, [1.0] * 2 * n_components)) + + env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) + + super(PhaseWrapper, self).__init__(env) + self._current_step = 0 + self._n_components = n_components + self._period = period + + def reset(self) -> GymObs: + self._current_step = 0 + return self._get_obs(self.env.reset()) + + def step(self, action: Union[int, np.ndarray]) -> GymStepReturn: + self._current_step += 1 + obs, reward, done, info = self.env.step(action) + return self._get_obs(obs), reward, done, info + + def _get_obs(self, obs: np.ndarray) -> np.ndarray: + """ + Concatenate the phase feature to the current observation. + """ + k = 2 * np.pi / self._period + phase_feature = [] + for i in range(1, self._n_components + 1): + phase_feature.append(np.cos(i * k * self._current_step)) + phase_feature.append(np.sin(i * k * self._current_step)) + + return np.append(obs, phase_feature) From 04f7cc85cd0a14d77e243cbfb8ea49330770df31 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 4 Jun 2021 13:25:51 +0200 Subject: [PATCH 33/62] Test with phase only --- hyperparams/tqc.yml | 3 --- utils/wrappers.py | 9 ++++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index d0ded5691..0acef8f26 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -289,9 +289,6 @@ SpaceEngineers-WalkingRobot-IK-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 - - utils.wrappers.PhaseWrapper: - period: 40 - n_components: 4 vec_env_wrapper: - utils.wrappers.VecForceResetWrapper callback: diff --git a/utils/wrappers.py b/utils/wrappers.py index b59c88706..03a9508bc 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -378,12 +378,15 @@ def step(self, action): class PhaseWrapper(gym.Wrapper): """Add phase as input""" - def __init__(self, env: gym.Env, period: int = 40, n_components: int = 4): + def __init__(self, env: gym.Env, period: int = 40, n_components: int = 4, phase_only: bool = False): obs_space = env.observation_space assert len(obs_space.shape) == 1, "Only 1D observation spaces are supported" low, high = obs_space.low, obs_space.high + + if phase_only: + low, high = [], [] low, high = np.concatenate((low, [-1.0] * 2 * n_components)), np.concatenate((high, [1.0] * 2 * n_components)) env.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32) @@ -392,6 +395,7 @@ def __init__(self, env: gym.Env, period: int = 40, n_components: int = 4): self._current_step = 0 self._n_components = n_components self._period = period + self._phase_only = phase_only def reset(self) -> GymObs: self._current_step = 0 @@ -412,4 +416,7 @@ def _get_obs(self, obs: np.ndarray) -> np.ndarray: phase_feature.append(np.cos(i * k * self._current_step)) phase_feature.append(np.sin(i * k * self._current_step)) + if self._phase_only: + return np.array(phase_feature) + return np.append(obs, phase_feature) From 87cfd3991a15513c61cfab5a00d0e1d7529abc2e Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 4 Jun 2021 13:32:31 +0200 Subject: [PATCH 34/62] Add stop on reward threshold callback --- utils/callbacks.py 
| 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/utils/callbacks.py b/utils/callbacks.py index 0865711e0..6e691b8d6 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -10,6 +10,7 @@ from sb3_contrib import TQC from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback, EvalCallback +from stable_baselines3.common.utils import safe_mean from stable_baselines3.common.vec_env import VecEnv @@ -189,3 +190,31 @@ def _on_training_end(self) -> None: if self.verbose > 0: print("Waiting for training thread to terminate") self.process.join() + + +class StopTrainingOnMeanRewardThreshold(BaseCallback): + """ + Stop the training once a threshold in mean episodic reward + has been reached (i.e. when the model is good enough). + + :param reward_threshold: Minimum expected reward per episode + to stop training. + :param verbose: + """ + + def __init__(self, reward_threshold: float, verbose: int = 0): + super().__init__(verbose=verbose) + self.reward_threshold = reward_threshold + + def _on_step(self) -> bool: + continue_training = True + if len(self.model.ep_info_buffer) > 0 and len(self.model.ep_info_buffer[0]) > 0: + mean_reward = safe_mean([ep_info["r"] for ep_info in self.model.ep_info_buffer]) + # Convert np.bool_ to bool, otherwise callback() is False won't work + continue_training = bool(mean_reward < self.reward_threshold) + if self.verbose > 0 and not continue_training: + print( + f"Stopping training because the mean reward {mean_reward:.2f} " + f" is above the threshold {self.reward_threshold}" + ) + return continue_training From 497e669693b3255cdd5a884d51fa30315ebb0343 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 4 Jun 2021 14:51:26 +0200 Subject: [PATCH 35/62] Update params --- hyperparams/tqc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 0acef8f26..2ec1d731b 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -294,6 +294,9 @@ SpaceEngineers-WalkingRobot-IK-v0: callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 + - utils.callbacks.StopTrainingOnMeanRewardThreshold: + reward_threshold: 150 + verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 From 16f93e46ab785205a64183ed1cd10c4b8869e62c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Sun, 6 Jun 2021 13:48:53 +0200 Subject: [PATCH 36/62] Hack for zmq + early termination --- utils/exp_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 16e394bdb..2359491ac 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -24,7 +24,7 @@ from stable_baselines3.common.preprocessing import is_image_space, is_image_space_channels_first from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike # noqa: F401 from stable_baselines3.common.utils import constant_fn -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecFrameStack, VecNormalize, VecTransposeImage +from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecEnvWrapper, VecFrameStack, VecNormalize, VecTransposeImage # For custom activation fn from torch import nn as nn # noqa: F401 @@ -193,6 +193,12 @@ def learn(self, model: BaseAlgorithm) -> None: finally: # Release resources try: + # Hack for zmq on Windows to allow early termination + env_tmp = model.env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + 
env_tmp.waiting = False + model.env.close() except EOFError: pass From 96f3f75c0c133d8ece1ca217fec607cc7b5eab94 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 7 Jun 2021 16:10:27 +0200 Subject: [PATCH 37/62] Add human teleop --- hyperparams/human.yml | 7 ++ requirements.txt | 1 + setup.cfg | 1 + utils/teleop.py | 267 ++++++++++++++++++++++++++++++++++++++++++ utils/utils.py | 3 + 5 files changed, 279 insertions(+) create mode 100644 hyperparams/human.yml create mode 100644 utils/teleop.py diff --git a/hyperparams/human.yml b/hyperparams/human.yml new file mode 100644 index 000000000..6a564514d --- /dev/null +++ b/hyperparams/human.yml @@ -0,0 +1,7 @@ +# Space Engineers envs +SpaceEngineers-WalkingRobot-IK-v1: + env_wrapper: + - utils.wrappers.HistoryWrapper: + horizon: 2 + n_timesteps: !!float 2e6 + policy: 'MlpPolicy' diff --git a/requirements.txt b/requirements.txt index a212f7ac2..1fc86ca17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ sb3-contrib>=1.1.0a7 cloudpickle>=1.5.0 # tmp fix: ROM missing in newest release atari-py==0.2.6 +pygame diff --git a/setup.cfg b/setup.cfg index c74b8320e..6841b0fcb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,6 +20,7 @@ per-file-ignores = ./scripts/all_plots.py:E501 ./scripts/plot_train.py:E501 ./scripts/plot_training_success.py:E501 + ./utils/teleop.py:F405 exclude = # No need to traverse our git directory diff --git a/utils/teleop.py b/utils/teleop.py new file mode 100644 index 000000000..8eea21e3d --- /dev/null +++ b/utils/teleop.py @@ -0,0 +1,267 @@ +import os +import time +from typing import List, Optional, Tuple + +import numpy as np +import pygame +from gym_space_engineers.envs.walking_robot_ik import Task +from pygame.locals import * # noqa: F403 +from sb3_contrib import TQC +from stable_baselines3.common.base_class import BaseAlgorithm + +# TELEOP_RATE = 1 / 60 + +UP = (1, 0) +LEFT = (0, 1) +RIGHT = (0, -1) +DOWN = (-1, 0) +STOP = (0, 0) +KEY_CODE_SPACE = 32 + +# MAX_TURN = 1 +# # Smoothing constants +# STEP_THROTTLE = 0.8 +# STEP_TURN = 0.8 + +GREEN = (72, 205, 40) +RED = (205, 39, 46) +GREY = (187, 179, 179) +BLACK = (36, 36, 36) +WHITE = (230, 230, 230) +ORANGE = (200, 110, 0) + +# pytype: disable=name-error +moveBindingsGame = {K_UP: UP, K_LEFT: LEFT, K_RIGHT: RIGHT, K_DOWN: DOWN} # noqa: F405 +# pytype: enable=name-error +pygame.font.init() +FONT = pygame.font.SysFont("Open Sans", 25) +SMALL_FONT = pygame.font.SysFont("Open Sans", 20) +KEY_MIN_DELAY = 0.4 + + +class HumanTeleop(BaseAlgorithm): + def __init__( + self, + policy, + env, + tensorboard_log=None, + verbose=0, + seed=None, + device=None, + _init_setup_model: bool = False, + forward_controller_path: str = os.environ.get("FORWARD_CONTROLLER_PATH"), # noqa: B008 + turn_controller_path: str = os.environ.get("TURN_CONTROLLER_PATH"), # noqa: B008 + deterministic: bool = True, + ): + super(HumanTeleop, self).__init__( + policy=None, env=env, policy_base=None, learning_rate=0.0, verbose=verbose, seed=seed + ) + + # Used to prevent from multiple successive key press + self.last_time_pressed = {} + self.event_buttons = None + self.exit_thread = False + self.process = None + self.window = None + self.max_speed = 0.0 + + # Pretrained model + self.forward_controller = TQC.load(forward_controller_path) + self.turn_controller = TQC.load(turn_controller_path) + self.deterministic = deterministic + + def _excluded_save_params(self) -> List[str]: + """ + Returns the names of the parameters that should be excluded by default + when saving the model. 
+ + :return: (List[str]) List of parameters that should be excluded from save + """ + # Exclude aliases + return super()._excluded_save_params() + ["process", "window", "forward_controller", "turn_controller", "exit_thread"] + + def _setup_model(self): + self.exit_thread = False + + def init_buttons(self): + """ + Initialize the last_time_pressed timers that prevent + successive key press. + """ + self.event_buttons = [] + for key in self.event_buttons: + self.last_time_pressed[key] = 0 + + def check_key(self, keys, key): + """ + Check if a key was pressed and update associated timer. + + :param keys: (dict) + :param key: (any hashable type) + :return: (bool) Returns true when a given key was pressed, False otherwise + """ + if key is None: + return False + if keys[key] and (time.time() - self.last_time_pressed[key]) > KEY_MIN_DELAY: + # avoid multiple key press + self.last_time_pressed[key] = time.time() + return True + return False + + def handle_keys_event(self, keys): + """ + Handle the events induced by key press: + e.g. change of mode, toggling recording, ... + """ + + # Switch from "MANUAL" to "AUTONOMOUS" mode + # if self.check_key(keys, self.button_switch_mode) or self.check_key(keys, self.button_pause): + # self.is_manual = not self.is_manual + + def main_loop(self, total_timesteps=-1): + """ + Pygame loop that listens to keyboard events. + """ + pygame.init() + # Create a pygame window + self.window = pygame.display.set_mode((800, 500), RESIZABLE) # pytype: disable=name-error + + # Init values and fill the screen + move, task = "stay", None + # TODO: implement "stay" + self.update_screen(move) + + n_steps = 0 + action = np.array([[0.0, 0.0]]) + self.max_speed = self.env.get_attr("max_speed") + + while not self.exit_thread: + x, theta = 0, 0 + # Record pressed keys + keys = pygame.key.get_pressed() + for keycode in moveBindingsGame.keys(): + if keys[keycode]: + x_tmp, th_tmp = moveBindingsGame[keycode] + x += x_tmp + theta += th_tmp + + self.handle_keys_event(keys) + # For now only handle one model at once + if x > 0: + move = "forward" + elif x < 0: + move = "backward" + elif theta > 0: + move = "turn_right" + elif theta < 0: + move = "turn_left" + else: + move = "stay" + + if move != "stay": + task = Task(move) + # TODO: check if the task has changed + self.env.env_method("change_task", task) + self.env.set_attr("max_speed", self.max_speed) + # TODO: update for the frame stack by stepping fast in the env? 
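check_key above is a simple time-based debounce; the same idea in isolation (module-level state here is only for the sketch, the class keeps it in self.last_time_pressed):

import time

KEY_MIN_DELAY = 0.4
last_time_pressed = {}

def debounced(key, pressed: bool) -> bool:
    # Accept a press only if enough time has passed since the last accepted one
    if not pressed:
        return False
    now = time.time()
    if now - last_time_pressed.get(key, 0.0) > KEY_MIN_DELAY:
        last_time_pressed[key] = now
        return True
    return False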
+ # self._last_obs = self.env.env_method("change_task", task) + + if task in [Task.FORWARD, Task.BACKWARD]: + action = self.forward_controller.predict(self._last_obs, deterministic=self.deterministic) + else: + action = self.turn_controller.predict(self._last_obs, deterministic=self.deterministic) + else: + task = None + self.env.set_attr("max_speed", 0.0) + + self._last_obs, reward, done, infos = self.env.step(action) + + self.update_screen(move) + + n_steps += 1 + if total_timesteps > 0: + self.exit_thread = n_steps >= total_timesteps + + for event in pygame.event.get(): + if (event.type == QUIT or event.type == KEYDOWN) and event.key in [ # pytype: disable=name-error + K_ESCAPE, # pytype: disable=name-error + K_q, # pytype: disable=name-error + ]: + self.exit_thread = True + pygame.display.flip() + # Limit FPS + # pygame.time.Clock().tick(1 / TELEOP_RATE) + + def write_text(self, text, x, y, font, color=GREY): + """ + :param text: (str) + :param x: (int) + :param y: (int) + :param font: (str) + :param color: (tuple) + """ + text = str(text) + text = font.render(text, True, color) + self.window.blit(text, (x, y)) + + def clear(self) -> None: + self.window.fill((0, 0, 0)) + + def update_screen(self, move: str) -> None: + """ + Update pygame window. + + :param action: + """ + self.clear() + self.write_text(f"Task: {move}", 20, 0, FONT, WHITE) + + def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + """ + Get the name of the torch variables that will be saved. + ``th.save`` and ``th.load`` will be used with the right device + instead of the default pickling strategy. + + :return: (Tuple[List[str], List[str]]) + name of the variables with state dicts to save, name of additional torch tensors, + """ + return [], [] + + def learn( + self, + total_timesteps, + callback=None, + log_interval=100, + tb_log_name="run", + eval_env=None, + eval_freq=-1, + n_eval_episodes=5, + eval_log_path=None, + reset_num_timesteps=True, + ) -> "HumanTeleop": + self._last_obs = self.env.reset() + # Wait for teleop process + # time.sleep(3) + self.main_loop(total_timesteps) + + return self + + def predict( + self, + observation: np.ndarray, + state: Optional[np.ndarray] = None, + mask: Optional[np.ndarray] = None, + deterministic: bool = False, + ) -> Tuple[np.ndarray, Optional[np.ndarray]]: + """ + Get the model's action(s) from an observation + + :param observation: (np.ndarray) the input observation + :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies) + :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies) + :param deterministic: (bool) Whether or not to return deterministic actions. 
+ :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state + (used in recurrent policies) + """ + # TODO: launch separate thread to handle user keyboard events + return self.model.predict(observation, deterministic) diff --git a/utils/utils.py b/utils/utils.py index ac151f2da..cf71ca4a5 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -18,6 +18,8 @@ # For custom activation fn from torch import nn as nn # noqa: F401 pylint: disable=unused-import +from utils.teleop import HumanTeleop + ALGOS = { "a2c": A2C, "ddpg": DDPG, @@ -28,6 +30,7 @@ # SB3 Contrib, "qrdqn": QRDQN, "tqc": TQC, + "human": HumanTeleop, } From 4b4373143abf39dab6716c577c41fa3865cab360 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 7 Jun 2021 16:58:00 +0200 Subject: [PATCH 38/62] Bug fixes for teleop --- utils/teleop.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/utils/teleop.py b/utils/teleop.py index 8eea21e3d..5decd313c 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -50,6 +50,7 @@ def __init__( device=None, _init_setup_model: bool = False, forward_controller_path: str = os.environ.get("FORWARD_CONTROLLER_PATH"), # noqa: B008 + backward_controller_path: str = os.environ.get("BACKWARD_CONTROLLER_PATH"), # noqa: B008 turn_controller_path: str = os.environ.get("TURN_CONTROLLER_PATH"), # noqa: B008 deterministic: bool = True, ): @@ -66,7 +67,14 @@ def __init__( self.max_speed = 0.0 # Pretrained model + # set BACKWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_41\rl_model_90000_steps.zip + # set FORWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_57\SpaceEngineers-WalkingRobot-IK-v0.zip + # set TURN_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_42\SpaceEngineers-WalkingRobot-IK-v0.zip + assert forward_controller_path is not None + assert backward_controller_path is not None + assert turn_controller_path is not None self.forward_controller = TQC.load(forward_controller_path) + self.backward_controller = TQC.load(backward_controller_path) self.turn_controller = TQC.load(turn_controller_path) self.deterministic = deterministic @@ -124,7 +132,7 @@ def main_loop(self, total_timesteps=-1): """ pygame.init() # Create a pygame window - self.window = pygame.display.set_mode((800, 500), RESIZABLE) # pytype: disable=name-error + self.window = pygame.display.set_mode((200, 200), RESIZABLE) # pytype: disable=name-error # Init values and fill the screen move, task = "stay", None @@ -132,7 +140,7 @@ def main_loop(self, total_timesteps=-1): self.update_screen(move) n_steps = 0 - action = np.array([[0.0, 0.0]]) + action = np.array([self.env.action_space.sample()]) * 0.0 self.max_speed = self.env.get_attr("max_speed") while not self.exit_thread: @@ -151,9 +159,9 @@ def main_loop(self, total_timesteps=-1): move = "forward" elif x < 0: move = "backward" - elif theta > 0: - move = "turn_right" elif theta < 0: + move = "turn_right" + elif theta > 0: move = "turn_left" else: move = "stay" @@ -166,10 +174,14 @@ def main_loop(self, total_timesteps=-1): # TODO: update for the frame stack by stepping fast in the env? 
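The teleop loop drives the vectorized env through SB3's generic VecEnv interface, as in the change_task / max_speed calls around here. A self-contained reminder of those calls on a toy env (Pendulum-v0 is an arbitrary stand-in for the Space Engineers env):

import gym
from stable_baselines3.common.vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make("Pendulum-v0") for _ in range(2)])
venv.set_attr("max_speed", 0.0)        # set an attribute on every sub-env
print(venv.get_attr("max_speed"))      # one value per sub-env
print(venv.env_method("seed", 0))      # call a method on every sub-env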
# self._last_obs = self.env.env_method("change_task", task) - if task in [Task.FORWARD, Task.BACKWARD]: - action = self.forward_controller.predict(self._last_obs, deterministic=self.deterministic) - else: - action = self.turn_controller.predict(self._last_obs, deterministic=self.deterministic) + controller = { + Task.FORWARD: self.forward_controller, + Task.BACKWARD: self.backward_controller, + Task.TURN_LEFT: self.turn_controller, + Task.TURN_RIGHT: self.turn_controller, + }[task] + + action = controller.predict(self._last_obs, deterministic=self.deterministic) else: task = None self.env.set_attr("max_speed", 0.0) @@ -214,7 +226,7 @@ def update_screen(self, move: str) -> None: :param action: """ self.clear() - self.write_text(f"Task: {move}", 20, 0, FONT, WHITE) + self.write_text(f"Task: {move}", 50, 50, FONT, WHITE) def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: """ From 265f7ce8106a4cfe8d05041f68d07ac5fcbeeee2 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 7 Jun 2021 17:28:31 +0200 Subject: [PATCH 39/62] One controller per movement --- utils/teleop.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/utils/teleop.py b/utils/teleop.py index 5decd313c..e40bf860e 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -51,9 +51,24 @@ def __init__( _init_setup_model: bool = False, forward_controller_path: str = os.environ.get("FORWARD_CONTROLLER_PATH"), # noqa: B008 backward_controller_path: str = os.environ.get("BACKWARD_CONTROLLER_PATH"), # noqa: B008 - turn_controller_path: str = os.environ.get("TURN_CONTROLLER_PATH"), # noqa: B008 + turn_left_controller_path: str = os.environ.get("TURN_LEFT_CONTROLLER_PATH"), # noqa: B008 + turn_right_controller_path: str = os.environ.get("TURN_RIGHT_CONTROLLER_PATH"), # noqa: B008 deterministic: bool = True, ): + assert forward_controller_path is not None + assert backward_controller_path is not None + assert turn_left_controller_path is not None + assert turn_right_controller_path is not None + # Pretrained model + # set BACKWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_41\rl_model_90000_steps.zip + # set FORWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_57\SpaceEngineers-WalkingRobot-IK-v0.zip + # set TURN_LEFT_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_74\rl_model_40000_steps.zip + # set TURN_RIGHT_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_42\SpaceEngineers-WalkingRobot-IK-v0.zip + self.forward_controller = TQC.load(forward_controller_path) + self.backward_controller = TQC.load(backward_controller_path) + self.turn_left_controller = TQC.load(turn_left_controller_path) + self.turn_right_controller = TQC.load(turn_right_controller_path) + super(HumanTeleop, self).__init__( policy=None, env=env, policy_base=None, learning_rate=0.0, verbose=verbose, seed=seed ) @@ -66,16 +81,6 @@ def __init__( self.window = None self.max_speed = 0.0 - # Pretrained model - # set BACKWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_41\rl_model_90000_steps.zip - # set FORWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_57\SpaceEngineers-WalkingRobot-IK-v0.zip - # set TURN_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_42\SpaceEngineers-WalkingRobot-IK-v0.zip - assert forward_controller_path is not None - assert backward_controller_path is not None - assert turn_controller_path is not None - self.forward_controller = TQC.load(forward_controller_path) - self.backward_controller = 
TQC.load(backward_controller_path) - self.turn_controller = TQC.load(turn_controller_path) self.deterministic = deterministic def _excluded_save_params(self) -> List[str]: @@ -177,8 +182,8 @@ def main_loop(self, total_timesteps=-1): controller = { Task.FORWARD: self.forward_controller, Task.BACKWARD: self.backward_controller, - Task.TURN_LEFT: self.turn_controller, - Task.TURN_RIGHT: self.turn_controller, + Task.TURN_LEFT: self.turn_left_controller, + Task.TURN_RIGHT: self.turn_right_controller, }[task] action = controller.predict(self._last_obs, deterministic=self.deterministic) @@ -252,8 +257,6 @@ def learn( reset_num_timesteps=True, ) -> "HumanTeleop": self._last_obs = self.env.reset() - # Wait for teleop process - # time.sleep(3) self.main_loop(total_timesteps) return self From 7b070efc6d00f9159651b1088f3248e249e173b3 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 9 Jun 2021 10:39:11 +0200 Subject: [PATCH 40/62] Update hyperparams --- hyperparams/tqc.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 2ec1d731b..dda847bf8 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -285,7 +285,7 @@ parking-v0: policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" # Space Engineers envs -SpaceEngineers-WalkingRobot-IK-v0: +SE-WalkingSymmetric-IK-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 @@ -295,7 +295,7 @@ SpaceEngineers-WalkingRobot-IK-v0: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 150 + reward_threshold: 250 verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' @@ -304,7 +304,7 @@ SpaceEngineers-WalkingRobot-IK-v0: batch_size: 256 ent_coef: 'auto' gamma: 0.98 - tau: 0.02 + tau: 0.05 # train_freq: [1, "episode"] train_freq: 100 n_envs: 4 From b841d9fc022ae0155b836d40f03bd4ae1eab1390 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 9 Jun 2021 15:38:38 +0200 Subject: [PATCH 41/62] Update name --- hyperparams/tqc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index dda847bf8..5e0794c7a 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -285,7 +285,7 @@ parking-v0: policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" # Space Engineers envs -SE-WalkingSymmetric-IK-v0: +SE-Symmetric-v0: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 @@ -295,7 +295,7 @@ SE-WalkingSymmetric-IK-v0: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 250 + reward_threshold: 2500 verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' From 2143e7419acfa268dafe06583044355f313ab270 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 9 Jun 2021 15:52:12 +0200 Subject: [PATCH 42/62] Reformat --- utils/exp_manager.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 87aef9af7..5e7182ade 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -26,7 +26,15 @@ from stable_baselines3.common.preprocessing import is_image_space, is_image_space_channels_first from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike # noqa: F401 from stable_baselines3.common.utils import constant_fn -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv, VecEnvWrapper, VecFrameStack, VecNormalize, VecTransposeImage +from 
stable_baselines3.common.vec_env import ( + DummyVecEnv, + SubprocVecEnv, + VecEnv, + VecEnvWrapper, + VecFrameStack, + VecNormalize, + VecTransposeImage, +) # For custom activation fn from torch import nn as nn # noqa: F401 From 8bae4c5175f207e045bdea3a8a167e3e65964c03 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 10 Jun 2021 17:14:06 +0200 Subject: [PATCH 43/62] Update env name + add defaults --- hyperparams/tqc.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 5e0794c7a..2d564a33d 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -285,7 +285,7 @@ parking-v0: policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)" # Space Engineers envs -SE-Symmetric-v0: +SE-Symmetric-v1: &defaults env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 @@ -295,7 +295,7 @@ SE-Symmetric-v0: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 2500 + reward_threshold: 250 verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' @@ -314,6 +314,15 @@ SE-Symmetric-v0: top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], n_critics=2)" +SE-TurnLeft-v1: + <<: *defaults + callback: + - utils.callbacks.ParallelTrainCallback: + gradient_steps: 400 + - utils.callbacks.StopTrainingOnMeanRewardThreshold: + reward_threshold: 2500 + verbose: 1 + # ======== Real Robot envs ============ WalkingBertSim-v1: From eb9580bc7023881820e4c06ab1b6076e7c5559ca Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 11 Jun 2021 18:07:47 +0200 Subject: [PATCH 44/62] Prepare hyperparam optim --- hyperparams/tqc.yml | 6 +++--- utils/hyperparams_opt.py | 29 +++++++++++++++++------------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 2d564a33d..eb0c408b7 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -294,9 +294,9 @@ SE-Symmetric-v1: &defaults callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 250 - verbose: 1 + # - utils.callbacks.StopTrainingOnMeanRewardThreshold: + # reward_threshold: 250 + # verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py index d1a9a0f0a..09783ac17 100644 --- a/utils/hyperparams_opt.py +++ b/utils/hyperparams_opt.py @@ -150,21 +150,22 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]) learning_rate = trial.suggest_loguniform("lr", 1e-5, 1) batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048]) - buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)]) - learning_starts = trial.suggest_categorical("learning_starts", [0, 1000, 10000, 20000]) + buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(5e4), int(1e5)]) + learning_starts = trial.suggest_categorical("learning_starts", [0, 1000]) # train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300]) - train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512]) + # train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512]) + train_freq = 100 # Polyak coeff - tau = trial.suggest_categorical("tau", [0.001, 0.005, 
0.01, 0.02, 0.05, 0.08]) + tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.008, 0.01, 0.02, 0.05, 0.08]) # gradient_steps takes too much time # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300]) gradient_steps = train_freq # ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001]) ent_coef = "auto" # You can comment that out when not using gSDE - log_std_init = trial.suggest_uniform("log_std_init", -4, 1) + # log_std_init = trial.suggest_uniform("log_std_init", -4, 1) # NOTE: Add "verybig" to net_arch when tuning HER - net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"]) + net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big", "large", "verybig"]) # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU]) net_arch = { @@ -172,8 +173,8 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: "medium": [256, 256], "big": [400, 300], # Uncomment for tuning HER - # "large": [256, 256, 256], - # "verybig": [512, 512, 512], + "large": [256, 256, 256], + "verybig": [512, 512, 512], }[net_arch] target_entropy = "auto" @@ -181,6 +182,8 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: # # target_entropy = trial.suggest_categorical('target_entropy', ['auto', 5, 1, 0, -1, -5, -10, -20, -50]) # target_entropy = trial.suggest_uniform('target_entropy', -10, 10) + # log_std_init=log_std_init, + hyperparams = { "gamma": gamma, "learning_rate": learning_rate, @@ -192,7 +195,7 @@ def sample_sac_params(trial: optuna.Trial) -> Dict[str, Any]: "ent_coef": ent_coef, "tau": tau, "target_entropy": target_entropy, - "policy_kwargs": dict(log_std_init=log_std_init, net_arch=net_arch), + "policy_kwargs": dict(net_arch=net_arch), } if trial.using_her_replay_buffer: @@ -387,10 +390,12 @@ def sample_tqc_params(trial: optuna.Trial) -> Dict[str, Any]: # TQC is SAC + Distributional RL hyperparams = sample_sac_params(trial) - n_quantiles = trial.suggest_int("n_quantiles", 5, 50) - top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, n_quantiles - 1) + # n_quantiles = trial.suggest_int("n_quantiles", 5, 50) + n_quantiles = trial.suggest_categorical("n_quantiles", [15, 25, 50]) + top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, 10) + n_critics = trial.suggest_int("n_quantiles", 1, 2) - hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles}) + hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles, "n_critics": n_critics}) hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net return hyperparams From 77308d7a584a4ec7884063aa2b8032230273b691 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 11 Jun 2021 20:03:20 +0200 Subject: [PATCH 45/62] Fixes for hyperparam optim --- utils/exp_manager.py | 21 +++++++++++++++++++++ utils/hyperparams_opt.py | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 5e7182ade..2586ae55d 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -638,9 +638,30 @@ def objective(self, trial: optuna.Trial) -> float: try: model.learn(self.n_timesteps, callback=eval_callback) # Free memory + env_tmp = model.env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + env_tmp.waiting = False + + env_tmp = eval_env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + env_tmp.waiting = False + model.env.close() 
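The unwrap-and-clear pattern used here (and in learn() earlier) walks the wrapper chain down to the base VecEnv before touching its waiting flag; it could live in one small helper. A sketch assuming only SB3's VecEnvWrapper/venv convention (force_not_waiting is a hypothetical name, not part of the codebase):

from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper

def force_not_waiting(env: VecEnv) -> None:
    # Unwrap nested VecEnvWrapper layers, then clear the waiting flag on the
    # base vec env so close() does not block on a pending zmq/subproc recv.
    env_tmp = env
    while isinstance(env_tmp, VecEnvWrapper):
        env_tmp = env_tmp.venv
    env_tmp.waiting = False

# usage: force_not_waiting(model.env); force_not_waiting(eval_env); then close() both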
eval_env.close() except (AssertionError, ValueError) as e: + # Hack for zmq on Windows to allow early termination + env_tmp = model.env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + env_tmp.waiting = False + + env_tmp = eval_env + while isinstance(env_tmp, VecEnvWrapper): + env_tmp = env_tmp.venv + env_tmp.waiting = False + # Sometimes, random hyperparams can generate NaN # Free memory model.env.close() diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py index 09783ac17..2c04207e3 100644 --- a/utils/hyperparams_opt.py +++ b/utils/hyperparams_opt.py @@ -393,7 +393,7 @@ def sample_tqc_params(trial: optuna.Trial) -> Dict[str, Any]: # n_quantiles = trial.suggest_int("n_quantiles", 5, 50) n_quantiles = trial.suggest_categorical("n_quantiles", [15, 25, 50]) top_quantiles_to_drop_per_net = trial.suggest_int("top_quantiles_to_drop_per_net", 0, 10) - n_critics = trial.suggest_int("n_quantiles", 1, 2) + n_critics = trial.suggest_int("n_critics", 1, 2) hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles, "n_critics": n_critics}) hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net From 78c064fba1fec745c574a54d0a6aab677209f47a Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 14 Jun 2021 09:35:28 +0200 Subject: [PATCH 46/62] Test with PPO --- hyperparams/ppo.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index 4573ee0a4..850c29355 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -386,22 +386,24 @@ CarRacing-v0: ortho_init=False, )" -SpaceEngineers-WalkingRobot-IK-v0: +SE-Symmetric-v1: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 - sb3_contrib.common.wrappers.TimeFeatureWrapper + vec_env_wrapper: + - utils.wrappers.VecForceResetWrapper normalize: true n_envs: 4 n_timesteps: !!float 2e6 policy: 'MlpPolicy' batch_size: 128 - n_steps: 256 + n_steps: 128 gamma: 0.99 gae_lambda: 0.9 n_epochs: 20 ent_coef: 0.0 - sde_sample_freq: 4 + sde_sample_freq: 2 max_grad_norm: 0.5 vf_coef: 0.5 learning_rate: !!float 3e-5 From 6d24d70e35d70c837e684cab3ee057bb559ccf5f Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 14 Jun 2021 17:24:37 +0200 Subject: [PATCH 47/62] Better disable train --- utils/callbacks.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/utils/callbacks.py b/utils/callbacks.py index 6e691b8d6..4f7bfeea6 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -137,22 +137,22 @@ def _init_callback(self) -> None: self.batch_size = self._model.batch_size - # TODO: update SB3 and check train freq instead - # of gradient_steps > 0 - self.model.gradient_steps = 1 - self.model.tau = 0.0 - self.model.learning_rate = 0.0 - self.model.lr_schedule = lambda _: 0.0 - self.model.batch_size = 1 - # Deactivate logger (TODO(toni): fix this when configuring logger works) - self.model.log_interval = 1000000 - # TODO: change learning starts when using gSDE + # Disable train method + def patch_train(function): + @wraps(function) + def wrapper(*args, **kwargs): + return + + return wrapper + # Add logger for parallel training + self._model.set_logger(self.model.logger) + self.model.train = patch_train(self.model.train) # Hack: Re-add correct values at save time def patch_save(function): @wraps(function) def wrapper(*args, **kwargs): - self._model.save(*args, **kwargs) + return self._model.save(*args, **kwargs) return wrapper @@ -167,7 +167,6 @@ def train(self) -> None: def _train_thread(self) -> 
None: self._model.train(gradient_steps=self.gradient_steps, batch_size=self.batch_size) self._model_ready = True - self.logger.record("train/n_updates_real", self._model._n_updates, exclude="tensorboard") def _on_step(self) -> bool: if self.sleep_time > 0: From 1d6b67ed1c2fe56424d2c327a97b6c45b619418a Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 16 Jul 2021 16:46:33 +0200 Subject: [PATCH 48/62] Add mixture of experts policy --- utils/networks.py | 166 ++++++++++++++++++++++++++++++++++++++++++++++ utils/utils.py | 3 + 2 files changed, 169 insertions(+) create mode 100644 utils/networks.py diff --git a/utils/networks.py b/utils/networks.py new file mode 100644 index 000000000..4a09bbc40 --- /dev/null +++ b/utils/networks.py @@ -0,0 +1,166 @@ +import os +from typing import Dict, List, Optional, Tuple, Type + +import gym +import torch as th +from sb3_contrib import TQC +from sb3_contrib.tqc.policies import TQCPolicy +from stable_baselines3.common.policies import BasePolicy, register_policy +from stable_baselines3.common.preprocessing import get_action_dim +from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, create_mlp +from torch import nn + +# CAP the standard deviation of the actor +LOG_STD_MAX = 2 +LOG_STD_MIN = -20 + + +class MixtureActor(BasePolicy): + """ + Actor network (policy) for SAC. + + :param observation_space: Obervation space + :param action_space: Action space + :param net_arch: Network architecture + :param features_extractor: Network to extract features + (a CNN when using images, a nn.Flatten() layer otherwise) + :param features_dim: Number of features + :param activation_fn: Activation function + :param use_sde: Whether to use State Dependent Exploration or not + :param log_std_init: Initial value for the log standard deviation + :param full_std: Whether to use (n_features x n_actions) parameters + for the std instead of only (n_features,) when using gSDE. + :param sde_net_arch: Network architecture for extracting features + when using gSDE. If None, the latent features from the policy will be used. + Pass an empty list to use the states as features. + :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure + a positive standard deviation (cf paper). It allows to keep variance + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. + :param clip_mean: Clip the mean output when using gSDE to avoid numerical instability. + :param normalize_images: Whether to normalize images or not, + dividing by 255.0 (True by default) + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + net_arch: List[int], + features_extractor: nn.Module, + features_dim: int, + activation_fn: Type[nn.Module] = nn.ReLU, + normalize_images: bool = True, + ): + super().__init__( + observation_space, + action_space, + features_extractor=features_extractor, + normalize_images=normalize_images, + squash_output=True, + ) + + expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] + self.num_experts = len(expert_paths) + self.experts = [] + for path in expert_paths: + actor = TQC.load(os.environ[f"{path}_CONTROLLER_PATH"]).actor + self.experts.append(actor) + + features_dim = self.experts[0].features_dim + self.experts = nn.ModuleList(self.experts) + # TODO: replace with MLP? 
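The mixture computation this actor builds up (gating network defined just below, weighted sum in get_action_dist_params) reduces to a softmax-weighted convex combination of per-expert outputs. A shape-only sketch with plain torch tensors (sizes are arbitrary, no pretrained experts involved):

import torch as th
from torch import nn

batch_size, num_experts, action_dim, features_dim = 32, 4, 12, 64

gating_net = nn.Sequential(
    nn.Linear(features_dim, 64), nn.ReLU(),
    nn.Linear(64, num_experts), nn.Softmax(dim=1),
)
features = th.randn(batch_size, features_dim)
expert_means = th.randn(batch_size, num_experts, action_dim)  # one mean per expert

gates = gating_net(features).unsqueeze(-1)         # [batch_size, num_experts, 1]
mean_actions = (gates * expert_means).sum(dim=1)   # [batch_size, action_dim]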
+ # self.w_gate = nn.Parameter(th.zeros(features_dim, num_experts), requires_grad=True) + gating_net_arch = [64, 64] + gating_net = create_mlp(features_dim, self.num_experts, gating_net_arch, activation_fn) + gating_net += [nn.Softmax(1)] + self.gating_net = nn.Sequential(*gating_net) + self.action_dim = get_action_dim(self.action_space) + + def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, Dict[str, th.Tensor]]: + """ + Get the parameters for the action distribution. + + :param obs: + :return: + Mean, standard deviation and optional keyword arguments. + """ + features = self.extract_features(obs) + expert_means = th.zeros(obs.shape[0], self.num_experts, self.action_dim).to(obs.device) + expert_stds = th.zeros(obs.shape[0], self.num_experts, self.action_dim).to(obs.device) + + for i in range(self.num_experts): + latent_pi = self.experts[i].latent_pi(features) + expert_means[:, i, :] = self.experts[i].mu(latent_pi) + # Unstructured exploration (Original implementation) + log_std = self.log_std(latent_pi) + # Original Implementation to cap the standard deviation + expert_stds[:, i, :] = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) + + # gates: [batch_size, num_experts] + gates = self.gating_net(features) + + # expert_means: [batch_size, num_experts, action_dim] + # mean_actions: [batch_size, action_dim] + mean_actions = (gates * expert_means).sum(dim=1) + log_std = (gates * expert_stds).sum(dim=1) + + return mean_actions, log_std, {} + + def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: + mean_actions, log_std, kwargs = self.get_action_dist_params(obs) + # Note: the action is squashed + return self.action_dist.actions_from_params(mean_actions, log_std, deterministic=deterministic, **kwargs) + + def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: + mean_actions, log_std, kwargs = self.get_action_dist_params(obs) + # return action and associated log prob + return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) + + def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self.forward(observation, deterministic) + + +class MixtureMlpPolicy(TQCPolicy): + """ + Policy class (with both actor and critic) for TQC. + + :param observation_space: Observation space + :param action_space: Action space + :param lr_schedule: Learning rate schedule (could be constant) + :param net_arch: The specification of the policy and value networks. + :param activation_fn: Activation function + :param use_sde: Whether to use State Dependent Exploration or not + :param log_std_init: Initial value for the log standard deviation + :param sde_net_arch: Network architecture for extracting features + when using gSDE. If None, the latent features from the policy will be used. + Pass an empty list to use the states as features. + :param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure + a positive standard deviation (cf paper). It allows to keep variance + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. + :param clip_mean: Clip the mean output when using gSDE to avoid numerical instability. + :param features_extractor_class: Features extractor to use. + :param features_extractor_kwargs: Keyword arguments + to pass to the features extractor. 
+ :param normalize_images: Whether to normalize images or not, + dividing by 255.0 (True by default) + :param optimizer_class: The optimizer to use, + ``th.optim.Adam`` by default + :param optimizer_kwargs: Additional keyword arguments, + excluding the learning rate, to pass to the optimizer + :param n_critics: Number of critic networks to create. + :param share_features_extractor: Whether to share or not the features extractor + between the actor and the critic (this saves computation time) + """ + + def __init__(self, *args, **kwargs): + super(MixtureMlpPolicy, self).__init__( + *args, + **kwargs, + ) + + def make_actor(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> MixtureActor: + actor_kwargs = self._update_features_extractor(self.actor_kwargs, features_extractor) + return MixtureActor(**actor_kwargs).to(self.device) + + +register_policy("MixtureMlpPolicy", MixtureMlpPolicy) diff --git a/utils/utils.py b/utils/utils.py index cf71ca4a5..a0290dfb1 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -20,6 +20,9 @@ from utils.teleop import HumanTeleop +# Register Additional policies +import utils.networks # noqa: F401 + ALGOS = { "a2c": A2C, "ddpg": DDPG, From c3473ed496d8311dd81cbae5dbbf45fbbbb18b90 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 16 Jul 2021 18:24:18 +0200 Subject: [PATCH 49/62] Bug fixes --- hyperparams/tqc.yml | 13 ++++++++++--- utils/networks.py | 8 ++++++-- utils/teleop.py | 8 ++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 69aa82d38..aa7e791c8 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -241,9 +241,9 @@ SE-Symmetric-v1: &defaults callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - # - utils.callbacks.StopTrainingOnMeanRewardThreshold: - # reward_threshold: 250 - # verbose: 1 + - utils.callbacks.StopTrainingOnMeanRewardThreshold: + reward_threshold: 250 + verbose: 1 n_timesteps: !!float 2e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 @@ -270,6 +270,13 @@ SE-TurnLeft-v1: reward_threshold: 2500 verbose: 1 +SE-MultiTask-v1: + <<: *defaults + policy: 'MixtureMlpPolicy' + callback: + - utils.callbacks.ParallelTrainCallback: + gradient_steps: 400 + # ======== Real Robot envs ============ WalkingBertSim-v1: diff --git a/utils/networks.py b/utils/networks.py index 4a09bbc40..8341e0d46 100644 --- a/utils/networks.py +++ b/utils/networks.py @@ -50,6 +50,9 @@ def __init__( features_dim: int, activation_fn: Type[nn.Module] = nn.ReLU, normalize_images: bool = True, + # ignore + *_args, + **_kwargs, ): super().__init__( observation_space, @@ -75,6 +78,7 @@ def __init__( gating_net += [nn.Softmax(1)] self.gating_net = nn.Sequential(*gating_net) self.action_dim = get_action_dim(self.action_space) + self.action_dist = self.experts[0].action_dist def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, Dict[str, th.Tensor]]: """ @@ -92,12 +96,12 @@ def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, latent_pi = self.experts[i].latent_pi(features) expert_means[:, i, :] = self.experts[i].mu(latent_pi) # Unstructured exploration (Original implementation) - log_std = self.log_std(latent_pi) + log_std = self.experts[i].log_std(latent_pi) # Original Implementation to cap the standard deviation expert_stds[:, i, :] = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) # gates: [batch_size, num_experts] - gates = self.gating_net(features) + gates = self.gating_net(features).unsqueeze(-1) # expert_means: 
[batch_size, num_experts, action_dim] # mean_actions: [batch_size, action_dim] diff --git a/utils/teleop.py b/utils/teleop.py index e40bf860e..ca749eade 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -60,10 +60,10 @@ def __init__( assert turn_left_controller_path is not None assert turn_right_controller_path is not None # Pretrained model - # set BACKWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_41\rl_model_90000_steps.zip - # set FORWARD_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_57\SpaceEngineers-WalkingRobot-IK-v0.zip - # set TURN_LEFT_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_74\rl_model_40000_steps.zip - # set TURN_RIGHT_CONTROLLER_PATH=logs\tqc\SpaceEngineers-WalkingRobot-IK-v0_42\SpaceEngineers-WalkingRobot-IK-v0.zip + # set BACKWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_2\SE-Symmetric-v1.zip + # set FORWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_1\SE-Symmetric-v1.zip + # set TURN_LEFT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_1\SE-TurnLeft-v1.zip + # set TURN_RIGHT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_2\SE-TurnLeft-v1.zip self.forward_controller = TQC.load(forward_controller_path) self.backward_controller = TQC.load(backward_controller_path) self.turn_left_controller = TQC.load(turn_left_controller_path) From 2bd21b68fa1af2a0e94c4bf5f2eb5d1826309ab1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 19 Jul 2021 17:41:50 +0200 Subject: [PATCH 50/62] Stop grad + additional experts --- hyperparams/tqc.yml | 6 ++++++ utils/networks.py | 33 ++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index aa7e791c8..161df8947 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -273,9 +273,15 @@ SE-TurnLeft-v1: SE-MultiTask-v1: <<: *defaults policy: 'MixtureMlpPolicy' + learning_rate: !!float 7.3e-4 + gamma: 0.98 + tau: 0.05 + buffer_size: 200000 callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 + policy_kwargs: "dict(net_arch=[400, 300], n_critics=2, n_additional_experts=1)" + # ======== Real Robot envs ============ diff --git a/utils/networks.py b/utils/networks.py index 8341e0d46..baedb280e 100644 --- a/utils/networks.py +++ b/utils/networks.py @@ -50,6 +50,7 @@ def __init__( features_dim: int, activation_fn: Type[nn.Module] = nn.ReLU, normalize_images: bool = True, + n_additional_experts: int = 0, # ignore *_args, **_kwargs, @@ -64,11 +65,19 @@ def __init__( expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] self.num_experts = len(expert_paths) + self.n_additional_experts = n_additional_experts + print(f"{n_additional_experts} additional experts") + self.num_experts += self.n_additional_experts self.experts = [] for path in expert_paths: actor = TQC.load(os.environ[f"{path}_CONTROLLER_PATH"]).actor self.experts.append(actor) + # Add additional experts + for _ in range(self.n_additional_experts): + actor = TQC.load(os.environ[f"{expert_paths[0]}_CONTROLLER_PATH"]).actor + self.experts.append(actor) + features_dim = self.experts[0].features_dim self.experts = nn.ModuleList(self.experts) # TODO: replace with MLP? 
@@ -93,15 +102,20 @@ def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, expert_stds = th.zeros(obs.shape[0], self.num_experts, self.action_dim).to(obs.device) for i in range(self.num_experts): - latent_pi = self.experts[i].latent_pi(features) - expert_means[:, i, :] = self.experts[i].mu(latent_pi) - # Unstructured exploration (Original implementation) - log_std = self.experts[i].log_std(latent_pi) - # Original Implementation to cap the standard deviation - expert_stds[:, i, :] = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) + # Allow grad for one expert only + with th.set_grad_enabled(i >= self.num_experts - self.n_additional_experts): + latent_pi = self.experts[i].latent_pi(features) + expert_means[:, i, :] = self.experts[i].mu(latent_pi) + # Unstructured exploration (Original implementation) + log_std = self.experts[i].log_std(latent_pi) + # Original Implementation to cap the standard deviation + expert_stds[:, i, :] = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) # gates: [batch_size, num_experts] - gates = self.gating_net(features).unsqueeze(-1) + input_commands = features.clone() + # TODO: extract task features only? + # input_commands[:-2] = 0.0 + gates = self.gating_net(input_commands).unsqueeze(-1) # expert_means: [batch_size, num_experts, action_dim] # mean_actions: [batch_size, action_dim] @@ -156,7 +170,8 @@ class MixtureMlpPolicy(TQCPolicy): between the actor and the critic (this saves computation time) """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, n_additional_experts: int = 0, **kwargs): + self.n_additional_experts = n_additional_experts super(MixtureMlpPolicy, self).__init__( *args, **kwargs, @@ -164,7 +179,7 @@ def __init__(self, *args, **kwargs): def make_actor(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> MixtureActor: actor_kwargs = self._update_features_extractor(self.actor_kwargs, features_extractor) - return MixtureActor(**actor_kwargs).to(self.device) + return MixtureActor(n_additional_experts=self.n_additional_experts, **actor_kwargs).to(self.device) register_policy("MixtureMlpPolicy", MixtureMlpPolicy) From 79c0deb63a99dd2ad0ae19f1346344966df4ecef Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 20 Jul 2021 17:32:21 +0200 Subject: [PATCH 51/62] Learn from scratch --- hyperparams/tqc.yml | 6 +++--- utils/networks.py | 13 +++++++++++-- utils/teleop.py | 36 +++++++++++++++++++++++------------- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 161df8947..85c133fd2 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -259,7 +259,7 @@ SE-Symmetric-v1: &defaults learning_starts: 800 use_sde: False top_quantiles_to_drop_per_net: 2 - policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], n_critics=2)" + policy_kwargs: "dict(net_arch=[256, 256], n_critics=2)" SE-TurnLeft-v1: <<: *defaults @@ -267,7 +267,7 @@ SE-TurnLeft-v1: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 2500 + reward_threshold: 250 verbose: 1 SE-MultiTask-v1: @@ -280,7 +280,7 @@ SE-MultiTask-v1: callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - policy_kwargs: "dict(net_arch=[400, 300], n_critics=2, n_additional_experts=1)" + policy_kwargs: "dict(net_arch=[400, 300], n_critics=2, n_additional_experts=4)" # ======== Real Robot envs ============ diff --git a/utils/networks.py b/utils/networks.py index baedb280e..4d65f0dee 100644 --- 
a/utils/networks.py +++ b/utils/networks.py @@ -63,11 +63,20 @@ def __init__( squash_output=True, ) - expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] + # Pretrained model + # set BACKWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_2\SE-Symmetric-v1.zip + # set FORWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_1\SE-Symmetric-v1.zip + # set TURN_LEFT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_1\SE-TurnLeft-v1.zip + # set TURN_RIGHT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_2\SE-TurnLeft-v1.zip + # set RANDOM_CONTROLLER_PATH=logs\pretrained-tqc\SE-Random-small\SE-TurnLeft-v1.zip + + # expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] + expert_paths = [] self.num_experts = len(expert_paths) self.n_additional_experts = n_additional_experts print(f"{n_additional_experts} additional experts") self.num_experts += self.n_additional_experts + self.experts = [] for path in expert_paths: actor = TQC.load(os.environ[f"{path}_CONTROLLER_PATH"]).actor @@ -75,7 +84,7 @@ def __init__( # Add additional experts for _ in range(self.n_additional_experts): - actor = TQC.load(os.environ[f"{expert_paths[0]}_CONTROLLER_PATH"]).actor + actor = TQC.load(os.environ["RANDOM_CONTROLLER_PATH"]).actor self.experts.append(actor) features_dim = self.experts[0].features_dim diff --git a/utils/teleop.py b/utils/teleop.py index ca749eade..516e03f07 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -53,21 +53,30 @@ def __init__( backward_controller_path: str = os.environ.get("BACKWARD_CONTROLLER_PATH"), # noqa: B008 turn_left_controller_path: str = os.environ.get("TURN_LEFT_CONTROLLER_PATH"), # noqa: B008 turn_right_controller_path: str = os.environ.get("TURN_RIGHT_CONTROLLER_PATH"), # noqa: B008 + multi_controller_path: str = os.environ.get("MULTI_CONTROLLER_PATH"), # noqa: B008 deterministic: bool = True, ): - assert forward_controller_path is not None - assert backward_controller_path is not None - assert turn_left_controller_path is not None - assert turn_right_controller_path is not None - # Pretrained model - # set BACKWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_2\SE-Symmetric-v1.zip - # set FORWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_1\SE-Symmetric-v1.zip - # set TURN_LEFT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_1\SE-TurnLeft-v1.zip - # set TURN_RIGHT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_2\SE-TurnLeft-v1.zip - self.forward_controller = TQC.load(forward_controller_path) - self.backward_controller = TQC.load(backward_controller_path) - self.turn_left_controller = TQC.load(turn_left_controller_path) - self.turn_right_controller = TQC.load(turn_right_controller_path) + self.multi_controller_path = multi_controller_path + if multi_controller_path is None: + assert forward_controller_path is not None + assert backward_controller_path is not None + assert turn_left_controller_path is not None + assert turn_right_controller_path is not None + # Pretrained model + # set BACKWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_2\SE-Symmetric-v1.zip + # set FORWARD_CONTROLLER_PATH=logs\pretrained-tqc\SE-Symmetric-v1_1\SE-Symmetric-v1.zip + # set TURN_LEFT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_1\SE-TurnLeft-v1.zip + # set TURN_RIGHT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_2\SE-TurnLeft-v1.zip + self.forward_controller = TQC.load(forward_controller_path) + self.backward_controller = TQC.load(backward_controller_path) + self.turn_left_controller = 
TQC.load(turn_left_controller_path) + self.turn_right_controller = TQC.load(turn_right_controller_path) + else: + self.forward_controller = TQC.load(multi_controller_path) + self.backward_controller = self.forward_controller + self.turn_left_controller = self.forward_controller + self.turn_right_controller = self.forward_controller + super(HumanTeleop, self).__init__( policy=None, env=env, policy_base=None, learning_rate=0.0, verbose=verbose, seed=seed @@ -187,6 +196,7 @@ def main_loop(self, total_timesteps=-1): }[task] action = controller.predict(self._last_obs, deterministic=self.deterministic) + # TODO for multi policy: display proba for each expert else: task = None self.env.set_attr("max_speed", 0.0) From dcd26079591403cc0dda11ef1feeeb5738eaf145 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 20 Jul 2021 17:33:56 +0200 Subject: [PATCH 52/62] Reformat --- utils/teleop.py | 1 - utils/utils.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/teleop.py b/utils/teleop.py index 516e03f07..519573302 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -77,7 +77,6 @@ def __init__( self.turn_left_controller = self.forward_controller self.turn_right_controller = self.forward_controller - super(HumanTeleop, self).__init__( policy=None, env=env, policy_base=None, learning_rate=0.0, verbose=verbose, seed=seed ) diff --git a/utils/utils.py b/utils/utils.py index a0290dfb1..ad6903dd0 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -18,10 +18,9 @@ # For custom activation fn from torch import nn as nn # noqa: F401 pylint: disable=unused-import -from utils.teleop import HumanTeleop - # Register Additional policies import utils.networks # noqa: F401 +from utils.teleop import HumanTeleop ALGOS = { "a2c": A2C, From 2ea1bb4a16afcb34509c0d2ca2210d8321351925 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 21 Jul 2021 00:24:08 +0200 Subject: [PATCH 53/62] Add multi task controller --- hyperparams/human.yml | 2 +- utils/teleop.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hyperparams/human.yml b/hyperparams/human.yml index 6a564514d..5c1d1c473 100644 --- a/hyperparams/human.yml +++ b/hyperparams/human.yml @@ -1,5 +1,5 @@ # Space Engineers envs -SpaceEngineers-WalkingRobot-IK-v1: +SE-WalkingTest-v1: env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 diff --git a/utils/teleop.py b/utils/teleop.py index 519573302..8af06228f 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -72,6 +72,7 @@ def __init__( self.turn_left_controller = TQC.load(turn_left_controller_path) self.turn_right_controller = TQC.load(turn_right_controller_path) else: + # set MULTI_CONTROLLER_PATH=logs\pretrained-tqc\SE-MultiTask-v1_9/rl_model_250000_steps.zip self.forward_controller = TQC.load(multi_controller_path) self.backward_controller = self.forward_controller self.turn_left_controller = self.forward_controller From 49d93917b1b15091c6c537b4929f9d9c55e84bd6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 29 Jul 2021 08:49:33 +0200 Subject: [PATCH 54/62] One policy to rule them all --- hyperparams/tqc.yml | 9 +++++---- utils/networks.py | 5 +++-- utils/teleop.py | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 85c133fd2..5ae24fff4 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -272,15 +272,16 @@ SE-TurnLeft-v1: SE-MultiTask-v1: <<: *defaults - policy: 'MixtureMlpPolicy' + # policy: 'MixtureMlpPolicy' learning_rate: !!float 7.3e-4 - gamma: 0.98 - tau: 0.05 + # gamma: 
0.99 + # tau: 0.005 buffer_size: 200000 callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - policy_kwargs: "dict(net_arch=[400, 300], n_critics=2, n_additional_experts=4)" + # policy_kwargs: "dict(net_arch=[400, 300], n_critics=2, n_additional_experts=2)" + policy_kwargs: "dict(net_arch=[256, 256], n_critics=5)" # ======== Real Robot envs ============ diff --git a/utils/networks.py b/utils/networks.py index 4d65f0dee..126e53ea1 100644 --- a/utils/networks.py +++ b/utils/networks.py @@ -70,8 +70,9 @@ def __init__( # set TURN_RIGHT_CONTROLLER_PATH=logs\pretrained-tqc\SE-TurnLeft-v1_2\SE-TurnLeft-v1.zip # set RANDOM_CONTROLLER_PATH=logs\pretrained-tqc\SE-Random-small\SE-TurnLeft-v1.zip - # expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] - expert_paths = [] + expert_paths = ["FORWARD", "BACKWARD", "TURN_LEFT", "TURN_RIGHT"] + # Uncomment to start without experts + # expert_paths = [] self.num_experts = len(expert_paths) self.n_additional_experts = n_additional_experts print(f"{n_additional_experts} additional experts") diff --git a/utils/teleop.py b/utils/teleop.py index 8af06228f..c8931c794 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -36,7 +36,7 @@ pygame.font.init() FONT = pygame.font.SysFont("Open Sans", 25) SMALL_FONT = pygame.font.SysFont("Open Sans", 20) -KEY_MIN_DELAY = 0.4 +KEY_MIN_DELAY = 0.1 class HumanTeleop(BaseAlgorithm): @@ -72,7 +72,8 @@ def __init__( self.turn_left_controller = TQC.load(turn_left_controller_path) self.turn_right_controller = TQC.load(turn_right_controller_path) else: - # set MULTI_CONTROLLER_PATH=logs\pretrained-tqc\SE-MultiTask-v1_9/rl_model_250000_steps.zip + # set MULTI_CONTROLLER_PATH=logs\multi-task-save\SE-MultiTask-v1_9/rl_model_250000_steps.zip + # set MULTI_CONTROLLER_PATH=logs\multi-task-save\SE-MultiTask-v1_10/rl_model_749925_steps.zip self.forward_controller = TQC.load(multi_controller_path) self.backward_controller = self.forward_controller self.turn_left_controller = self.forward_controller From 6e7cc950aeef9767fadaee81384f43cabf9ae971 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 13 Aug 2021 17:23:35 +0200 Subject: [PATCH 55/62] Add forward left --- hyperparams/tqc.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 5ae24fff4..429f7e71f 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -261,6 +261,15 @@ SE-Symmetric-v1: &defaults top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(net_arch=[256, 256], n_critics=2)" +SE-ForwardLeft-v1: + <<: *defaults + callback: + - utils.callbacks.ParallelTrainCallback: + gradient_steps: 400 + # - utils.callbacks.StopTrainingOnMeanRewardThreshold: + # reward_threshold: 250 + # verbose: 1 + SE-TurnLeft-v1: <<: *defaults callback: From 559bd44a0b76de54c0b2bbe07bb6ba48f67c7147 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 4 Oct 2021 16:48:13 +0200 Subject: [PATCH 56/62] Rename task --- hyperparams/tqc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 429f7e71f..7dab7e975 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -261,7 +261,7 @@ SE-Symmetric-v1: &defaults top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(net_arch=[256, 256], n_critics=2)" -SE-ForwardLeft-v1: +SE-Generic-v1: <<: *defaults callback: - utils.callbacks.ParallelTrainCallback: From 34c86763a5027ad5f4767e32e4d574952ceb2d84 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 2 Nov 2021 10:06:18 +0100 Subject: [PATCH 
57/62] Remove unused requirements --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7f79b3c6c..d5ea73db9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ gym>=0.17,<0.20 -stable-baselines3[extra,tests,docs]>=1.3.0 +# stable-baselines3[extra,tests,docs]>=1.3.0 sb3-contrib>=1.3.0 -box2d-py==2.3.8 -pybullet +# box2d-py==2.3.8 +# pybullet gym-minigrid scikit-optimize optuna @@ -11,9 +11,9 @@ seaborn pyyaml>=5.1 cloudpickle>=1.5.0 # tmp fix: ROM missing in newest release -atari-py==0.2.6 +# atari-py==0.2.6 plotly pygame -panda-gym>=1.1.1 +# panda-gym>=1.1.1 # rliable requires python 3.7+ # rliable>=1.0.5 From 7efd1a0bfcb7a3da021c5fc625c1ccbae84f5d5d Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 3 Nov 2021 10:35:36 +0100 Subject: [PATCH 58/62] Update env name --- hyperparams/tqc.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 7dab7e975..0994107d8 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -232,7 +232,7 @@ parking-v0: )" # Space Engineers envs -SE-Symmetric-v1: &defaults +SE-Forward-v1: &defaults env_wrapper: - utils.wrappers.HistoryWrapper: horizon: 2 @@ -241,10 +241,10 @@ SE-Symmetric-v1: &defaults callback: - utils.callbacks.ParallelTrainCallback: gradient_steps: 400 - - utils.callbacks.StopTrainingOnMeanRewardThreshold: - reward_threshold: 250 - verbose: 1 - n_timesteps: !!float 2e6 + # - utils.callbacks.StopTrainingOnMeanRewardThreshold: + # reward_threshold: 250 + # verbose: 1 + n_timesteps: !!float 5e6 policy: 'MlpPolicy' learning_rate: !!float 7.3e-4 buffer_size: 100000 @@ -261,6 +261,9 @@ SE-Symmetric-v1: &defaults top_quantiles_to_drop_per_net: 2 policy_kwargs: "dict(net_arch=[256, 256], n_critics=2)" +SE-Symmetric-v1: + <<: *defaults + SE-Generic-v1: <<: *defaults callback: From 0c8192c141b318b3cceb2f7296b2c1daf54f897c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 3 Nov 2021 14:20:58 +0100 Subject: [PATCH 59/62] Add new env --- hyperparams/tqc.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 0994107d8..f9c934f10 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -264,6 +264,9 @@ SE-Forward-v1: &defaults SE-Symmetric-v1: <<: *defaults +SE-Corrections-v1: + <<: *defaults + SE-Generic-v1: <<: *defaults callback: From ce70bfd38b52d366242e980a3e0cdddd0644c6d6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 4 Nov 2021 10:38:02 +0100 Subject: [PATCH 60/62] Fix predict --- utils/teleop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/teleop.py b/utils/teleop.py index c8931c794..ac31d87e6 100644 --- a/utils/teleop.py +++ b/utils/teleop.py @@ -196,7 +196,7 @@ def main_loop(self, total_timesteps=-1): Task.TURN_RIGHT: self.turn_right_controller, }[task] - action = controller.predict(self._last_obs, deterministic=self.deterministic) + action, _ = controller.predict(self._last_obs, deterministic=self.deterministic) # TODO for multi policy: display proba for each expert else: task = None From 87001ed8a40f817d46c950e283d1ca29e405ad71 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 15 Nov 2021 14:47:16 +0100 Subject: [PATCH 61/62] Change the task only when needed --- utils/teleop.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utils/teleop.py b/utils/teleop.py index ac31d87e6..b855c3891 100644 --- a/utils/teleop.py +++ 
b/utils/teleop.py @@ -90,6 +90,7 @@ def __init__( self.process = None self.window = None self.max_speed = 0.0 + self._last_task = None self.deterministic = deterministic @@ -183,8 +184,11 @@ def main_loop(self, total_timesteps=-1): if move != "stay": task = Task(move) - # TODO: check if the task has changed - self.env.env_method("change_task", task) + # Check if the task has changed + if task != self._last_task: + self.env.env_method("change_task", task) + self._last_task = task + # Re-enable joints movement self.env.set_attr("max_speed", self.max_speed) # TODO: update for the frame stack by stepping fast in the env? # self._last_obs = self.env.env_method("change_task", task) @@ -200,6 +204,7 @@ def main_loop(self, total_timesteps=-1): # TODO for multi policy: display proba for each expert else: task = None + # Keep the joints at the current position self.env.set_attr("max_speed", 0.0) self._last_obs, reward, done, infos = self.env.step(action) From a652aa6a0874a428ce755df1c43498e1dcace479 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 7 Jan 2022 16:20:40 +0100 Subject: [PATCH 62/62] Sync vec normalize for parallel training --- utils/callbacks.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/utils/callbacks.py b/utils/callbacks.py index 39d6c8d42..394c9611c 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -1,4 +1,5 @@ import os +import pickle import tempfile import time from copy import deepcopy @@ -11,7 +12,7 @@ from stable_baselines3 import SAC from stable_baselines3.common.callbacks import BaseCallback, EvalCallback from stable_baselines3.common.utils import safe_mean -from stable_baselines3.common.vec_env import VecEnv +from stable_baselines3.common.vec_env import VecEnv, sync_envs_normalization class TrialEvalCallback(EvalCallback): @@ -130,6 +131,12 @@ def _init_callback(self) -> None: self.model.save(temp_file) + if self.model.get_vec_normalize_env() is not None: + temp_file_norm = os.path.join("logs", "vec_normalize.pkl") + + with open(temp_file_norm, "wb") as file_handler: + pickle.dump(self.model.get_vec_normalize_env(), file_handler) + # TODO: add support for other algorithms for model_class in [SAC, TQC]: if isinstance(self.model, model_class): @@ -139,6 +146,11 @@ def _init_callback(self) -> None: assert self.model_class is not None, f"{self.model} is not supported for parallel training" self._model = self.model_class.load(temp_file) + if self.model.get_vec_normalize_env() is not None: + with open(temp_file_norm, "rb") as file_handler: + self._model._vec_normalize_env = pickle.load(file_handler) + self._model._vec_normalize_env.training = False + self.batch_size = self._model.batch_size # Disable train method @@ -183,6 +195,10 @@ def _on_rollout_end(self) -> None: self._model.replay_buffer = deepcopy(self.model.replay_buffer) self.model.set_parameters(deepcopy(self._model.get_parameters())) self.model.actor = self.model.policy.actor + # Sync VecNormalize + if self.model.get_vec_normalize_env() is not None: + sync_envs_normalization(self.model.get_vec_normalize_env(), self._model._vec_normalize_env) + if self.num_timesteps >= self._model.learning_starts: self.train() # Do not wait for the training loop to finish
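
The last patch keeps the callback's background copy of the model consistent with the live normalization statistics: the wrapper is pickled once in `_init_callback` and the running mean/std are copied over at every rollout end. A minimal, self-contained illustration of that synchronization primitive follows; the env id and the random-action loop are placeholders chosen to match the gym version pinned above, not the zoo's actual setup:

    import gym
    import numpy as np
    from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization

    # Two wrapped copies of the same env: one keeps collecting statistics, one lags behind.
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
    frozen_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]), training=False)

    train_env.reset()
    for _ in range(100):
        # Random steps update the running obs/reward statistics of train_env only
        train_env.step(np.array([train_env.action_space.sample()]))

    # Copy the normalization statistics from train_env into frozen_env
    sync_envs_normalization(train_env, frozen_env)
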