diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index e8a2984d2..af83d2302 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -3,7 +3,7 @@
 Changelog
 ==========
 
-Release 2.4.0a9 (WIP)
+Release 2.4.0a10 (WIP)
 --------------------------
 
 .. note::
@@ -60,12 +60,14 @@ Others:
 - Fixed various typos (@cschindlbeck)
 - Remove unnecessary SDE noise resampling in PPO update (@brn-dev)
 - Updated PyTorch version on CI to 2.3.1
+- Added a warning to recommend using CPU with on-policy algorithms (A2C/PPO) and ``MlpPolicy``
 
 Bug Fixes:
 ^^^^^^^^^^
 
 Documentation:
 ^^^^^^^^^^^^^^
+- Updated PPO doc to recommend using CPU with ``MlpPolicy``
 
 Release 2.3.2 (2024-04-27)
 --------------------------
diff --git a/docs/modules/ppo.rst b/docs/modules/ppo.rst
index b5e667241..4285cfb50 100644
--- a/docs/modules/ppo.rst
+++ b/docs/modules/ppo.rst
@@ -88,6 +88,23 @@ Train a PPO agent on ``CartPole-v1`` using 4 environments.
         vec_env.render("human")
 
 
+.. note::
+
+  PPO is meant to be run primarily on the CPU, especially when you are not using a CNN. To improve CPU utilization, try turning off the GPU and using ``SubprocVecEnv`` instead of the default ``DummyVecEnv``:
+
+  .. code-block:: python
+
+    from stable_baselines3 import PPO
+    from stable_baselines3.common.env_util import make_vec_env
+    from stable_baselines3.common.vec_env import SubprocVecEnv
+
+    if __name__ == "__main__":
+        env = make_vec_env("CartPole-v1", n_envs=8, vec_env_cls=SubprocVecEnv)
+        model = PPO("MlpPolicy", env, device="cpu")
+        model.learn(total_timesteps=25_000)
+
+  For more information, see :ref:`Vectorized Environments <vec_env>`, `Issue #1245 <https://github.com/DLR-RM/stable-baselines3/issues/1245>`_ or the `Multiprocessing notebook <https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/sb3/multiprocessing_rl.ipynb>`_.
+
 Results
 -------
diff --git a/stable_baselines3/common/on_policy_algorithm.py b/stable_baselines3/common/on_policy_algorithm.py
index 262453721..dc885242e 100644
--- a/stable_baselines3/common/on_policy_algorithm.py
+++ b/stable_baselines3/common/on_policy_algorithm.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import numpy as np
@@ -135,6 +136,28 @@ def _setup_model(self) -> None:
             self.observation_space, self.action_space, self.lr_schedule, use_sde=self.use_sde, **self.policy_kwargs
         )
         self.policy = self.policy.to(self.device)
+        # Warn when not using CPU with MlpPolicy
+        self._maybe_recommend_cpu()
+
+    def _maybe_recommend_cpu(self, mlp_class_name: str = "ActorCriticPolicy") -> None:
+        """
+        Recommend using the CPU when running A2C/PPO with the default MlpPolicy.
+
+        :param mlp_class_name: The name of the class for the default MlpPolicy.
+        """
+        policy_class_name = self.policy_class.__name__
+        if self.device != th.device("cpu") and policy_class_name == mlp_class_name:
+            warnings.warn(
+                f"You are trying to run {self.__class__.__name__} on the GPU, "
+                "but it is primarily intended to run on the CPU when not using a CNN policy "
+                f"(you are using {policy_class_name} which should be a MlpPolicy). "
+                "See https://github.com/DLR-RM/stable-baselines3/issues/1245 "
+                "for more info. "
+                "You can pass `device='cpu'` or `export CUDA_VISIBLE_DEVICES=` to force using the CPU. "
+ "Note: The model will train, but the GPU utilization will be poor and " + "the training might take longer than on CPU.", + UserWarning, + ) def collect_rollouts( self, diff --git a/stable_baselines3/version.txt b/stable_baselines3/version.txt index 636c433a1..852a32b3f 100644 --- a/stable_baselines3/version.txt +++ b/stable_baselines3/version.txt @@ -1 +1 @@ -2.4.0a9 +2.4.0a10 diff --git a/tests/test_run.py b/tests/test_run.py index 31c7b956e..4acabb692 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,6 +1,7 @@ import gymnasium as gym import numpy as np import pytest +import torch as th from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3 from stable_baselines3.common.env_util import make_vec_env @@ -211,8 +212,11 @@ def test_warn_dqn_multi_env(): def test_ppo_warnings(): - """Test that PPO warns and errors correctly on - problematic rollout buffer sizes""" + """ + Test that PPO warns and errors correctly on + problematic rollout buffer sizes, + and recommend using CPU. + """ # Only 1 step: advantage normalization will return NaN with pytest.raises(AssertionError): @@ -234,3 +238,9 @@ def test_ppo_warnings(): loss = model.logger.name_to_value["train/loss"] assert loss > 0 assert not np.isnan(loss) # check not nan (since nan does not equal nan) + + with pytest.warns(UserWarning, match="You are trying to run PPO on the GPU"): + model = PPO("MlpPolicy", "Pendulum-v1") + # Pretend to be on the GPU + model.device = th.device("cuda") + model._maybe_recommend_cpu()