diff --git a/rlberry/agents/adaptiveql/adaptiveql.py b/rlberry/agents/adaptiveql/adaptiveql.py
index 660670e28..667ed54e0 100644
--- a/rlberry/agents/adaptiveql/adaptiveql.py
+++ b/rlberry/agents/adaptiveql/adaptiveql.py
@@ -21,15 +21,33 @@ class AdaptiveQLAgent(AgentWithSimplePolicy):
         Environment with continuous states and discrete actions.
     gamma : double, default: 1.0
         Discount factor in [0, 1].
-    horizon : int
+    horizon : int, default: 50
         Horizon of the objective function.
     bonus_scale_factor : double, default: 1.0
         Constant by which to multiply the exploration bonus, controls
         the level of exploration.
-    bonus_type : {"simplified_bernstein"}
+    bonus_type : string, default: "simplified_bernstein"
         Type of exploration bonus. Currently, only "simplified_bernstein"
         is implemented.
+    Attributes
+    ----------
+    gamma : double, default: 1.0
+        Discount factor in [0, 1].
+    horizon : int, default: 50
+        Horizon of the objective function.
+    bonus_scale_factor : double, default: 1.0
+        Constant by which to multiply the exploration bonus, controls
+        the level of exploration.
+    bonus_type : string, default: "simplified_bernstein"
+        Type of exploration bonus. Currently, only "simplified_bernstein"
+        is implemented.
+    v_max : ndarray
+        Array of the maximum state value as a function of the horizon.
+    Qtree : MDPTreePartition
+        Tree structure representing the transition model of the MDP.
+    episode : int
+        Number of episodes completed during training of the agent.

     References
     ----------
diff --git a/rlberry/agents/agent.py b/rlberry/agents/agent.py
index 7d8914f03..104df207d 100644
--- a/rlberry/agents/agent.py
+++ b/rlberry/agents/agent.py
@@ -29,9 +29,9 @@ class Agent(ABC):

     Parameters
     ----------
-    env : gym.Env or tuple (constructor, kwargs)
-        Environment used to fit the agent.
-    eval_env : gym.Env or tuple (constructor, kwargs)
+    env : :class:`gym.Env` or tuple (constructor, kwargs)
+        Environment on which to train the agent.
+    eval_env : :class:`gym.Env` or tuple (constructor, kwargs)
         Environment on which to evaluate the agent. If None, copied from env.
     copy_env : bool
         If true, makes a deep copy of the environment.
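# --- Editor's usage sketch (Python, not part of the diff above) --------------
# The two hunks above document that `env` may be passed either as an instance
# or as a (constructor, kwargs) tuple. A minimal, hedged illustration: the
# import paths, the "MountainCar-v0" choice (continuous states, discrete
# actions) and the budget semantics are assumptions that may differ between
# rlberry versions.
from rlberry.envs import gym_make
from rlberry.agents.adaptiveql import AdaptiveQLAgent

# 1) (constructor, kwargs) tuple: the agent builds the environment itself,
#    which lets it re-create fresh copies when needed.
agent = AdaptiveQLAgent((gym_make, dict(id="MountainCar-v0")), horizon=50)

# 2) An already-instantiated environment; the base-class `copy_env` flag
#    documented above controls whether the agent deep-copies it.
env = gym_make(id="MountainCar-v0")
agent_bis = AdaptiveQLAgent(env, horizon=50)

agent.fit(budget=100)  # budget assumed to be a number of training episodes
# -----------------------------------------------------------------------------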
diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py
index d00a78963..84219c8c8 100644
--- a/rlberry/agents/torch/dqn/dqn.py
+++ b/rlberry/agents/torch/dqn/dqn.py
@@ -128,6 +128,64 @@ class DQNAgent(AgentTorch, AgentWithSimplePolicy):
     eval_interval : int, default = None
         Interval (in number of transitions) between agent evaluations in fit().
         If None, never evaluate.
+
+    Attributes
+    ----------
+    gamma : float, default: 0.99
+        Discount factor used to discount future rewards in the Bellman equation.
+    batch_size : int, default: 32
+        Batch size used during the training process.
+    chunk_size : int, default: 8
+        Length of sub-trajectories sampled from the replay buffer.
+    lambda_ : float, default: 0.5
+        Q(lambda) parameter used by the Q(lambda) algorithm to compute targets.
+    target_update_parameter : int or float
+        Controls how (and how often) the target network is updated.
+        If int: interval (in number of total online updates) between updates of the target network.
+        If float: soft update coefficient, which controls the rate at which the target network approaches
+        the online network.
+    device : str
+        Torch device on which the agent's neural networks are placed. Use "cuda:best" to choose the best
+        available GPU device.
+    learning_rate : float, default: 1e-3
+        Learning rate used by the optimizer during neural network training.
+    epsilon_init : float, default: 1.0
+        Initial epsilon value for epsilon-greedy exploration. The epsilon-greedy policy is used to balance
+        exploration and exploitation during training.
+    epsilon_final : float, default: 0.1
+        Final epsilon value for epsilon-greedy exploration. Epsilon approaches this value as the agent
+        gains more experience.
+    epsilon_decay_interval : int
+        Number of timesteps over which epsilon decays towards `epsilon_final`.
+    loss_function : {"l1", "l2", "smooth_l1"}, default: "l2"
+        The loss function used to compute the Bellman error during training. The available options are
+        Mean Absolute Error ("l1"), Mean Squared Error ("l2"), and Smooth L1 Loss ("smooth_l1").
+    optimizer_type : {"ADAM", "RMS_PROP"}
+        The optimization algorithm used during neural network training. Choose between ADAM and RMS_PROP.
+    q_net_constructor : Callable, str or None
+        Function/constructor that returns a torch module for the Q-network.
+        Example: use `rlberry.agents.torch.utils.training.model_factory_from_env` and the `q_net_kwargs`
+        parameter to modify the neural network.
+    q_net_kwargs : optional, dict
+        Parameters for `q_net_constructor`.
+    use_double_dqn : bool, default: False
+        If True, use the Double DQN algorithm, which helps to reduce overestimation bias in Q-value estimates.
+    use_prioritized_replay : bool, default: False
+        If True, use Prioritized Experience Replay, which prioritizes transitions in the replay buffer
+        based on their TD-errors, to improve the learning process.
+    train_interval : int
+        The agent updates the model every `train_interval` steps. If -1, the agent only trains at the end
+        of each episode.
+    gradient_steps : int
+        The number of gradient steps to perform at each model update. If -1, the number of timesteps since
+        the last update will be used.
+    max_replay_size : int
+        The maximum number of transitions allowed in the replay buffer.
+    learning_starts : int
+        How many environment steps to collect transitions for before learning starts.
+    eval_interval : int, default: None
+        The interval (in number of transitions) between agent evaluations in the `fit()` method. If None,
+        the agent is not evaluated during training.
     """

     name = "DQN"
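# --- Editor's usage sketch (Python, not part of the diff above) --------------
# Illustrates how the DQN hyperparameters documented in the new Attributes
# block map onto constructor arguments. A hedged sketch only: import paths and
# the budget unit (environment steps) are assumptions.
from rlberry.envs import gym_make
from rlberry.agents.torch import DQNAgent

agent = DQNAgent(
    (gym_make, dict(id="CartPole-v1")),
    gamma=0.99,                     # discount factor
    batch_size=32,
    learning_rate=1e-3,
    epsilon_init=1.0,               # epsilon-greedy schedule: 1.0 -> 0.1
    epsilon_final=0.1,
    epsilon_decay_interval=10_000,
    use_double_dqn=True,            # reduce overestimation bias
)
agent.fit(budget=50_000)            # assumed: budget counted in environment steps
# After training, AgentWithSimplePolicy exposes a greedy policy:
# action = agent.policy(observation)
# -----------------------------------------------------------------------------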
diff --git a/rlberry/agents/torch/ppo/ppo.py b/rlberry/agents/torch/ppo/ppo.py
index ab625a9bf..fdd27442b 100644
--- a/rlberry/agents/torch/ppo/ppo.py
+++ b/rlberry/agents/torch/ppo/ppo.py
@@ -112,6 +112,73 @@ class PPOAgent(AgentTorch, AgentWithSimplePolicy):
     device: str
         Device on which to put the tensors. 'cuda:best' by default.

+    Attributes
+    ----------
+    __value_losses__ : list
+        List of supported value loss types: ["clipped", "mse", "avec"].
+    __lr_schedule___ : list
+        List of supported learning rate schedule types: ["constant", "linear"].
+    copy_env : bool
+        If True, copy the environment to create multiple environments for parallel interaction.
+    n_envs : int
+        Number of environments used by the agent.
+    n_steps : int
+        Number of transitions to collect in each environment per update.
+    batch_size : int
+        Size of the mini-batches used during each PPO update epoch.
+    gamma : float
+        Discount factor used to discount future rewards.
+    k_epochs : int
+        Number of PPO epochs per update.
+    clip_eps : float
+        PPO clipping range (epsilon).
+    target_kl : float
+        Target KL divergence for early stopping. If None, early stopping is disabled.
+    normalize_advantages : bool
+        Whether or not to normalize advantages.
+    gae_lambda : float
+        Lambda parameter for TD(lambda) and Generalized Advantage Estimation.
+    entr_coef : float
+        Entropy coefficient. Controls the contribution of entropy regularization to the policy's objective.
+    vf_coef : float
+        Value function loss coefficient. Controls the contribution of the value function loss to the total loss.
+    value_loss : str
+        Type of value loss used. Can be "mse", "clipped", or "avec".
+    max_grad_norm : float
+        Maximum norm of the gradient of both actor and critic networks. Used for gradient clipping.
+    learning_rate : float
+        Learning rate used by the optimizer during neural network training.
+    lr_schedule : str
+        Learning rate schedule used during training. Can be "constant" or "linear".
+    optimizer_type : str
+        Type of optimizer used during neural network training.
+    n_eval_episodes : int
+        Number of episodes used for evaluation.
+    eval_horizon : int
+        Maximum number of steps per episode during evaluation.
+    eval_freq : int
+        Number of updates between evaluations. If None, no evaluation is performed.
+    policy_net_fn : function(env, **kwargs)
+        Function that returns an instance of a policy network (PyTorch).
+    policy_net_kwargs : dict
+        Keyword arguments for `policy_net_fn`.
+    value_net_fn : function(env, **kwargs)
+        Function that returns an instance of a value network (PyTorch).
+    value_net_kwargs : dict
+        Keyword arguments for `value_net_fn`.
+    eval_env : rlberry.Env
+        The environment used for evaluation. If None, the same environment as `env` is used.
+    state_dim : int
+        Dimensionality of the continuous state space of the environment.
+    policy_net : torch.nn.Module
+        The policy network used by the agent.
+    value_net : torch.nn.Module
+        The value network used by the agent.
+    device : str
+        Torch device on which the agent's neural networks are placed.
+    optimizer_kwargs : dict
+        Keyword arguments for the optimizer used during neural network training.
+
     References
     ----------
     Schulman, J., Wolski, F., Dhariwal, P., Radford, A. & Klimov, O. (2017).
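# --- Editor's usage sketch (Python, not part of the diff above) --------------
# Shows how the PPO attributes documented above correspond to constructor
# arguments. Hedged: parameter names follow the docstring; import paths and
# the budget unit are assumptions.
from rlberry.envs import gym_make
from rlberry.agents.torch import PPOAgent

agent = PPOAgent(
    (gym_make, dict(id="CartPole-v1")),
    n_steps=512,                # transitions collected per environment per update
    batch_size=64,              # mini-batch size within each PPO epoch
    k_epochs=10,                # PPO epochs per update
    clip_eps=0.2,               # clipping range (epsilon)
    gae_lambda=0.95,            # GAE / TD(lambda) parameter
    normalize_advantages=True,
)
agent.fit(budget=100_000)       # assumed: budget counted in environment steps
# -----------------------------------------------------------------------------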
diff --git a/rlberry/agents/torch/reinforce/reinforce.py b/rlberry/agents/torch/reinforce/reinforce.py
index 8fd87c5a1..f9f0c2e2f 100644
--- a/rlberry/agents/torch/reinforce/reinforce.py
+++ b/rlberry/agents/torch/reinforce/reinforce.py
@@ -22,10 +22,10 @@ class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy):
     ----------
     env : Model
         Online model with continuous (Box) state space and discrete actions
-    batch_size : int
-        Number of episodes to wait before updating the policy.
-    horizon : int
-        Horizon.
+    batch_size : int, default: 8
+        Number of episodes used for each update of the policy network.
+    horizon : int, default: 256
+        Maximum episode length (one transition per episode step), so the total number of transitions used for one policy update is batch_size * horizon.
     gamma : double
         Discount factor in [0, 1].
     entr_coef : double
@@ -47,6 +47,35 @@ class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy):
     device: str
         Device to put the tensors on

+    Attributes
+    ----------
+    device : str
+        Torch device on which the agent's neural networks are placed.
+    batch_size : int, default: 8
+        Number of episodes used for each update of the policy network.
+    horizon : int, default: 256
+        Maximum episode length.
+    gamma : float, default: 0.99
+        Discount factor used to discount future rewards in the Bellman equation.
+    state_dim : int
+        Dimensionality of the continuous state space of the environment.
+    action_dim : int
+        Number of discrete actions available in the environment.
+    policy_net_fn : function(env, **kwargs)
+        Function that returns an instance of a policy network (PyTorch).
+    policy_net_kwargs : dict
+        Keyword arguments for `policy_net_fn`.
+    optimizer_kwargs : dict
+        Keyword arguments for the optimizer used during neural network training.
+    policy_net : torch.nn.Module
+        The policy network used by the agent.
+    policy_optimizer : torch.optim.Optimizer
+        The optimizer used for training the policy network.
+    memory : Memory
+        The memory buffer used to store the agent's experiences.
+    episode : int
+        Counter of the number of episodes run during training.
+
     References
     ----------
     Williams, Ronald J.,
diff --git a/rlberry/envs/bandits/bandit_base.py b/rlberry/envs/bandits/bandit_base.py
index 77590c35a..95ceeb1a2 100644
--- a/rlberry/envs/bandits/bandit_base.py
+++ b/rlberry/envs/bandits/bandit_base.py
@@ -22,6 +22,19 @@ class Bandit(Model):
     **kwargs: keywords arguments
         additional arguments sent to :class:`~rlberry.envs.interface.Model`

+    Attributes
+    ----------
+    laws: list
+        Laws of the arms. Each can be either a frozen scipy law or any class that
+        has a method .rvs().
+    n_arms: int
+        Number of arms.
+    action_space: spaces.Discrete
+        Action space when viewing the bandit as a single-state MDP.
+    rewards: list
+        For each arm, a batch of 10 pre-sampled rewards.
+    n_rewards: list
+        Reward counter for each arm.
     """

     name = ""
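# --- Editor's usage sketch (Python, not part of the diff above) --------------
# Builds the documented Bandit from frozen scipy laws, matching the `laws`
# attribute described in the new docstring. Hedged: the import path, the
# constructor keyword and the step() return signature depend on the
# rlberry/gym versions in use.
from scipy import stats
from rlberry.envs.bandits import Bandit

laws = [stats.norm(loc=mu, scale=1.0) for mu in (0.0, 0.5, 1.0)]
bandit = Bandit(laws=laws)

print(bandit.n_arms)              # 3 arms, as documented above
print(bandit.action_space)        # Discrete(3): the single-state MDP view

observation, reward, *rest = bandit.step(1)   # pull arm 1; unpacking kept
                                              # agnostic to the gym API version
# -----------------------------------------------------------------------------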