* attributes and parameters that have the same name have the same description in AgentWithSimplePolicy
Fixes rlberry-py#188

* Attributes for adaptiveql

* Done with attributes for adaptiveql

* Attributes docstrings for DQN (done using chatgpt)

* fixed dqn attributes doc

* dqn aligned attribs

* Attribute docs for reinforce

* update dqn attribute doc

* docs for ppo attributes (done using chatgpt)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Base bandit environment class attributes.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
KohlerHECTOR and pre-commit-ci[bot] authored Jul 24, 2023
1 parent a68ce4b commit 1dff2e8
Showing 6 changed files with 194 additions and 9 deletions.
22 changes: 20 additions & 2 deletions rlberry/agents/adaptiveql/adaptiveql.py
@@ -21,15 +21,33 @@ class AdaptiveQLAgent(AgentWithSimplePolicy):
Environment with continuous states and discrete actions.
gamma : double, default: 1.0
Discount factor in [0, 1].
horizon : int
horizon : int, default: 50
Horizon of the objective function.
bonus_scale_factor : double, default: 1.0
Constant by which to multiply the exploration bonus, controls
the level of exploration.
bonus_type : {"simplified_bernstein"}
bonus_type : string, default: "simplified_bernstein"
Type of exploration bonus. Currently, only "simplified_bernstein"
is implemented.
Attributes
----------
gamma : double, default: 1.0
Discount factor in [0, 1].
horizon : int, default: 50
Horizon of the objective function.
bonus_scale_factor : double, default: 1.0
Constant by which to multiply the exploration bonus, controls
the level of exploration.
bonus_type : string, default: "simplified_bernstein"
Type of exploration bonus. Currently, only "simplified_bernstein"
is implemented.
v_max : ndarray
Array of the maximum state value as a function of the horizon.
Qtree : MDPTreePartition
Tree structure representing the MDP transition model.
episode : int
Number of episodes completed during training of the AdaptiveQL agent.
References
----------
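A minimal usage sketch for the class documented above (not part of this commit). The import path, the choice of `MountainCar-v0` as a bounded continuous-state / discrete-action environment, and the meaning of `budget` are assumptions.

```python
# Hypothetical usage sketch -- constructor arguments taken from the docstring above.
from rlberry.agents.adaptiveql import AdaptiveQLAgent  # import path assumed
from rlberry.envs import gym_make                      # helper assumed to exist

env = gym_make("MountainCar-v0")  # bounded Box observations, discrete actions (assumption)
agent = AdaptiveQLAgent(
    env,
    gamma=1.0,
    horizon=50,
    bonus_scale_factor=1.0,
    bonus_type="simplified_bernstein",
)
agent.fit(budget=200)              # budget interpreted as training episodes (assumption)
observation, info = env.reset()    # gymnasium-style reset assumed
action = agent.policy(observation)
```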
6 changes: 3 additions & 3 deletions rlberry/agents/agent.py
@@ -29,9 +29,9 @@ class Agent(ABC):
Parameters
----------
env : gym.Env or tuple (constructor, kwargs)
Environment used to fit the agent.
eval_env : gym.Env or tuple (constructor, kwargs)
env : :class:`gym.Env` or tuple (constructor, kwargs)
Environment on which to train the agent.
eval_env : :class:`gym.Env` or tuple (constructor, kwargs)
Environment on which to evaluate the agent. If None, copied from env.
copy_env : bool
If true, makes a deep copy of the environment.
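A short sketch of the two accepted `env` forms described in the docstring above; the `gym_make` helper and its keyword are assumptions, not part of the commit.

```python
# Sketch of the two env forms accepted by Agent subclasses (per the docstring above).
from rlberry.envs import gym_make          # constructor helper, assumed available

# 1) an already-instantiated environment
env_instance = gym_make("CartPole-v1")

# 2) a (constructor, kwargs) tuple; the agent builds the environment itself,
#    which is convenient when it must be rebuilt in worker processes.
env_tuple = (gym_make, dict(id="CartPole-v1"))
```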
58 changes: 58 additions & 0 deletions rlberry/agents/torch/dqn/dqn.py
@@ -128,6 +128,64 @@ class DQNAgent(AgentTorch, AgentWithSimplePolicy):
eval_interval : int, default = None
Interval (in number of transitions) between agent evaluations in fit().
If None, never evaluate.
Attributes
----------
gamma : float, default: 0.99
Discount factor used to discount future rewards in the Bellman equation.
batch_size : int, default: 32
Batch size used during the training process.
chunk_size : int, default: 8
Length of sub-trajectories sampled from the replay buffer.
lambda_ : float, default: 0.5
Q(lambda) parameter used by the Q(lambda) algorithm for computing targets.
target_update_parameter : int or float
The parameter that controls the update frequency of the target network.
If int: interval (in number of total online updates) between updates of the target network.
If float: soft update coefficient, which controls the rate at which the target network approaches
the online network.
device : str
Torch device on which the agent's neural networks are placed. Use "cuda:best" to choose the best
available GPU device.
learning_rate : float, default: 1e-3
Learning rate used by the optimizer during neural network training.
epsilon_init : float, default: 1.0
Initial epsilon value for epsilon-greedy exploration. Epsilon-greedy policy is used to balance
exploration and exploitation during training.
epsilon_final : float, default: 0.1
Final epsilon value for epsilon-greedy exploration. Epsilon will approach this value as the agent
gains more experience.
epsilon_decay_interval : int
Number of timesteps over which epsilon is decayed from `epsilon_init` to `epsilon_final`.
loss_function : {"l1", "l2", "smooth_l1"}, default: "l2"
The loss function used to compute the Bellman error during training. The available options are
Mean Absolute Error ("l1"), Mean Squared Error ("l2"), and Smooth L1 Loss ("smooth_l1").
optimizer_type : {"ADAM", "RMS_PROP"}
The optimization algorithm used during neural network training. Choose between ADAM and RMS_PROP.
q_net_constructor : Callable, str or None
Function/constructor that returns a torch module for the Q-network.
Example: use `rlberry.agents.torch.utils.training.model_factory_from_env` and `q_net_kwargs`
parameter to modify the neural network.
q_net_kwargs : optional, dict
Parameters for `q_net_constructor`.
use_double_dqn : bool, default: False
If True, use Double DQN algorithm, which helps to reduce overestimation bias in Q-value estimates.
use_prioritized_replay : bool, default: False
If True, use Prioritized Experience Replay, which prioritizes transitions in the replay buffer
based on their TD-errors, to improve the learning process.
train_interval : int
The agent updates the model every `train_interval` steps. If -1, the agent only trains at the end
of each episode.
gradient_steps : int
The number of gradient steps to perform at each model update. If -1, the number of timesteps since
the last update will be used.
max_replay_size : int
The maximum number of transitions allowed in the replay buffer.
learning_starts : int
The number of transitions to collect before learning starts.
eval_interval : int, default: None
The interval (in number of transitions) between agent evaluations in the `fit()` method. If None,
the agent won't evaluate during training.
"""

name = "DQN"
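A hedged sketch tying a few of the attributes above to a typical construction of the agent; the import path, the `gym_make` helper, the assumption that these attributes mirror constructor keywords, and the unit of `budget` are all assumptions.

```python
# Hypothetical DQN setup -- keyword names mirror the attributes documented above.
from rlberry.agents.torch import DQNAgent  # import path assumed
from rlberry.envs import gym_make          # helper assumed to exist

env = gym_make("CartPole-v1")
agent = DQNAgent(
    env,
    gamma=0.99,
    batch_size=32,
    learning_rate=1e-3,
    epsilon_init=1.0,
    epsilon_final=0.1,
    epsilon_decay_interval=10_000,
    target_update_parameter=0.005,  # float -> soft update coefficient (see attribute doc)
    loss_function="l2",
    use_double_dqn=True,            # Double DQN target, as described above
)
agent.fit(budget=20_000)            # budget counted in environment transitions (assumption)
```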
67 changes: 67 additions & 0 deletions rlberry/agents/torch/ppo/ppo.py
@@ -112,6 +112,73 @@ class PPOAgent(AgentTorch, AgentWithSimplePolicy):
device: str
Device on which to put the tensors. 'cuda:best' by default.
Attributes
----------
__value_losses__ : list
List of supported value loss types. ["clipped", "mse", "avec"]
__lr_schedule___ : list
List of supported learning rate schedule types. ["constant", "linear"]
copy_env : bool
If True, copy the environment to create multiple environments for parallel interaction.
n_envs : int
Number of environments used by the agent.
n_steps : int
Number of transitions to collect in each environment per update.
batch_size : int
Size of mini batches during each PPO update epoch.
gamma : float
Discount factor used to discount future rewards.
k_epochs : int
Number of PPO epochs per update.
clip_eps : float
PPO clipping range (epsilon).
target_kl : float
Target KL divergence for early stopping. If None, early stopping is disabled.
normalize_advantages : bool
Whether or not to normalize advantages.
gae_lambda : float
Lambda parameter for TD(lambda) and Generalized Advantage Estimation.
entr_coef : float
Entropy coefficient. Controls the contribution of entropy regularization to the policy's objective.
vf_coef : float
Value function loss coefficient. Controls the contribution of the value function loss to the total loss.
value_loss : str
Type of value loss used. Can be "mse", "clipped", or "avec".
max_grad_norm : float
Maximum norm of the gradient of both actor and critic networks. Used for gradient clipping.
learning_rate : float
Learning rate used by the optimizer during neural network training.
lr_schedule : str
Learning rate schedule used during training. Can be "constant" or "linear".
optimizer_type : str
Type of optimizer used during neural network training.
n_eval_episodes : int
Number of episodes used for evaluation.
eval_horizon : int
Maximum number of steps per episode during evaluation.
eval_freq : int
Number of updates between evaluations. If None, no evaluation is performed.
policy_net_fn : function(env, **kwargs)
Function that returns an instance of a policy network (PyTorch).
policy_net_kwargs : dict
Keyword arguments for `policy_net_fn`.
value_net_fn : function(env, **kwargs)
Function that returns an instance of a value network (PyTorch).
value_net_kwargs : dict
Keyword arguments for `value_net_fn`.
eval_env : rlberry.Env
The environment used for evaluation. If None, the same environment as env is used.
state_dim : int
Dimensionality of the continuous state space of the environment.
policy_net : torch.nn.Module
The policy network used by the agent.
value_net : torch.nn.Module
The value network used by the agent.
device : str
Torch device on which the agent's neural networks are placed.
optimizer_kwargs : dict
Keyword arguments for the optimizer used during neural network training.
References
----------
Schulman, J., Wolski, F., Dhariwal, P., Radford, A. & Klimov, O. (2017).
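A hedged sketch of constructing the agent from the attributes documented above; the import path, the tuple-based environment specification, the assumption that these attributes mirror constructor keywords, and the unit of `budget` are assumptions.

```python
# Hypothetical PPO setup -- keyword names mirror the attributes documented above.
from rlberry.agents.torch import PPOAgent  # import path assumed
from rlberry.envs import gym_make          # helper assumed to exist

env_spec = (gym_make, dict(id="CartPole-v1"))  # (constructor, kwargs) so n_envs copies can be built
agent = PPOAgent(
    env_spec,
    n_envs=4,
    n_steps=256,        # transitions collected per environment before each update
    batch_size=64,
    k_epochs=10,
    clip_eps=0.2,
    gae_lambda=0.95,
    value_loss="mse",
)
agent.fit(budget=50_000)  # budget counted in timesteps (assumption)
```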
37 changes: 33 additions & 4 deletions rlberry/agents/torch/reinforce/reinforce.py
@@ -22,10 +22,10 @@ class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy):
----------
env : Model
Online model with continuous (Box) state space and discrete actions
batch_size : int
Number of episodes to wait before updating the policy.
horizon : int
Horizon.
batch_size : int, default: 8
Number of episodes used for the update of the policy network.
horizon : int, default: 256
Episode length (number of transitions per episode), so the total number of transitions used for one policy update is batch_size * horizon.
gamma : double
Discount factor in [0, 1].
entr_coef : double
@@ -47,6 +47,35 @@ class REINFORCEAgent(AgentTorch, AgentWithSimplePolicy):
device: str
Device to put the tensors on
Attributes
----------
device : str
Torch device on which the agent's neural networks are placed.
batch_size : int, default: 8
Number of episodes used for the update of the policy network.
horizon : int, default: 256
Episode length (number of transitions per episode).
gamma : float, default: 0.99
Discount factor used to discount future rewards in the Bellman equation.
state_dim : int
Dimensionality of the continuous state space of the environment.
action_dim : int
Number of discrete actions available in the environment.
policy_net_fn : function(env, **kwargs)
Function that returns an instance of a policy network (PyTorch).
policy_net_kwargs : dict
Keyword arguments for `policy_net_fn`.
optimizer_kwargs : dict
Keyword arguments for the optimizer used during neural network training.
policy_net : torch.nn.Module
The policy network used by the agent.
policy_optimizer : torch.optim.Optimizer
The optimizer used for training the policy network.
memory : Memory
The memory buffer used to store the agent's experiences.
episode : int
A counter that keeps track of the number of episodes.
References
----------
Williams, Ronald J.,
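A short sketch making the batch_size * horizon relationship concrete; the import path and the meaning of `budget` are assumptions.

```python
# Hypothetical REINFORCE setup -- one policy update consumes up to
# batch_size * horizon transitions (8 * 256 = 2048 here).
from rlberry.agents.torch import REINFORCEAgent  # import path assumed
from rlberry.envs import gym_make                # helper assumed to exist

env = gym_make("CartPole-v1")
agent = REINFORCEAgent(env, batch_size=8, horizon=256, gamma=0.99)
agent.fit(budget=400)  # budget interpreted as training episodes (assumption)
```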
13 changes: 13 additions & 0 deletions rlberry/envs/bandits/bandit_base.py
@@ -22,6 +22,19 @@ class Bandit(Model):
**kwargs: keyword arguments
additional arguments sent to :class:`~rlberry.envs.interface.Model`
Attributes
----------
laws: list
Laws of the arms. Can either be a frozen scipy law or any class that
has a method .rvs().
n_arms: int
Number of arms.
action_space: spaces.Discrete
Action space when viewing the bandit as a single-state MDP.
rewards: list
Rewards pre-sampled for each arm (10 samples per arm).
n_rewards: list
Reward counter per arm.
"""

name = ""
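A sketch of building a bandit from frozen scipy laws, matching the `laws` attribute described above; the import path, the constructor keyword, and the step return layout are assumptions.

```python
# Hypothetical 3-armed Gaussian bandit built from frozen scipy laws.
from scipy.stats import norm
from rlberry.envs.bandits import Bandit  # import path assumed from the file above

laws = [norm(loc=mu, scale=1.0) for mu in (0.1, 0.5, 0.9)]  # one frozen law per arm
env = Bandit(laws=laws)      # keyword name assumed
env.reset()
step_output = env.step(1)    # pull arm 1; the reward is drawn via laws[1].rvs()
reward = step_output[1]      # reward position assumed (gym-style tuple)
```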
