Feat Sebulba recurrent IQL #1148

Open
wants to merge 11 commits into base: develop
11 changes: 11 additions & 0 deletions mava/configs/default/rec_iql_sebulba.yaml
@@ -0,0 +1,11 @@
defaults:
- logger: logger
- arch: sebulba
- system: q_learning/rec_iql
- network: rnn # [rnn, rcnn]
- env: lbf_gym # [rware_gym, lbf_gym, smac_gym]
- _self_

hydra:
searchpath:
- file://mava/configs
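This default config composes the Sebulba architecture with the recurrent IQL system, an RNN network and a Gym-style environment. A minimal sketch of loading it through Hydra's Compose API follows; the working directory, override keys and values are assumptions for illustration, not something prescribed by the PR.

from hydra import compose, initialize

# Illustrative only: assumes the script sits at the repository root and that
# config groups keep their group names as packages (e.g. cfg.system.*).
with initialize(version_base=None, config_path="mava/configs/default"):
    cfg = compose(
        config_name="rec_iql_sebulba",
        overrides=["env=smac_gym", "system.rollout_length=4"],
    )
print(cfg.system.rollout_length)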
10 changes: 7 additions & 3 deletions mava/configs/system/q_learning/rec_iql.yaml
@@ -11,13 +11,13 @@ add_agent_id: True
min_buffer_size: 32
update_batch_size: 1 # Number of vectorised gradient updates per device.

rollout_length: 2 # Number of environment steps per vectorised environment.
rollout_length: 2 # Number of environment steps per vectorised environment.
epochs: 2 # Number of learn epochs per training data batch.

# sizes
buffer_size: 5000 # size of the replay buffer. Note: total size is this * num_devices
buffer_size: 1000 # size of the replay buffer. Note: total size is this * num_devices
sample_batch_size: 32 # size of training data batch sampled from the buffer
sample_sequence_length: 20 # 20 transitions are sampled, giving 19 complete data points
sample_sequence_length: 32 # 32 transitions are sampled, giving 31 complete data points

# learning rates
q_lr: 3e-4 # the learning rate of the Q network optimizer
@@ -31,3 +31,7 @@ gamma: 0.99 # discount factor

eps_min: 0.05
eps_decay: 1e5

# --- Sebulba parameters ---
data_sample_mean: 150 # Average number of times the learner should sample each item from the replay buffer.
error_tolerance: 2 # Tolerance for how much the learner/actor can sample/insert before being blocked. Must be greater than 2 to avoid deadlocks.
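A rough sketch of how these two values can drive a sample-to-insert rate limiter between the actor and learner threads; the class below is illustrative only and does not reflect Mava's actual implementation.

import threading

class SampleInsertRateLimiter:
    """Illustrative rate limiter: keeps samples roughly equal to
    data_sample_mean * inserts, blocking whichever side (actor insert or
    learner sample) runs too far ahead."""

    def __init__(self, data_sample_mean: float, error_tolerance: float):
        self.spi = data_sample_mean   # target samples per inserted item
        self.tol = error_tolerance    # allowed slack, in items, either way
        self.inserts = 0.0
        self.samples = 0.0
        self.cond = threading.Condition()

    def await_insert(self) -> None:
        # Actor blocks if it is more than `tol` items ahead of what the
        # learner has consumed so far.
        with self.cond:
            self.cond.wait_for(lambda: self.inserts - self.samples / self.spi < self.tol)
            self.inserts += 1
            self.cond.notify_all()

    def await_sample(self) -> None:
        # Learner blocks if it has sampled more than `tol` items' worth
        # beyond what the actor has inserted.
        with self.cond:
            self.cond.wait_for(lambda: self.samples / self.spi - self.inserts < self.tol)
            self.samples += 1
            self.cond.notify_all()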
3 changes: 1 addition & 2 deletions mava/evaluator.py
@@ -290,8 +290,7 @@ def _episode(key: PRNGKey) -> Tuple[PRNGKey, Metrics]:
if config.env.log_win_rate:
metrics["won_episode"] = timesteps.extras["won_episode"]

# find the first instance of done to get the metrics at that timestep, we don't
# care about subsequent steps because we only the results from the first episode
# Find the first instance of done to get the metrics at that timestep.
done_idx = np.argmax(timesteps.last(), axis=0)
metrics = tree.map(lambda m: m[done_idx, np.arange(n_parallel_envs)], metrics)
del metrics["is_terminal_step"] # unneeded for logging
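The shortened comment leans on np.argmax returning the index of the first maximum, so on a boolean done mask it picks out the first completed episode per environment. A standalone illustration with made-up values:

import numpy as np

# dones: [time, n_envs]; True marks the last step of an episode.
dones = np.array([[False, False],
                  [True,  False],
                  [True,  True]])
# returns: [time, n_envs]; running episode return logged at each step.
returns = np.array([[1.0, 2.0],
                    [5.0, 3.0],
                    [9.0, 7.0]])

done_idx = np.argmax(dones, axis=0)                      # first True per env -> [1, 2]
first_ep = returns[done_idx, np.arange(dones.shape[1])]  # -> [5.0, 7.0]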
6 changes: 3 additions & 3 deletions mava/systems/q_learning/anakin/rec_qmix.py
@@ -44,7 +44,7 @@
TrainState,
Transition,
)
from mava.types import MarlEnv, Observation
from mava.types import MarlEnv, MavaObservation
from mava.utils import make_env as environments
from mava.utils.checkpointing import Checkpointer
from mava.utils.config import check_total_timesteps
@@ -241,7 +241,7 @@ def make_update_fns(
) -> Callable[[LearnerState[QMIXParams]], Tuple[LearnerState[QMIXParams], Tuple[Metrics, Metrics]]]:
def select_eps_greedy_action(
action_selection_state: ActionSelectionState,
obs: Observation,
obs: MavaObservation,
term_or_trunc: Array,
) -> Tuple[ActionSelectionState, Array]:
"""Select action to take in eps-greedy way. Batch and agent dims are included."""
@@ -310,7 +310,7 @@ def action_step(action_state: ActionState, _: Any) -> Tuple[ActionState, Dict]:

return new_act_state, next_timestep.extras["episode_metrics"]

def prep_inputs_to_scannedrnn(obs: Observation, term_or_trunc: chex.Array) -> chex.Array:
def prep_inputs_to_scannedrnn(obs: MavaObservation, term_or_trunc: chex.Array) -> chex.Array:
"""Prepares the inputs to the RNN network for either getting q values or the
eps-greedy distribution.

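For context, preparing per-step inputs for a scanned RNN typically means adding a leading time axis of length one; a minimal sketch under that assumption (the shapes are illustrative, not necessarily what Mava's ScannedRNN expects):

import jax.numpy as jnp

def add_time_dim(agents_view: jnp.ndarray, term_or_trunc: jnp.ndarray):
    """Illustrative: [batch, n_agents, ...] -> [1, batch, n_agents, ...] so the
    RNN can scan over a time axis of length one when acting step by step."""
    return agents_view[jnp.newaxis, ...], term_or_trunc[jnp.newaxis, ...]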