From 47eccbe0f812b6f6b88bf649edae768f5dbec1d0 Mon Sep 17 00:00:00 2001 From: kywch Date: Thu, 28 Mar 2024 15:30:06 -0700 Subject: [PATCH] switched to pyproject.toml, applied ruff --- .github/workflows/workflow.yml | 24 + .pre-commit-config.yaml | 9 + agent_zoo/neurips23_start_kit/__init__.py | 2 +- .../neurips23_start_kit/baseline_policy.py | 455 +++++++++--------- .../neurips23_start_kit/reward_wrapper.py | 23 +- analysis/proc_eval_result.py | 191 ++++---- analysis/proc_task_cond_result.py | 91 ++-- analysis/run_task_conditioning.py | 55 ++- curriculum_generation/curriculum_tutorial.py | 7 + curriculum_generation/manual_curriculum.py | 383 +++++++-------- curriculum_generation/task_encoder.py | 69 ++- curriculum_generation/task_sampler.py | 38 +- evaluate.py | 182 ++++--- neurips23_evaluation/export_embeddings.py | 32 +- .../heldout_evaluation_task.py | 44 +- .../sample_evaluation_task.py | 16 +- pyproject.toml | 121 +++++ reinforcement_learning/clean_pufferl.py | 298 ++++++------ reinforcement_learning/environment.py | 32 +- reinforcement_learning/stat_wrapper.py | 167 +++---- requirements.txt | 9 - tests/test_task_encoder.py | 84 ++-- train.py | 135 +++--- train_helper.py | 146 +++--- 24 files changed, 1460 insertions(+), 1153 deletions(-) create mode 100644 .github/workflows/workflow.yml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml new file mode 100644 index 00000000..8601682d --- /dev/null +++ b/.github/workflows/workflow.yml @@ -0,0 +1,24 @@ +name: tox +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + py: ["3.8", "3.9", "3.10"] + steps: + - name: Setup python for test ${{ matrix.py }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.py }} + - uses: actions/checkout@v3 + - name: Upgrade pip + run: python -m pip install -U pip setuptools wheel cython + - name: Install + run: python -m pip install -e '.[dev]' + - name: Check formatting + run: ruff format . + - name: Check lint + run: ruff check . \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..8f03df78 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.3.2 + hooks: + # Run the linter. + - id: ruff + # Run the formatter. + - id: ruff-format \ No newline at end of file diff --git a/agent_zoo/neurips23_start_kit/__init__.py b/agent_zoo/neurips23_start_kit/__init__.py index 6bce0e48..2f7ed092 100644 --- a/agent_zoo/neurips23_start_kit/__init__.py +++ b/agent_zoo/neurips23_start_kit/__init__.py @@ -1,3 +1,3 @@ from .baseline_policy import Baseline as Policy from .baseline_policy import Recurrent -from .reward_wrapper import RewardWrapper \ No newline at end of file +from .reward_wrapper import RewardWrapper diff --git a/agent_zoo/neurips23_start_kit/baseline_policy.py b/agent_zoo/neurips23_start_kit/baseline_policy.py index 933f9116..8525250f 100644 --- a/agent_zoo/neurips23_start_kit/baseline_policy.py +++ b/agent_zoo/neurips23_start_kit/baseline_policy.py @@ -11,264 +11,271 @@ EntityId = EntityState.State.attr_name_to_col["id"] # NOTE: a workaround for the torch.complier problem. 
TODO: try torch 2.2 -#unpack_batched_obs = torch.compiler.disable(unpack_batched_obs) +# unpack_batched_obs = torch.compiler.disable(unpack_batched_obs) class Recurrent(pufferlib.models.RecurrentWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Baseline(pufferlib.models.Policy): - '''Improved baseline policy by JimyhZhu''' - def __init__(self, env, input_size=256, hidden_size=256, task_size=2048): - super().__init__(env) - - self.unflatten_context = env.unflatten_context - - self.tile_encoder = TileEncoder(input_size) - self.player_encoder = PlayerEncoder(input_size, hidden_size) - self.item_encoder = ItemEncoder(input_size, hidden_size) - self.inventory_encoder = InventoryEncoder(input_size, hidden_size) - self.market_encoder = MarketEncoder(input_size, hidden_size) - self.task_encoder = TaskEncoder(input_size, hidden_size, task_size) - self.proj_fc = torch.nn.Linear(6 * input_size, input_size) - self.action_decoder = ActionDecoder(input_size, hidden_size) - self.value_head = torch.nn.Linear(hidden_size, 1) - - def encode_observations(self, flat_observations): - env_outputs = unpack_batched_obs(flat_observations, self.unflatten_context) - tile = self.tile_encoder(env_outputs["Tile"]) - player_embeddings, my_agent = self.player_encoder( - env_outputs["Entity"], env_outputs["AgentId"][:, 0] - ) - - item_embeddings = self.item_encoder(env_outputs["Inventory"]) - market_embeddings = self.item_encoder(env_outputs["Market"]) # no_pooling - market = self.market_encoder(market_embeddings) # fc +mean pooling already applied - task = self.task_encoder(env_outputs["Task"]) - pooled_item_embeddings = item_embeddings.mean(dim=1) - pooled_player_embeddings = player_embeddings.mean(dim=1) - obs = torch.cat([tile, my_agent, pooled_player_embeddings, pooled_item_embeddings, market, task], dim=-1) - obs = self.proj_fc(obs) - - # Pad the embeddings to make them the same size to the action_decoder - # This is a workaround for the fact that the action_decoder expects the same number of actions including no-op - embeddings = [player_embeddings, item_embeddings, market_embeddings] - padded_embeddings = [] - for embedding in embeddings: - padding_size = 1 # The size of padding to be added - padding = torch.zeros(embedding.size(0), padding_size, embedding.size(2), device=embedding.device) - padded_embedding = torch.cat([embedding, padding], dim=1) - padded_embeddings.append(padded_embedding) - # Replace the original embeddings with the padded versions - player_embeddings, item_embeddings, market_embeddings = padded_embeddings - - return obs, ( - player_embeddings, - item_embeddings, - market_embeddings, - env_outputs["ActionTargets"], - ) - - def decode_actions(self, hidden, lookup): - actions = self.action_decoder(hidden, lookup) - value = self.value_head(hidden) - return actions, value + """Improved baseline policy by JimyhZhu""" + + def __init__(self, env, input_size=256, hidden_size=256, task_size=2048): + super().__init__(env) + + self.unflatten_context = env.unflatten_context + + self.tile_encoder = TileEncoder(input_size) + self.player_encoder = PlayerEncoder(input_size, hidden_size) + self.item_encoder = ItemEncoder(input_size, hidden_size) + self.inventory_encoder = InventoryEncoder(input_size, hidden_size) + self.market_encoder = MarketEncoder(input_size, hidden_size) + self.task_encoder = TaskEncoder(input_size, hidden_size, task_size) + self.proj_fc = torch.nn.Linear(6 * input_size, 
input_size) + self.action_decoder = ActionDecoder(input_size, hidden_size) + self.value_head = torch.nn.Linear(hidden_size, 1) + + def encode_observations(self, flat_observations): + env_outputs = unpack_batched_obs(flat_observations, self.unflatten_context) + tile = self.tile_encoder(env_outputs["Tile"]) + player_embeddings, my_agent = self.player_encoder( + env_outputs["Entity"], env_outputs["AgentId"][:, 0] + ) + + item_embeddings = self.item_encoder(env_outputs["Inventory"]) + market_embeddings = self.item_encoder(env_outputs["Market"]) # no_pooling + market = self.market_encoder(market_embeddings) # fc +mean pooling already applied + task = self.task_encoder(env_outputs["Task"]) + pooled_item_embeddings = item_embeddings.mean(dim=1) + pooled_player_embeddings = player_embeddings.mean(dim=1) + obs = torch.cat( + [tile, my_agent, pooled_player_embeddings, pooled_item_embeddings, market, task], dim=-1 + ) + obs = self.proj_fc(obs) + + # Pad the embeddings to make them the same size to the action_decoder + # This is a workaround for the fact that the action_decoder expects the same number of actions including no-op + embeddings = [player_embeddings, item_embeddings, market_embeddings] + padded_embeddings = [] + for embedding in embeddings: + padding_size = 1 # The size of padding to be added + padding = torch.zeros( + embedding.size(0), padding_size, embedding.size(2), device=embedding.device + ) + padded_embedding = torch.cat([embedding, padding], dim=1) + padded_embeddings.append(padded_embedding) + # Replace the original embeddings with the padded versions + player_embeddings, item_embeddings, market_embeddings = padded_embeddings + + return obs, ( + player_embeddings, + item_embeddings, + market_embeddings, + env_outputs["ActionTargets"], + ) + + def decode_actions(self, hidden, lookup): + actions = self.action_decoder(hidden, lookup) + value = self.value_head(hidden) + return actions, value class TileEncoder(torch.nn.Module): - def __init__(self, input_size): - super().__init__() - self.tile_offset = torch.tensor([i * 256 for i in range(3)]) - self.embedding = torch.nn.Embedding(3 * 256, 32) + def __init__(self, input_size): + super().__init__() + self.tile_offset = torch.tensor([i * 256 for i in range(3)]) + self.embedding = torch.nn.Embedding(3 * 256, 32) - self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) - self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) - self.tile_fc = torch.nn.Linear(8 * 11 * 11, input_size) + self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) + self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) + self.tile_fc = torch.nn.Linear(8 * 11 * 11, input_size) - def forward(self, tile): - tile[:, :, :2] -= tile[:, 112:113, :2].clone() - tile[:, :, :2] += 7 - tile = self.embedding( - tile.long().clip(0, 255) + self.tile_offset.to(tile.device) - ) + def forward(self, tile): + tile[:, :, :2] -= tile[:, 112:113, :2].clone() + tile[:, :, :2] += 7 + tile = self.embedding(tile.long().clip(0, 255) + self.tile_offset.to(tile.device)) - agents, tiles, features, embed = tile.shape - tile = ( - tile.view(agents, tiles, features * embed) - .transpose(1, 2) - .view(agents, features * embed, 15, 15) - ) + agents, tiles, features, embed = tile.shape + tile = ( + tile.view(agents, tiles, features * embed) + .transpose(1, 2) + .view(agents, features * embed, 15, 15) + ) - tile = F.relu(self.tile_conv_1(tile)) - tile = F.relu(self.tile_conv_2(tile)) - tile = tile.contiguous().view(agents, -1) - tile = F.relu(self.tile_fc(tile)) + tile = F.relu(self.tile_conv_1(tile)) + tile = 
F.relu(self.tile_conv_2(tile)) + tile = tile.contiguous().view(agents, -1) + tile = F.relu(self.tile_fc(tile)) - return tile + return tile class PlayerEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.entity_dim = 31 - self.num_classes_npc_type = 5 # only using npc_type for one hot (0,4) - self.agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, hidden_size) - self.my_agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, input_size) - - def forward(self, agents, my_id): - npc_type = agents[:, :, 1] - one_hot_npc_type = F.one_hot(npc_type.long(), - num_classes=self.num_classes_npc_type).float() # Subtract 1 if npc_type starts from 1 - one_hot_agents = torch.cat([agents[:, :, :1], one_hot_npc_type, agents[:, :, 2:]], dim=-1).float() - - agent_ids = one_hot_agents[:, :, EntityId] - mask = (agent_ids == my_id.unsqueeze(1)) & (agent_ids != 0) - mask = mask.int() - row_indices = torch.where( - mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1)) - ) - - # batch, agent, attrs, embed = agent_embeddings.shape - my_agent_embeddings = one_hot_agents[ - torch.arange(one_hot_agents.shape[0]), row_indices - ] - agent_embeddings = self.agent_fc(one_hot_agents.cuda()) - my_agent_embeddings = self.my_agent_fc(my_agent_embeddings) - my_agent_embeddings = F.relu(my_agent_embeddings) - - return agent_embeddings, my_agent_embeddings + def __init__(self, input_size, hidden_size): + super().__init__() + self.entity_dim = 31 + self.num_classes_npc_type = 5 # only using npc_type for one hot (0,4) + self.agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, hidden_size) + self.my_agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, input_size) + + def forward(self, agents, my_id): + npc_type = agents[:, :, 1] + one_hot_npc_type = F.one_hot( + npc_type.long(), num_classes=self.num_classes_npc_type + ).float() # Subtract 1 if npc_type starts from 1 + one_hot_agents = torch.cat( + [agents[:, :, :1], one_hot_npc_type, agents[:, :, 2:]], dim=-1 + ).float() + + agent_ids = one_hot_agents[:, :, EntityId] + mask = (agent_ids == my_id.unsqueeze(1)) & (agent_ids != 0) + mask = mask.int() + row_indices = torch.where( + mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1)) + ) + + # batch, agent, attrs, embed = agent_embeddings.shape + my_agent_embeddings = one_hot_agents[torch.arange(one_hot_agents.shape[0]), row_indices] + agent_embeddings = self.agent_fc(one_hot_agents.cuda()) + my_agent_embeddings = self.my_agent_fc(my_agent_embeddings) + my_agent_embeddings = F.relu(my_agent_embeddings) + + return agent_embeddings, my_agent_embeddings class ItemEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(18 + 2 + 14 - 2, hidden_size) - self.discrete_idxs = [1, 14] - self.discrete_offset = torch.Tensor([2, 0]) - self.continuous_idxs = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] - self.continuous_scale = torch.Tensor([1/100] * 12) - - def forward(self, items): - if self.discrete_offset.device != items.device: - self.discrete_offset = self.discrete_offset.to(items.device) - self.continuous_scale = self.continuous_scale.to(items.device) - - one_hot_discrete_equipped = F.one_hot(items[:, :, 14].long(), num_classes=2).float() - one_hot_discrete_type_id = F.one_hot(items[:, :, 1].long(), num_classes=18).float() - one_hot_discrete = torch.concat([one_hot_discrete_type_id, one_hot_discrete_equipped], dim=-1) # - continuous = items[:, :, self.continuous_idxs] * self.continuous_scale - 
item_embeddings = torch.cat([one_hot_discrete, continuous], dim=-1).float() - item_embeddings = self.fc(item_embeddings) - return item_embeddings + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(18 + 2 + 14 - 2, hidden_size) + self.discrete_idxs = [1, 14] + self.discrete_offset = torch.Tensor([2, 0]) + self.continuous_idxs = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] + self.continuous_scale = torch.Tensor([1 / 100] * 12) + + def forward(self, items): + if self.discrete_offset.device != items.device: + self.discrete_offset = self.discrete_offset.to(items.device) + self.continuous_scale = self.continuous_scale.to(items.device) + + one_hot_discrete_equipped = F.one_hot(items[:, :, 14].long(), num_classes=2).float() + one_hot_discrete_type_id = F.one_hot(items[:, :, 1].long(), num_classes=18).float() + one_hot_discrete = torch.concat( + [one_hot_discrete_type_id, one_hot_discrete_equipped], dim=-1 + ) # + continuous = items[:, :, self.continuous_idxs] * self.continuous_scale + item_embeddings = torch.cat([one_hot_discrete, continuous], dim=-1).float() + item_embeddings = self.fc(item_embeddings) + return item_embeddings class InventoryEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(12 * hidden_size, input_size) + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(12 * hidden_size, input_size) - def forward(self, inventory): - agents, items, hidden = inventory.shape - inventory = inventory.view(agents, items * hidden) - return self.fc(inventory) + def forward(self, inventory): + agents, items, hidden = inventory.shape + inventory = inventory.view(agents, items * hidden) + return self.fc(inventory) class MarketEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(hidden_size, input_size) + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(hidden_size, input_size) - def forward(self, market): - return self.fc(market).mean(-2) + def forward(self, market): + return self.fc(market).mean(-2) class TaskEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size, task_size): - super().__init__() - self.fc = torch.nn.Linear(task_size, input_size) + def __init__(self, input_size, hidden_size, task_size): + super().__init__() + self.fc = torch.nn.Linear(task_size, input_size) - def forward(self, task): - return self.fc(task.clone().float()) + def forward(self, task): + return self.fc(task.clone().float()) class ActionDecoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.layers = torch.nn.ModuleDict( - { - "attack_style": torch.nn.Linear(hidden_size, 3), - "attack_target": torch.nn.Linear(hidden_size, hidden_size), - "market_buy": torch.nn.Linear(hidden_size, hidden_size), - "inventory_destroy": torch.nn.Linear(hidden_size, hidden_size), - "inventory_give_item": torch.nn.Linear(hidden_size, hidden_size), - "inventory_give_player": torch.nn.Linear(hidden_size, hidden_size), - "gold_quantity": torch.nn.Linear(hidden_size, 99), - "gold_target": torch.nn.Linear(hidden_size, hidden_size), - "move": torch.nn.Linear(hidden_size, 5), - "inventory_sell": torch.nn.Linear(hidden_size, hidden_size), - "inventory_price": torch.nn.Linear(hidden_size, 99), - "inventory_use": torch.nn.Linear(hidden_size, hidden_size), + def __init__(self, input_size, hidden_size): + super().__init__() + 
self.layers = torch.nn.ModuleDict( + { + "attack_style": torch.nn.Linear(hidden_size, 3), + "attack_target": torch.nn.Linear(hidden_size, hidden_size), + "market_buy": torch.nn.Linear(hidden_size, hidden_size), + "inventory_destroy": torch.nn.Linear(hidden_size, hidden_size), + "inventory_give_item": torch.nn.Linear(hidden_size, hidden_size), + "inventory_give_player": torch.nn.Linear(hidden_size, hidden_size), + "gold_quantity": torch.nn.Linear(hidden_size, 99), + "gold_target": torch.nn.Linear(hidden_size, hidden_size), + "move": torch.nn.Linear(hidden_size, 5), + "inventory_sell": torch.nn.Linear(hidden_size, hidden_size), + "inventory_price": torch.nn.Linear(hidden_size, 99), + "inventory_use": torch.nn.Linear(hidden_size, hidden_size), + } + ) + + def apply_layer(self, layer, embeddings, mask, hidden): + hidden = layer(hidden) + if hidden.dim() == 2 and embeddings is not None: + hidden = torch.matmul(embeddings, hidden.unsqueeze(-1)).squeeze(-1) + + if mask is not None: + hidden = hidden.masked_fill(mask == 0, -1e9) + + return hidden + + def forward(self, hidden, lookup): + ( + player_embeddings, + inventory_embeddings, + market_embeddings, + action_targets, + ) = lookup + + embeddings = { + "attack_target": player_embeddings, + "market_buy": market_embeddings, + "inventory_destroy": inventory_embeddings, + "inventory_give_item": inventory_embeddings, + "inventory_give_player": player_embeddings, + "gold_target": player_embeddings, + "inventory_sell": inventory_embeddings, + "inventory_use": inventory_embeddings, + } + + action_targets = { + "attack_style": action_targets["Attack"]["Style"], + "attack_target": action_targets["Attack"]["Target"], + "market_buy": action_targets["Buy"]["MarketItem"], + "inventory_destroy": action_targets["Destroy"]["InventoryItem"], + "inventory_give_item": action_targets["Give"]["InventoryItem"], + "inventory_give_player": action_targets["Give"]["Target"], + "gold_quantity": action_targets["GiveGold"]["Price"], + "gold_target": action_targets["GiveGold"]["Target"], + "move": action_targets["Move"]["Direction"], + "inventory_sell": action_targets["Sell"]["InventoryItem"], + "inventory_price": action_targets["Sell"]["Price"], + "inventory_use": action_targets["Use"]["InventoryItem"], } - ) - - def apply_layer(self, layer, embeddings, mask, hidden): - hidden = layer(hidden) - if hidden.dim() == 2 and embeddings is not None: - hidden = torch.matmul(embeddings, hidden.unsqueeze(-1)).squeeze(-1) - - if mask is not None: - hidden = hidden.masked_fill(mask == 0, -1e9) - - return hidden - - def forward(self, hidden, lookup): - ( - player_embeddings, - inventory_embeddings, - market_embeddings, - action_targets, - ) = lookup - - embeddings = { - "attack_target": player_embeddings, - "market_buy": market_embeddings, - "inventory_destroy": inventory_embeddings, - "inventory_give_item": inventory_embeddings, - "inventory_give_player": player_embeddings, - "gold_target": player_embeddings, - "inventory_sell": inventory_embeddings, - "inventory_use": inventory_embeddings, - } - - action_targets = { - "attack_style": action_targets["Attack"]["Style"], - "attack_target": action_targets["Attack"]["Target"], - "market_buy": action_targets["Buy"]["MarketItem"], - "inventory_destroy": action_targets["Destroy"]["InventoryItem"], - "inventory_give_item": action_targets["Give"]["InventoryItem"], - "inventory_give_player": action_targets["Give"]["Target"], - "gold_quantity": action_targets["GiveGold"]["Price"], - "gold_target": action_targets["GiveGold"]["Target"], - "move": 
action_targets["Move"]["Direction"], - "inventory_sell": action_targets["Sell"]["InventoryItem"], - "inventory_price": action_targets["Sell"]["Price"], - "inventory_use": action_targets["Use"]["InventoryItem"], - } - - actions = [] - for key, layer in self.layers.items(): - mask = None - mask = action_targets[key] - embs = embeddings.get(key) - - # NOTE: SHOULD not hit this - # if embs is not None and embs.shape[1] != mask.shape[1]: - # b, _, f = embs.shape - # zeros = torch.zeros([b, 1, f], dtype=embs.dtype, device=embs.device) - # embs = torch.cat([embs, zeros], dim=1) - - action = self.apply_layer(layer, embs, mask, hidden) - actions.append(action) - - return actions + + actions = [] + for key, layer in self.layers.items(): + mask = None + mask = action_targets[key] + embs = embeddings.get(key) + + # NOTE: SHOULD not hit this + # if embs is not None and embs.shape[1] != mask.shape[1]: + # b, _, f = embs.shape + # zeros = torch.zeros([b, 1, f], dtype=embs.dtype, device=embs.device) + # embs = torch.cat([embs, zeros], dim=1) + + action = self.apply_layer(layer, embs, mask, hidden) + actions.append(action) + + return actions diff --git a/agent_zoo/neurips23_start_kit/reward_wrapper.py b/agent_zoo/neurips23_start_kit/reward_wrapper.py index 7638ed50..b907740a 100644 --- a/agent_zoo/neurips23_start_kit/reward_wrapper.py +++ b/agent_zoo/neurips23_start_kit/reward_wrapper.py @@ -9,7 +9,6 @@ def __init__( eval_mode=False, early_stop_agent_num=0, stat_prefix=None, - # Custom reward wrapper args heal_bonus_weight=0, explore_bonus_weight=0, @@ -22,15 +21,15 @@ def __init__( self.clip_unique_event = clip_unique_event def reset(self, **kwargs): - '''Called at the start of each episode''' + """Called at the start of each episode""" self._reset_reward_vars() return super().reset(**kwargs) def _reset_reward_vars(self): self._history = { agent_id: { - 'prev_price': 0, - 'prev_moves': [], + "prev_price": 0, + "prev_moves": [], } for agent_id in self.env.possible_agents } @@ -43,20 +42,20 @@ def observation_space(self): """ def observation(self, agent_id, agent_obs): - '''Called before observations are returned from the environment + """Called before observations are returned from the environment Use this to define custom featurizers. Changing the space itself requires you to define the observation space again (i.e. 
Gym.spaces.Dict(gym.spaces....)) - ''' + """ # Mask the price of the previous action, to encourage agents to explore new prices - agent_obs['ActionTargets']['Sell']['Price'][self._history[agent_id]['prev_price']] = 0 + agent_obs["ActionTargets"]["Sell"]["Price"][self._history[agent_id]["prev_price"]] = 0 return agent_obs def action(self, agent_id, agent_atn): - '''Called before actions are passed from the model to the environment''' + """Called before actions are passed from the model to the environment""" # Keep track of the previous price and moves for each agent - self._history[agent_id]['prev_price'] = agent_atn['Sell']['Price'] - self._history[agent_id]['prev_moves'].append(agent_atn['Move']['Direction']) + self._history[agent_id]["prev_price"] = agent_atn["Sell"]["Price"] + self._history[agent_id]["prev_moves"].append(agent_atn["Move"]["Direction"]) return agent_atn def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncated, info): @@ -72,8 +71,8 @@ def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncat # The number of unique events are available in self._unique_events[agent_id] uniq = self._unique_events[agent_id] explore_bonus = 0 - if self.explore_bonus_weight > 0 and uniq['curr_count'] > uniq['prev_count']: - explore_bonus = min(self.clip_unique_event, uniq['curr_count'] - uniq['prev_count']) + if self.explore_bonus_weight > 0 and uniq["curr_count"] > uniq["prev_count"]: + explore_bonus = min(self.clip_unique_event, uniq["curr_count"] - uniq["prev_count"]) explore_bonus *= self.explore_bonus_weight reward += healing_bonus + explore_bonus diff --git a/analysis/proc_eval_result.py b/analysis/proc_eval_result.py index 815a2701..2531014b 100644 --- a/analysis/proc_eval_result.py +++ b/analysis/proc_eval_result.py @@ -8,85 +8,87 @@ # Make the table output simpler pl.Config.set_tbl_hide_dataframe_shape(True) -pl.Config.set_tbl_formatting('NOTHING') +pl.Config.set_tbl_formatting("NOTHING") pl.Config.set_tbl_hide_column_data_types(True) # string matching for task names WEIGHT_DICT = { - 'TickGE': ('survival', 100 / 6), # 1 survival task - 'PLAYER_KILL': ('combat', 100 / (6*3)), # 3 combat tasks - 'DefeatEntity': ('combat', 100 / (6*3)), - 'GO_FARTHEST': ('exploration', 100 / (6*2)), # 2 exploration tasks - 'OccupyTile': ('exploration', 100 / (6*2)), - 'AttainSkill': ('skill', 100 / (6*8)), # 8 skill tasks - 'HarvestItem': ('item', 100 / (6*44)), # 44 item tasks - 'ConsumeItem': ('item', 100 / (6*44)), - 'EquipItem': ('item', 100 / (6*44)), - 'FullyArmed': ('item', 100 / (6*44)), - 'EARN_GOLD': ('market', 100 / (6*5)), # 5 market tasks - 'BUY_ITEM': ('market', 100 / (6*5)), - 'EarnGold': ('market', 100 / (6*5)), - 'HoardGold': ('market', 100 / (6*5)), - 'MakeProfit': ('market', 100 / (6*5)), + "TickGE": ("survival", 100 / 6), # 1 survival task + "PLAYER_KILL": ("combat", 100 / (6 * 3)), # 3 combat tasks + "DefeatEntity": ("combat", 100 / (6 * 3)), + "GO_FARTHEST": ("exploration", 100 / (6 * 2)), # 2 exploration tasks + "OccupyTile": ("exploration", 100 / (6 * 2)), + "AttainSkill": ("skill", 100 / (6 * 8)), # 8 skill tasks + "HarvestItem": ("item", 100 / (6 * 44)), # 44 item tasks + "ConsumeItem": ("item", 100 / (6 * 44)), + "EquipItem": ("item", 100 / (6 * 44)), + "FullyArmed": ("item", 100 / (6 * 44)), + "EARN_GOLD": ("market", 100 / (6 * 5)), # 5 market tasks + "BUY_ITEM": ("market", 100 / (6 * 5)), + "EarnGold": ("market", 100 / (6 * 5)), + "HoardGold": ("market", 100 / (6 * 5)), + "MakeProfit": ("market", 100 / (6 * 5)), } + def 
get_task_weight(task_name): for key, val in WEIGHT_DICT.items(): if key in task_name: return val - logging.warning(f'Task name {task_name} not found in weight dict') - return 'etc', 0 + logging.warning(f"Task name {task_name} not found in weight dict") + return "etc", 0 + def get_summary_dict(vals, key): - progress = vals if key == 'length' else [v[0] for v in vals] - summ = { - 'count': len(progress), - 'mean': np.mean(progress), - 'median': np.median(progress) - } - - if key == 'length': + progress = vals if key == "length" else [v[0] for v in vals] + summ = {"count": len(progress), "mean": np.mean(progress), "median": np.median(progress)} + + if key == "length": progress = np.array(progress) / 1023 # full episode length - summ['completed'] = np.mean([1 if v >= 1 else 0 for v in progress]) - summ['over30pcnt'] = np.mean([1 if v >= 0.3 else 0 for v in progress]) + summ["completed"] = np.mean([1 if v >= 1 else 0 for v in progress]) + summ["over30pcnt"] = np.mean([1 if v >= 0.3 else 0 for v in progress]) return summ + def summarize_single_eval(data, weighted_score=False): summary = {} # task-level info for key, vals in data.items(): - if key.startswith('curriculum') or key == 'length': + if key.startswith("curriculum") or key == "length": summary[key] = get_summary_dict(vals, key) - if weighted_score and key.startswith('curriculum'): + if weighted_score and key.startswith("curriculum"): category, weight = get_task_weight(key) - summary[key]['category'] = category - summary[key]['weight'] = weight - summary[key]['weighted_score'] = summary[key]['mean'] * weight + summary[key]["category"] = category + summary[key]["weight"] = weight + summary[key]["weighted_score"] = summary[key]["mean"] * weight # meta info - summary['avg_progress'] = np.mean([v['mean'] for k, v in summary.items() - if k.startswith('curriculum')]) + summary["avg_progress"] = np.mean( + [v["mean"] for k, v in summary.items() if k.startswith("curriculum")] + ) if weighted_score: - summary['weighted_score'] = np.sum([v['weighted_score'] for k, v in summary.items() - if k.startswith('curriculum')]) + summary["weighted_score"] = np.sum( + [v["weighted_score"] for k, v in summary.items() if k.startswith("curriculum")] + ) return summary + def process_eval_files(policy_store_dir, eval_prefix): summ_policy = [] summ_task = [] for file in os.listdir(policy_store_dir): # NOTE: assumes the file naming convention is 'eval__.json' - if not file.startswith(eval_prefix) or not file.endswith('.json'): + if not file.startswith(eval_prefix) or not file.endswith(".json"): continue - mode = file.split('_')[1] - random_seed = file.split('_')[2].replace('.json', '') + mode = file.split("_")[1] + random_seed = file.split("_")[2].replace(".json", "") - with open(os.path.join(policy_store_dir, file), 'r') as f: + with open(os.path.join(policy_store_dir, file), "r") as f: data = json.load(f) for pol_name, pol_data in data.items(): @@ -94,59 +96,82 @@ def process_eval_files(policy_store_dir, eval_prefix): continue summary = summarize_single_eval(pol_data, weighted_score=True) - summ_policy.append({ - 'policy_name': pol_name, - 'mode': mode, - 'seed': random_seed, - 'count': summary['length']['count'], - 'length': summary['length']['mean'], - 'score': summary['avg_progress'], - 'weighted_score': summary['weighted_score'] - }) + summ_policy.append( + { + "policy_name": pol_name, + "mode": mode, + "seed": random_seed, + "count": summary["length"]["count"], + "length": summary["length"]["mean"], + "score": summary["avg_progress"], + "weighted_score": 
summary["weighted_score"], + } + ) # also gather the results across random seeds for each task, then average for task_name, task_data in summary.items(): - if not task_name.startswith('curriculum'): + if not task_name.startswith("curriculum"): continue - summ_task.append({ - 'category': task_data['category'], - 'task_name': task_name, - 'weight': task_data['weight'], - 'policy_name': pol_name, - 'mode': mode, - 'seed': random_seed, - 'count': task_data['count'], - 'score': task_data['mean'] - }) - - summ_df = pl.DataFrame(summ_policy).sort(['policy_name', 'mode', 'seed']) - summ_grp = summ_df.group_by(['policy_name', 'mode']).agg( - pl.col('score').mean(), - pl.col('weighted_score').mean(), + summ_task.append( + { + "category": task_data["category"], + "task_name": task_name, + "weight": task_data["weight"], + "policy_name": pol_name, + "mode": mode, + "seed": random_seed, + "count": task_data["count"], + "score": task_data["mean"], + } + ) + + summ_df = pl.DataFrame(summ_policy).sort(["policy_name", "mode", "seed"]) + summ_grp = summ_df.group_by(["policy_name", "mode"]).agg( + pl.col("score").mean(), + pl.col("weighted_score").mean(), ) - summ_grp = summ_grp.sort('weighted_score', descending=True) - summ_grp.write_csv(os.path.join(policy_store_dir, 'score_summary.tsv'), separator="\t", float_precision=6) - print('\nPolicy score summary, sorted by weighted_score:') + summ_grp = summ_grp.sort("weighted_score", descending=True) + summ_grp.write_csv( + os.path.join(policy_store_dir, "score_summary.tsv"), separator="\t", float_precision=6 + ) + print("\nPolicy score summary, sorted by weighted_score:") print(summ_grp) - task_df = pl.DataFrame(summ_task).sort(['mode', 'category', 'task_name', 'policy_name', 'seed']) - task_grp = task_df.group_by(['mode', 'category', 'task_name', 'policy_name']).agg(pl.col('score').mean()) - task_grp = task_grp.sort(['mode', 'category', 'task_name', 'policy_name']) - task_grp.write_csv(os.path.join(policy_store_dir, 'score_task_summary.tsv'), separator="\t", float_precision=6) - cate_grp = task_df.group_by(['mode', 'category', 'policy_name']).agg(pl.col('score').mean()) - cate_grp = cate_grp.sort(['mode', 'category', 'policy_name']) - cate_grp.write_csv(os.path.join(policy_store_dir, 'score_category_summary.tsv'), separator="\t", float_precision=6) + task_df = pl.DataFrame(summ_task).sort(["mode", "category", "task_name", "policy_name", "seed"]) + task_grp = task_df.group_by(["mode", "category", "task_name", "policy_name"]).agg( + pl.col("score").mean() + ) + task_grp = task_grp.sort(["mode", "category", "task_name", "policy_name"]) + task_grp.write_csv( + os.path.join(policy_store_dir, "score_task_summary.tsv"), separator="\t", float_precision=6 + ) + cate_grp = task_df.group_by(["mode", "category", "policy_name"]).agg(pl.col("score").mean()) + cate_grp = cate_grp.sort(["mode", "category", "policy_name"]) + cate_grp.write_csv( + os.path.join(policy_store_dir, "score_category_summary.tsv"), + separator="\t", + float_precision=6, + ) - if len(summ_df['seed'].unique()) > 1: - summ_df.write_csv(os.path.join(policy_store_dir, 'score_by_seed.tsv'), separator="\t", float_precision=6) - task_df.write_csv(os.path.join(policy_store_dir, 'score_by_task_seed.tsv'), separator="\t", float_precision=6) + if len(summ_df["seed"].unique()) > 1: + summ_df.write_csv( + os.path.join(policy_store_dir, "score_by_seed.tsv"), separator="\t", float_precision=6 + ) + task_df.write_csv( + os.path.join(policy_store_dir, "score_by_task_seed.tsv"), + separator="\t", + float_precision=6, + ) 
return summ_df, summ_grp, task_df, task_grp, cate_grp -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process the evaluation result files') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') - parser.add_argument('-p', '--prefix', type=str, default='eval_', help='Prefix of the evaluation result files') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process the evaluation result files") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") + parser.add_argument( + "-p", "--prefix", type=str, default="eval_", help="Prefix of the evaluation result files" + ) args = parser.parse_args() process_eval_files(args.policy_store_dir, args.prefix) diff --git a/analysis/proc_task_cond_result.py b/analysis/proc_task_cond_result.py index 53b10468..651ceba1 100644 --- a/analysis/proc_task_cond_result.py +++ b/analysis/proc_task_cond_result.py @@ -11,53 +11,55 @@ from nmmo.systems.item import ALL_ITEM from nmmo.systems.skill import COMBAT_SKILL, HARVEST_SKILL -CODE_TO_EVENT = { - v: k for k, v in EventCode.__dict__.items() if not k.startswith('_') -} +CODE_TO_EVENT = {v: k for k, v in EventCode.__dict__.items() if not k.startswith("_")} -ITEM_ID_TO_NAME = { - item.ITEM_TYPE_ID: item.__name__ for item in ALL_ITEM -} +ITEM_ID_TO_NAME = {item.ITEM_TYPE_ID: item.__name__ for item in ALL_ITEM} + +SKILL_ID_TO_NAME = {skill.SKILL_ID: skill.__name__ for skill in COMBAT_SKILL + HARVEST_SKILL} -SKILL_ID_TO_NAME = { - skill.SKILL_ID: skill.__name__ for skill in COMBAT_SKILL + HARVEST_SKILL -} # event tuple key to string def event_key_to_str(event_key): if event_key[0] == EventCode.LEVEL_UP: - return f'LEVEL_{SKILL_ID_TO_NAME[event_key[1]]}' + return f"LEVEL_{SKILL_ID_TO_NAME[event_key[1]]}" elif event_key[0] == EventCode.SCORE_HIT: - return f'ATTACK_NUM_{SKILL_ID_TO_NAME[event_key[1]]}' + return f"ATTACK_NUM_{SKILL_ID_TO_NAME[event_key[1]]}" - elif event_key[0] in [EventCode.HARVEST_ITEM, EventCode.CONSUME_ITEM, EventCode.EQUIP_ITEM, - EventCode.LIST_ITEM, EventCode.BUY_ITEM]: - return f'{CODE_TO_EVENT[event_key[0]]}_{ITEM_ID_TO_NAME[event_key[1]]}' + elif event_key[0] in [ + EventCode.HARVEST_ITEM, + EventCode.CONSUME_ITEM, + EventCode.EQUIP_ITEM, + EventCode.LIST_ITEM, + EventCode.BUY_ITEM, + ]: + return f"{CODE_TO_EVENT[event_key[0]]}_{ITEM_ID_TO_NAME[event_key[1]]}" elif event_key[0] == EventCode.GO_FARTHEST: - return '3_PROGRESS_TO_CENTER' + return "3_PROGRESS_TO_CENTER" elif event_key[0] == EventCode.AGENT_CULLED: - return '2_AGENT_LIFESPAN' + return "2_AGENT_LIFESPAN" else: return CODE_TO_EVENT[event_key[0]] + def extract_task_name(task_str): - name = task_str.split('Task_eval_fn:(')[1].split(')_assignee:')[0] + name = task_str.split("Task_eval_fn:(")[1].split(")_assignee:")[0] # then take out (agent_id,) - return name.split('_(')[0] + '_' + name.split(')_')[1] + return name.split("_(")[0] + "_" + name.split(")_")[1] + def gather_agent_events_by_task(data_dir): data_by_task = defaultdict(list) - file_list = [f for f in os.listdir(data_dir) if f.endswith('.metadata.pkl')] + file_list = [f for f in os.listdir(data_dir) if f.endswith(".metadata.pkl")] for file_name in tqdm(file_list): - data = dill.load(open(f'{data_dir}/{file_name}', 'rb')) - final_tick = data['tick'] + data = dill.load(open(f"{data_dir}/{file_name}", "rb")) + final_tick = data["tick"] - for agent_id, vals in data['event_stats'].items(): - task_name = extract_task_name(data['task'][agent_id]) + for agent_id, vals in 
data["event_stats"].items(): + task_name = extract_task_name(data["task"][agent_id]) # Agent survived until the end if EventCode.AGENT_CULLED not in vals: @@ -66,9 +68,10 @@ def gather_agent_events_by_task(data_dir): return data_by_task + def get_event_stats(task_name, task_data): num_agents = len(task_data) - assert num_agents > 0, 'There should be at least one agent' + assert num_agents > 0, "There should be at least one agent" cnt_attack = 0 cnt_buy = 0 @@ -77,7 +80,7 @@ def get_event_stats(task_name, task_data): cnt_harvest = 0 cnt_list = 0 - results = {'0_NAME': task_name, '1_COUNT': num_agents} + results = {"0_NAME": task_name, "1_COUNT": num_agents} event_data = defaultdict(list) for data in task_data: for event, val in data.items(): @@ -90,11 +93,11 @@ def get_event_stats(task_name, task_data): results[event_key_to_str(event)] = np.mean(vals) # AVG skill level elif event[0] == EventCode.AGENT_CULLED: life_span = np.mean(vals) - results['2_AGENT_LIFESPAN_AVG'] = life_span - results['2_AGENT_LIFESPAN_SD'] = np.std(vals) + results["2_AGENT_LIFESPAN_AVG"] = life_span + results["2_AGENT_LIFESPAN_SD"] = np.std(vals) elif event[0] == EventCode.GO_FARTHEST: - results['3_PROGRESS_TO_CENTER_AVG'] = np.mean(vals) - results['3_PROGRESS_TO_CENTER_SD'] = np.std(vals) + results["3_PROGRESS_TO_CENTER_AVG"] = np.mean(vals) + results["3_PROGRESS_TO_CENTER_SD"] = np.std(vals) else: results[event_key_to_str(event)] = sum(vals) / num_agents @@ -111,31 +114,31 @@ def get_event_stats(task_name, task_data): if event[0] == EventCode.LIST_ITEM: cnt_list += sum(vals) - results['4_NORM_ATTACK'] = cnt_attack / life_span - results['4_NORM_BUY'] = cnt_buy / life_span - results['4_NORM_CONSUME'] = cnt_consume / life_span - results['4_NORM_EQUIP'] = cnt_equip / life_span - results['4_NORM_HARVEST'] = cnt_harvest / life_span - results['4_NORM_LIST'] = cnt_list / life_span + results["4_NORM_ATTACK"] = cnt_attack / life_span + results["4_NORM_BUY"] = cnt_buy / life_span + results["4_NORM_CONSUME"] = cnt_consume / life_span + results["4_NORM_EQUIP"] = cnt_equip / life_span + results["4_NORM_HARVEST"] = cnt_harvest / life_span + results["4_NORM_LIST"] = cnt_list / life_span return results -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process replay data') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process replay data") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") args = parser.parse_args() # Gather the event data by tasks, across multiple replays data_by_task = gather_agent_events_by_task(args.policy_store_dir) task_results = [ - get_event_stats(task_name, task_data) - for task_name, task_data in data_by_task.items() + get_event_stats(task_name, task_data) for task_name, task_data in data_by_task.items() ] - task_df = pl.DataFrame(task_results).fill_null(0).sort('0_NAME') + task_df = pl.DataFrame(task_results).fill_null(0).sort("0_NAME") task_df = task_df.select(sorted(task_df.columns)) - task_df.write_csv('task_conditioning.tsv', separator='\t', float_precision=5) + task_df.write_csv("task_conditioning.tsv", separator="\t", float_precision=5) - print('Result file saved as task_conditioning.tsv') - print('Done.') + print("Result file saved as task_conditioning.tsv") + print("Done.") diff --git a/analysis/run_task_conditioning.py b/analysis/run_task_conditioning.py index fb324b54..6d889719 100644 --- 
a/analysis/run_task_conditioning.py +++ b/analysis/run_task_conditioning.py @@ -8,39 +8,52 @@ from evaluate import make_env_creator, make_agent_creator -CURRICULUM_FILE = 'neurips23_evaluation/heldout_task_with_embedding.pkl' +CURRICULUM_FILE = "neurips23_evaluation/heldout_task_with_embedding.pkl" -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Parse environment argument', add_help=False) - parser.add_argument('eval_model_path', type=str, default=None, help='Path to model to evaluate') - parser.add_argument('-c', '--curriculum', type=str, default=CURRICULUM_FILE, help='Path to curriculum file') - parser.add_argument('-t', '--task-to-assign', type=int, default=None, - help='The index of the task to assign in the curriculum file') - parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of times to repeat the evaluation') + parser = argparse.ArgumentParser(description="Parse environment argument", add_help=False) + parser.add_argument("eval_model_path", type=str, default=None, help="Path to model to evaluate") + parser.add_argument( + "-c", "--curriculum", type=str, default=CURRICULUM_FILE, help="Path to curriculum file" + ) + parser.add_argument( + "-t", + "--task-to-assign", + type=int, + default=None, + help="The index of the task to assign in the curriculum file", + ) + parser.add_argument( + "-r", "--repeat", type=int, default=1, help="Number of times to repeat the evaluation" + ) clean_parser = argparse.ArgumentParser(parents=[parser]) args = parser.parse_known_args()[0].__dict__ # required args when using train.py's helper functions - args['no_track'] = True - args['no_recurrence'] = False - args['vectorization'] = 'serial' - args['debug'] = False + args["no_track"] = True + args["no_recurrence"] = False + args["vectorization"] = "serial" + args["debug"] = False # Generate argparse menu from config - config = load_from_config('neurips23_start_kit') # dummy learner + config = load_from_config("neurips23_start_kit") # dummy learner args = combine_config_args(parser, args, config) - args = update_args(args, mode='replay') + args = update_args(args, mode="replay") agent_creator = make_agent_creator() - env_creator = make_env_creator(args['curriculum'], 'pvp') + env_creator = make_env_creator(args["curriculum"], "pvp") # Generate replay for i in range(args.repeat): - generate_replay(args, env_creator, agent_creator, - stop_when_all_complete_task=False, - seed=random.randint(10000000, 99999999)) - print(f'Generated replay for task {i+1}/{args.repeat}...') - - print('Done!') + generate_replay( + args, + env_creator, + agent_creator, + stop_when_all_complete_task=False, + seed=random.randint(10000000, 99999999), + ) + print(f"Generated replay for task {i+1}/{args.repeat}...") + + print("Done!") diff --git a/curriculum_generation/curriculum_tutorial.py b/curriculum_generation/curriculum_tutorial.py index 40b5e441..cc96dcf0 100644 --- a/curriculum_generation/curriculum_tutorial.py +++ b/curriculum_generation/curriculum_tutorial.py @@ -41,6 +41,7 @@ ############################################################################## # Create training tasks using custom evaluation functions + def PracticeEating(gs, subject): """The progress, the max of which is 1, should * increase small for each eating @@ -55,12 +56,15 @@ def PracticeEating(gs, subject): progress += 0.3 return norm(progress) # norm is a helper function to normalize the value to [0, 1] + 
curriculum.append(TaskSpec(eval_fn=PracticeEating, eval_fn_kwargs={})) + # You can also use pre-built eval functions to define your own eval functions def PracticeInventoryManagement(gs, subject, space, num_tick): return norm(InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)) + for space in [2, 4, 8]: curriculum.append( TaskSpec( @@ -74,6 +78,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): # Import the custom curriculum print("------------------------------------------------------------") import curriculum_tutorial # which is this file + CURRICULUM = curriculum_tutorial.curriculum print("The number of training tasks in the curriculum:", len(CURRICULUM)) @@ -96,6 +101,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): CURRICULUM_FILE_PATH = "custom_curriculum_with_embedding.pkl" with open(CURRICULUM_FILE_PATH, "wb") as f: import dill + dill.dump(CURRICULUM, f) print("All training tasks are picklable.") @@ -105,6 +111,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): print("------------------------------------------------------------") print("Generating the task spec with embedding file ...") from task_encoder import TaskEncoder + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" # Get the task embeddings for the training tasks and save to file diff --git a/curriculum_generation/manual_curriculum.py b/curriculum_generation/manual_curriculum.py index 37fe9e11..0f207573 100644 --- a/curriculum_generation/manual_curriculum.py +++ b/curriculum_generation/manual_curriculum.py @@ -1,9 +1,7 @@ """Manual test for creating learning curriculum manually""" -# pylint: disable=invalid-name,redefined-outer-name,bad-builtin -# pylint: disable=wildcard-import,unused-wildcard-import + from typing import List -import nmmo import nmmo.lib.material as m import nmmo.systems.item as Item import nmmo.systems.skill as Skill @@ -64,14 +62,14 @@ "DRINK_WATER", ] for event_code in most_essentials: - for cnt in range(1, 10): - curriculum.append( - TaskSpec( - eval_fn=CountEvent, - eval_fn_kwargs={"event": event_code, "N": cnt}, - sampling_weight=100, + for cnt in range(1, 10): + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": cnt}, + sampling_weight=100, + ) ) - ) essential_skills = [ "SCORE_HIT", @@ -85,14 +83,14 @@ "BUY_ITEM", ] for event_code in essential_skills: - for cnt in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=CountEvent, - eval_fn_kwargs={"event": event_code, "N": cnt}, - sampling_weight=20, + for cnt in EVENT_NUMBER_GOAL: + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": cnt}, + sampling_weight=20, + ) ) - ) # item/market skills, which happen less frequently or should not do too much item_skills = [ @@ -101,248 +99,241 @@ "GIVE_GOLD", ] for event_code in item_skills: - curriculum += [ - TaskSpec(eval_fn=CountEvent, eval_fn_kwargs={ - "event": event_code, "N": cnt}) - for cnt in INFREQUENT_GOAL - ] # less than 10 + curriculum += [ + TaskSpec(eval_fn=CountEvent, eval_fn_kwargs={"event": event_code, "N": cnt}) + for cnt in INFREQUENT_GOAL + ] # less than 10 # find resource tiles for resource in m.Harvestable: - curriculum.append( - TaskSpec( - eval_fn=CanSeeTile, - eval_fn_kwargs={"tile_type": resource}, - sampling_weight=10, - ) - ) + curriculum.append( + TaskSpec( + eval_fn=CanSeeTile, + eval_fn_kwargs={"tile_type": resource}, + sampling_weight=10, + ) + ) + # practice particular skill with a tool def 
PracticeSkillWithTool(gs, subject, skill, exp): - return 0.3 * EquipItem(gs, subject, item=TOOL_FOR_SKILL[skill], level=1, num_agent=1) + \ - 0.7 * GainExperience(gs, subject, skill, exp, num_agent=1) + return 0.3 * EquipItem( + gs, subject, item=TOOL_FOR_SKILL[skill], level=1, num_agent=1 + ) + 0.7 * GainExperience(gs, subject, skill, exp, num_agent=1) + for skill in SKILLS: - # level up a skill - for level in LEVEL_GOAL[1:]: - # since this is an agent task, num_agent must be 1 - curriculum.append( - TaskSpec( - eval_fn=AttainSkill, - eval_fn_kwargs={"skill": skill, "level": level, "num_agent": 1}, - sampling_weight=10*(6 - level) if level < 6 else 5, + # level up a skill + for level in LEVEL_GOAL[1:]: + # since this is an agent task, num_agent must be 1 + curriculum.append( + TaskSpec( + eval_fn=AttainSkill, + eval_fn_kwargs={"skill": skill, "level": level, "num_agent": 1}, + sampling_weight=10 * (6 - level) if level < 6 else 5, + ) ) - ) - - # gain experience on particular skill - for exp in EXP_GOAL: - curriculum.append( - TaskSpec( - eval_fn=PracticeSkillWithTool, - eval_fn_kwargs={"skill": skill, "exp": exp}, - sampling_weight=50, + + # gain experience on particular skill + for exp in EXP_GOAL: + curriculum.append( + TaskSpec( + eval_fn=PracticeSkillWithTool, + eval_fn_kwargs={"skill": skill, "exp": exp}, + sampling_weight=50, + ) ) - ) # stay alive ... like ... for 300 ticks # i.e., getting incremental reward for each tick alive as an individual or a team for num_tick in STAY_ALIVE_GOAL: - curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": num_tick})) + curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": num_tick})) # occupy the center tile, assuming the Medium map size # TODO: it'd be better to have some intermediate targets toward the center -curriculum.append(TaskSpec(eval_fn=OccupyTile, - eval_fn_kwargs={"row": 80, "col": 80})) +curriculum.append(TaskSpec(eval_fn=OccupyTile, eval_fn_kwargs={"row": 80, "col": 80})) # find the other team leader for target in ["left_team_leader", "right_team_leader"]: - curriculum.append(TaskSpec(eval_fn=CanSeeAgent, - eval_fn_kwargs={"target": target})) + curriculum.append(TaskSpec(eval_fn=CanSeeAgent, eval_fn_kwargs={"target": target})) # find the other team (any agent) for target in ["left_team", "right_team"]: - curriculum.append(TaskSpec(eval_fn=CanSeeGroup, - eval_fn_kwargs={"target": target})) + curriculum.append(TaskSpec(eval_fn=CanSeeGroup, eval_fn_kwargs={"target": target})) # practice specific combat style for style in COMBAT_STYLE: - for cnt in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=ScoreHit, - eval_fn_kwargs={"combat_style": style, "N": cnt}, - sampling_weight=5, + for cnt in EVENT_NUMBER_GOAL: + curriculum.append( + TaskSpec( + eval_fn=ScoreHit, + eval_fn_kwargs={"combat_style": style, "N": cnt}, + sampling_weight=5, + ) ) - ) # hoarding gold -- evaluated on the current gold for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=HoardGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10 - ) - ) + curriculum.append( + TaskSpec(eval_fn=HoardGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10) + ) # earning gold -- evaluated on the total gold earned by selling items for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={ - "amount": amount}, sampling_weight=10) - ) + curriculum.append( + TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10) + ) # spending gold, by 
buying items for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=SpendGold, eval_fn_kwargs={"amount": amount}, sampling_weight=5 - ) - ) + curriculum.append( + TaskSpec(eval_fn=SpendGold, eval_fn_kwargs={"amount": amount}, sampling_weight=5) + ) # making profits by trading -- only buying and selling are counted for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=MakeProfit, eval_fn_kwargs={"amount": amount}, sampling_weight=3 - ) - ) + curriculum.append( + TaskSpec(eval_fn=MakeProfit, eval_fn_kwargs={"amount": amount}, sampling_weight=3) + ) + # managing inventory space def PracticeInventoryManagement(gs, subject, space, num_tick): - return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick) + return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick) + for space in [2, 4, 8]: - curriculum += [ - TaskSpec( - eval_fn=PracticeInventoryManagement, - eval_fn_kwargs={"space": space, "num_tick": num_tick}, - ) - for num_tick in STAY_ALIVE_GOAL - ] + curriculum += [ + TaskSpec( + eval_fn=PracticeInventoryManagement, + eval_fn_kwargs={"space": space, "num_tick": num_tick}, + ) + for num_tick in STAY_ALIVE_GOAL + ] # own item, evaluated on the current inventory for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=OwnItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=OwnItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # equip item, evaluated on the current inventory and equipment status for item in EQUIP_ITEM: - for level in LEVEL_GOAL: - # agent task - curriculum.append( - TaskSpec( - eval_fn=EquipItem, - eval_fn_kwargs={"item": item, "level": level, "num_agent": 1}, - sampling_weight=4 - level if level < 4 else 1, - ) - ) - -# consume items (ration, potion), evaluated based on the event log -for item in Item.CONSUMABLE: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune + for level in LEVEL_GOAL: + # agent task curriculum.append( TaskSpec( - eval_fn=ConsumeItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, + eval_fn=EquipItem, + eval_fn_kwargs={"item": item, "level": level, "num_agent": 1}, sampling_weight=4 - level if level < 4 else 1, ) ) +# consume items (ration, potion), evaluated based on the event log +for item in Item.CONSUMABLE: + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=ConsumeItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) + # harvest items, evaluated based on the event log for item in HARVEST_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=HarvestItem, - eval_fn_kwargs={ - "item": item, - "level": level, - 
"quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=HarvestItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # list items, evaluated based on the event log for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=ListItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=ListItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # buy items, evaluated based on the event log for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=BuyItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=BuyItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) if __name__ == "__main__": - import multiprocessing as mp - from contextlib import contextmanager - - import dill - import numpy as np - import psutil - - @contextmanager - def create_pool(num_proc): - pool = mp.Pool(processes=num_proc) - yield pool - pool.close() - pool.join() - - # 1609 task specs: divide the specs into chunks - num_workers = round(psutil.cpu_count(logical=False)*0.7) - spec_chunks = np.array_split(curriculum, num_workers) - with create_pool(num_workers) as pool: - pool.map(check_task_spec, spec_chunks) - - # test if the task spec is pickalable - with open("pickle_test.pkl", "wb") as f: - dill.dump(curriculum, f) \ No newline at end of file + import multiprocessing as mp + from contextlib import contextmanager + + import dill + import numpy as np + import psutil + + @contextmanager + def create_pool(num_proc): + pool = mp.Pool(processes=num_proc) + yield pool + pool.close() + pool.join() + + # 1609 task specs: divide the specs into chunks + num_workers = round(psutil.cpu_count(logical=False) * 0.7) + spec_chunks = np.array_split(curriculum, num_workers) + with create_pool(num_workers) as pool: + pool.map(check_task_spec, spec_chunks) + + # test if the task spec is pickalable + with open("pickle_test.pkl", "wb") as f: + dill.dump(curriculum, f) diff --git a/curriculum_generation/task_encoder.py b/curriculum_generation/task_encoder.py index 7e6f1409..4a2ff489 100644 --- a/curriculum_generation/task_encoder.py +++ b/curriculum_generation/task_encoder.py @@ -14,17 +14,23 @@ def extract_module_fn(module: ModuleType): - fn_dict = {} - for name, fn in module.__dict__.items(): - if inspect.isfunction(fn) and not inspect.isbuiltin(fn) and not 
name.startswith("_"): - fn_dict[name] = fn - return fn_dict + fn_dict = {} + for name, fn in module.__dict__.items(): + if inspect.isfunction(fn) and not inspect.isbuiltin(fn) and not name.startswith("_"): + fn_dict[name] = fn + return fn_dict class TaskEncoder: """A class for encoding tasks into embeddings using a pretrained model.""" - - def __init__(self, checkpoint: str, context: ModuleType, batch_size=2, tmp_file_path="tmp_task_encoder.pkl"): + + def __init__( + self, + checkpoint: str, + context: ModuleType, + batch_size=2, + tmp_file_path="tmp_task_encoder.pkl", + ): """ Initialize the TaskEncoder. @@ -38,13 +44,13 @@ def __init__(self, checkpoint: str, context: ModuleType, batch_size=2, tmp_file_ self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) self.tokenizer.pad_token = self.tokenizer.eos_token if self.device == "cuda": - self.model = AutoModelForCausalLM.from_pretrained(checkpoint, - trust_remote_code=True, - device_map="auto", - torch_dtype=torch.bfloat16) + self.model = AutoModelForCausalLM.from_pretrained( + checkpoint, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16 + ) else: - self.model = AutoModelForCausalLM.from_pretrained(checkpoint, - trust_remote_code=True).to(self.device) + self.model = AutoModelForCausalLM.from_pretrained( + checkpoint, trust_remote_code=True + ).to(self.device) self.model.eval() self.batch_size = batch_size self.temp_file_path = tmp_file_path @@ -70,10 +76,14 @@ def _get_embedding(self, prompts: List[str]) -> list: all_embeddings = [] with torch.no_grad(): for i in tqdm(range(0, len(prompts), self.batch_size)): - batch = prompts[i: i + self.batch_size] - tokens = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(self.device) + batch = prompts[i : i + self.batch_size] + tokens = self.tokenizer( + batch, return_tensors="pt", padding=True, truncation=True + ).to(self.device) outputs = self.model(**tokens, output_hidden_states=True) - embeddings = outputs.hidden_states[-1].mean(dim=1).detach().cpu().to(torch.float32).numpy() + embeddings = ( + outputs.hidden_states[-1].mean(dim=1).detach().cpu().to(torch.float32).numpy() + ) all_embeddings.extend(embeddings.astype(np.float16)) return all_embeddings @@ -88,8 +98,18 @@ def _get_task_deps_src(self, eval_fn) -> tuple: A tuple with source code and dependencies of eval_fn. """ eval_src = inspect.getsource(eval_fn) - deps_fns = [node.func.id for node in ast.walk(ast.parse(eval_src)) if isinstance(node, ast.Call) and isinstance(node.func, ast.Name)] - deps_src = "\n".join([inspect.getsource(self._fn_dict[fn_name]) for fn_name in deps_fns if fn_name in self._fn_dict]) + deps_fns = [ + node.func.id + for node in ast.walk(ast.parse(eval_src)) + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) + ] + deps_src = "\n".join( + [ + inspect.getsource(self._fn_dict[fn_name]) + for fn_name in deps_fns + if fn_name in self._fn_dict + ] + ) return eval_src, deps_src def _construct_prompt(self, reward_to, eval_fn, eval_fn_kwargs) -> str: @@ -126,7 +146,12 @@ def get_task_embedding(self, task_spec_list: List[ts.TaskSpec], save_to_file: st Updated task specifications with embeddings. """ assert self.model is not None, "Model has been unloaded. Re-initialize the TaskEncoder." 
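# --- Editor's aside (illustrative sketch, not part of the patch) -----------------------
# The embedding scheme in TaskEncoder._get_embedding boils down to: tokenize a batch of
# prompts, run the causal LM with output_hidden_states=True, and mean-pool the final
# hidden state over token positions (padding included). A minimal standalone version,
# CPU-only for brevity (the real class also handles CUDA / bfloat16), with the checkpoint
# named elsewhere in this patch used as a placeholder default:
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def embed_prompts(prompts, checkpoint="deepseek-ai/deepseek-coder-1.3b-instruct", batch_size=2):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # required to pad batched prompts
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(prompts), batch_size):
            tokens = tokenizer(
                prompts[i : i + batch_size], return_tensors="pt", padding=True, truncation=True
            )
            outputs = model(**tokens, output_hidden_states=True)
            # (batch, seq_len, hidden) -> (batch, hidden): mean over token positions
            pooled = outputs.hidden_states[-1].mean(dim=1).to(torch.float32).numpy()
            embeddings.extend(pooled.astype(np.float16))
    return embeddings
# ----------------------------------------------------------------------------------------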
- prompts = [self._construct_prompt(single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs) for single_spec in task_spec_list] + prompts = [ + self._construct_prompt( + single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs + ) + for single_spec in task_spec_list + ] embeddings = self._get_embedding(prompts) for single_spec, embedding in zip(task_spec_list, embeddings): @@ -155,11 +180,9 @@ def __exit__(self, exc_type, exc_value, traceback): if __name__ == "__main__": import curriculum_generation.manual_curriculum as curriculum + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" CURRICULUM_FILE_PATH = "curriculum_generation/curriculum_with_embedding.pkl" with TaskEncoder(LLM_CHECKPOINT, curriculum, batch_size=6) as task_encoder: - task_encoder.get_task_embedding( - curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) + task_encoder.get_task_embedding(curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH) diff --git a/curriculum_generation/task_sampler.py b/curriculum_generation/task_sampler.py index eba88def..b6ad2603 100644 --- a/curriculum_generation/task_sampler.py +++ b/curriculum_generation/task_sampler.py @@ -4,10 +4,9 @@ from nmmo.task import task_spec as ts + class LearnableTaskSampler: - def __init__(self, - task_spec: List[ts.TaskSpec], - average_window = 50): + def __init__(self, task_spec: List[ts.TaskSpec], average_window=50): self.task_spec = task_spec self.name_to_spec = {single_spec.name: single_spec for single_spec in self.task_spec} self.task_stats = {} @@ -27,12 +26,12 @@ def update(self, infos, prefix="curriculum/"): for key, val in infos.items(): # Process the new infos if key.startswith(prefix): - spec_name = key.replace(prefix,"") - completed, prog_over_10pcnt, rcnt_over_2 = [], [], [] + spec_name = key.replace(prefix, "") + completed, _, rcnt_over_2 = [], [], [] for sublist in val: for prog, rcnt in sublist: completed.append(float(prog >= 1)) - rcnt_over_2.append(float(rcnt >= 2)) # rewarded >= 2 times + rcnt_over_2.append(float(rcnt >= 2)) # rewarded >= 2 times # Add to the task_stats if spec_name not in self.task_stats: @@ -42,32 +41,37 @@ def update(self, infos, prefix="curriculum/"): # Keep only the recent values -- self.average_window (50) for key, vals in self.task_stats[spec_name].items(): - self.task_stats[spec_name][key] = vals[-self.average_window:] + self.task_stats[spec_name][key] = vals[-self.average_window :] - def get_learnable_tasks(self, num_tasks, - max_completed = 0.8, # filter out easy tasks - min_completed = 0.05, # filter out harder tasks - min_rcnt_rate = 0.1, # reward signal generating + def get_learnable_tasks( + self, + num_tasks, + max_completed=0.8, # filter out easy tasks + min_completed=0.05, # filter out harder tasks + min_rcnt_rate=0.1, # reward signal generating ) -> List[ts.TaskSpec]: learnable = [] for spec_name, stat in self.task_stats.items(): completion_rate = np.mean(stat["completed"]) rcnt_over2_rate = np.mean(stat["rcnt_over_2"]) - if completion_rate < max_completed and\ - (completion_rate >= min_completed or rcnt_over2_rate >= min_rcnt_rate): + if completion_rate < max_completed and ( + completion_rate >= min_completed or rcnt_over2_rate >= min_rcnt_rate + ): learnable.append(self.name_to_spec[spec_name]) if len(learnable) > num_tasks: return list(np.random.choice(learnable, num_tasks)) return learnable - def sample_tasks(self, num_tasks, - random_ratio = 0.5, - reset_sampling_weight = True, + def sample_tasks( + self, + num_tasks, + random_ratio=0.5, + 
reset_sampling_weight=True, ) -> List[ts.TaskSpec]: task_spec = [] if 0 <= random_ratio < 1: - num_learnable = round(num_tasks * (1-random_ratio)) + num_learnable = round(num_tasks * (1 - random_ratio)) task_spec = self.get_learnable_tasks(num_learnable) # fill in with the randomly-sampled tasks diff --git a/evaluate.py b/evaluate.py index c3ac9880..7455410f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -17,75 +17,97 @@ from train import get_init_args NUM_AGENTS = 128 -EVAL_TASK_FILE = 'neurips23_evaluation/sample_eval_task_with_embedding.pkl' +EVAL_TASK_FILE = "neurips23_evaluation/sample_eval_task_with_embedding.pkl" NUM_PVE_EVAL_EPISODE = 32 NUM_PVP_EVAL_EPISODE = 200 # TODO: cannot do more due to memory leak + def get_eval_config(debug=False): return { - 'device': 'cuda', - 'num_envs': 6 if not debug else 1, - 'batch_size': 2**15 if not debug else 2**12, + "device": "cuda", + "num_envs": 6 if not debug else 1, + "batch_size": 2**15 if not debug else 2**12, } -class EvalConfig(nc.Medium, nc.Terrain, nc.Resource, nc.Combat, nc.NPC, nc.Progression, - nc.Item, nc.Equipment, nc.Profession, nc.Exchange): - '''NMMO config for NeurIPS 2023 competition evaluation. - Hardcoded to keep the eval config independent from the training config. - ''' +class EvalConfig( + nc.Medium, + nc.Terrain, + nc.Resource, + nc.Combat, + nc.NPC, + nc.Progression, + nc.Item, + nc.Equipment, + nc.Profession, + nc.Exchange, +): + """NMMO config for NeurIPS 2023 competition evaluation. + Hardcoded to keep the eval config independent from the training config. + """ + def __init__(self, task_file, mode): super().__init__() - self.set('GAME_PACKS', [(nmmo.core.game_api.AgentTraining, 1)]) - self.set('CURRICULUM_FILE_PATH', task_file) - self.set('TASK_EMBED_DIM', 2048) # must match the task file + self.set("GAME_PACKS", [(nmmo.core.game_api.AgentTraining, 1)]) + self.set("CURRICULUM_FILE_PATH", task_file) + self.set("TASK_EMBED_DIM", 2048) # must match the task file # Eval constants - self.set('PROVIDE_ACTION_TARGETS', True) - self.set('PROVIDE_NOOP_ACTION_TARGET', True) - self.set('PLAYER_N', NUM_AGENTS) - self.set('HORIZON', 1024) - self.set('PLAYER_DEATH_FOG', None) - self.set('NPC_N', 256) - self.set('RESOURCE_RESILIENT_POPULATION', 0) - self.set('COMBAT_SPAWN_IMMUNITY', 20) + self.set("PROVIDE_ACTION_TARGETS", True) + self.set("PROVIDE_NOOP_ACTION_TARGET", True) + self.set("PLAYER_N", NUM_AGENTS) + self.set("HORIZON", 1024) + self.set("PLAYER_DEATH_FOG", None) + self.set("NPC_N", 256) + self.set("RESOURCE_RESILIENT_POPULATION", 0) + self.set("COMBAT_SPAWN_IMMUNITY", 20) # Map related - self.set('TERRAIN_FLIP_SEED', True) - self.set('MAP_CENTER', 128) - self.set('MAP_FORCE_GENERATION', False) - self.set('MAP_GENERATE_PREVIEWS', True) - if mode not in ['pve', 'pvp']: - raise ValueError(f'Invalid eval_mode: {mode}') - if mode == 'pve': - self.set('MAP_N', 4) - self.set('PATH_MAPS', 'maps/pve_eval/') + self.set("TERRAIN_FLIP_SEED", True) + self.set("MAP_CENTER", 128) + self.set("MAP_FORCE_GENERATION", False) + self.set("MAP_GENERATE_PREVIEWS", True) + if mode not in ["pve", "pvp"]: + raise ValueError(f"Invalid eval_mode: {mode}") + if mode == "pve": + self.set("MAP_N", 4) + self.set("PATH_MAPS", "maps/pve_eval/") else: - self.set('MAP_N', 256) - self.set('PATH_MAPS', 'maps/pvp_eval/') + self.set("MAP_N", 256) + self.set("PATH_MAPS", "maps/pvp_eval/") + def make_env_creator(task_file, mode): def env_creator(*args, **kwargs): # dummy args env = nmmo.Env(EvalConfig(task_file, mode)) # Reward wrapper is for the learner, which 
is not used in evaluation env = default_learner.RewardWrapper( - env, **{'eval_mode': True, 'early_stop_agent_num': 0,} + env, + **{ + "eval_mode": True, + "early_stop_agent_num": 0, + }, ) env = pufferlib.emulation.PettingZooPufferEnv(env) return env + return env_creator + def make_agent_creator(): # NOTE: Assuming all policies are recurrent, which may not be true policy_args = get_init_args(default_learner.Policy.__init__) recurrent_args = get_init_args(default_learner.Recurrent.__init__) + def agent_creator(env, args=None): policy = default_learner.Policy(env, **policy_args) policy = default_learner.Recurrent(env, policy, **recurrent_args) policy = pufferlib.frameworks.cleanrl.RecurrentPolicy(policy) - return policy.to(get_eval_config()['device']) + return policy.to(get_eval_config()["device"]) + return agent_creator + class EvalRunner: def __init__(self, policy_store_dir, debug=False): self.policy_store_dir = policy_store_dir @@ -96,22 +118,24 @@ def set_debug(self, debug): def setup_evaluator(self, mode, task_file, seed): policies = pp.get_policy_names(self.policy_store_dir) - assert len(policies) > 0, 'No policies found in eval_model_path' - if mode == 'pve': - assert len(policies) == 1, 'PvE mode requires only one policy' - logging.info(f'Policies to evaluate: {policies}') + assert len(policies) > 0, "No policies found in eval_model_path" + if mode == "pve": + assert len(policies) == 1, "PvE mode requires only one policy" + logging.info(f"Policies to evaluate: {policies}") # pool_kernel determines policy-agent mapping - pool_kernel = pp.create_kernel(NUM_AGENTS, len(policies), - shuffle_with_seed=seed) + pool_kernel = pp.create_kernel(NUM_AGENTS, len(policies), shuffle_with_seed=seed) config = self.get_pufferl_config(self._debug) config.seed = seed config.data_dir = self.policy_store_dir config.pool_kernel = pool_kernel - vectorization = pufferlib.vectorization.Serial if self._debug \ + vectorization = ( + pufferlib.vectorization.Serial + if self._debug else pufferlib.vectorization.Multiprocessing + ) return clean_pufferl.create( config=config, @@ -127,14 +151,14 @@ def setup_evaluator(self, mode, task_file, seed): def get_pufferl_config(debug=False): config = get_eval_config(debug) # add required configs - config['torch_deterministic'] = True - config['total_timesteps'] = 100_000_000 # arbitrarily large, but will end much earlier - config['envs_per_batch'] = config['num_envs'] - config['envs_per_worker'] = 1 - config['env_pool'] = False - config['learning_rate'] = 1e-4 - config['compile'] = False - config['verbose'] = True # not debug + config["torch_deterministic"] = True + config["total_timesteps"] = 100_000_000 # arbitrarily large, but will end much earlier + config["envs_per_batch"] = config["num_envs"] + config["envs_per_worker"] = 1 + config["env_pool"] = False + config["learning_rate"] = 1e-4 + config["compile"] = False + config["verbose"] = True # not debug return pufferlib.namespace(**config) def perform_eval(self, mode, task_file, seed, num_eval_episode, save_file_prefix): @@ -148,40 +172,38 @@ def perform_eval(self, mode, task_file, seed, num_eval_episode, save_file_prefix _, infos = clean_pufferl.evaluate(pufferl_data) for pol, vals in infos.items(): - cnt_episode += sum(infos[pol]['episode_done']) + cnt_episode += sum(infos[pol]["episode_done"]) if pol not in eval_results: eval_results[pol] = defaultdict(list) for k, v in vals.items(): - if k == 'length': + if k == "length": eval_results[pol][k] += v # length is a plain list - if k.startswith('curriculum'): + if 
k.startswith("curriculum"): eval_results[pol][k] += [vv[0] for vv in v] pufferl_data.sort_keys = [] # TODO: check if this solves memory leak - print(f'\nSeed: {seed}, evaluated {cnt_episode} episodes.\n') + print(f"\nSeed: {seed}, evaluated {cnt_episode} episodes.\n") - file_name = f'{save_file_prefix}_{seed}.json' + file_name = f"{save_file_prefix}_{seed}.json" self._save_results(eval_results, file_name) clean_pufferl.close(pufferl_data) return eval_results, file_name def _save_results(self, results, file_name): - with open(os.path.join(self.policy_store_dir, file_name), 'w') as f: + with open(os.path.join(self.policy_store_dir, file_name), "w") as f: json.dump(results, f) - def run(self, mode, - task_file=EVAL_TASK_FILE, - seed=None, - num_episode=None, - save_file_prefix=None): - assert mode in ['pve', 'pvp'], f'Invalid mode: {mode}' - if mode == 'pve': + def run( + self, mode, task_file=EVAL_TASK_FILE, seed=None, num_episode=None, save_file_prefix=None + ): + assert mode in ["pve", "pvp"], f"Invalid mode: {mode}" + if mode == "pve": num_episode = num_episode or NUM_PVE_EVAL_EPISODE - save_file_prefix = save_file_prefix or 'eval_pve' + save_file_prefix = save_file_prefix or "eval_pve" else: num_episode = num_episode or NUM_PVP_EVAL_EPISODE - save_file_prefix = save_file_prefix or 'eval_pvp' + save_file_prefix = save_file_prefix or "eval_pvp" if self._debug: num_episode = 4 @@ -189,25 +211,33 @@ def run(self, mode, if seed is None: seed = random.randint(10000000, 99999999) - logging.info(f'Evaluating {self.policy_store_dir} in the {mode} mode with seed: {seed}') - logging.info(f'Using the task file: {task_file}') + logging.info(f"Evaluating {self.policy_store_dir} in the {mode} mode with seed: {seed}") + logging.info(f"Using the task file: {task_file}") _, file_name = self.perform_eval(mode, task_file, seed, num_episode, save_file_prefix) - print(f'Saved the result file to: {file_name}.') + print(f"Saved the result file to: {file_name}.") -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Evaluate a policy store') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') - parser.add_argument('mode', type=str, choices=['pve', 'pvp'], help='Evaluation mode') - parser.add_argument('-t', '--task-file', type=str, default=EVAL_TASK_FILE, help='Path to the task file') - parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed') - parser.add_argument('-n', '--num-episode', type=int, default=None, help='Number of episodes to evaluate') - parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of times to repeat the evaluation') - parser.add_argument('--save-file-prefix', type=str, default=None, help='Prefix for the save file') - parser.add_argument('--debug', action='store_true', help='Debug mode') + parser = argparse.ArgumentParser(description="Evaluate a policy store") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") + parser.add_argument("mode", type=str, choices=["pve", "pvp"], help="Evaluation mode") + parser.add_argument( + "-t", "--task-file", type=str, default=EVAL_TASK_FILE, help="Path to the task file" + ) + parser.add_argument("-s", "--seed", type=int, default=1, help="Random seed") + parser.add_argument( + "-n", "--num-episode", type=int, default=None, help="Number of episodes to evaluate" + ) + parser.add_argument( + "-r", "--repeat", type=int, default=1, help="Number of times to repeat 
the evaluation" + ) + parser.add_argument( + "--save-file-prefix", type=str, default=None, help="Prefix for the save file" + ) + parser.add_argument("--debug", action="store_true", help="Debug mode") args = parser.parse_args() runner = EvalRunner(args.policy_store_dir, args.debug) diff --git a/neurips23_evaluation/export_embeddings.py b/neurips23_evaluation/export_embeddings.py index 55d1ae39..28b33a25 100644 --- a/neurips23_evaluation/export_embeddings.py +++ b/neurips23_evaluation/export_embeddings.py @@ -8,7 +8,7 @@ eval_tasks = dill.load(f) # metadata: task name (including full info), predicate, kwargs, sampling weights, training vs evaluation -# group by +# group by # - train vs eval # - predicate @@ -17,29 +17,35 @@ embeddings = [] metadata = [] + def get_task_predicate(spec): name = spec.name.split("_")[1] if name == "CountEvent": return name + "=" + spec.eval_fn_kwargs["event"] return name + for spec in curriculum: embeddings.append(spec.embedding) - metadata.append({ - "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), - "predicate": get_task_predicate(spec), - "used_for": "train", - "sampling_weight": spec.sampling_weight, - }) + metadata.append( + { + "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), + "predicate": get_task_predicate(spec), + "used_for": "train", + "sampling_weight": spec.sampling_weight, + } + ) for spec in eval_tasks: embeddings.append(spec.embedding) - metadata.append({ - "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), - "predicate": get_task_predicate(spec), - "used_for": "eval", - "sampling_weight": spec.sampling_weight, - }) + metadata.append( + { + "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), + "predicate": get_task_predicate(spec), + "used_for": "eval", + "sampling_weight": spec.sampling_weight, + } + ) embed_df = pl.DataFrame(embeddings) diff --git a/neurips23_evaluation/heldout_evaluation_task.py b/neurips23_evaluation/heldout_evaluation_task.py index 1fbc2bcc..1d407419 100644 --- a/neurips23_evaluation/heldout_evaluation_task.py +++ b/neurips23_evaluation/heldout_evaluation_task.py @@ -1,9 +1,23 @@ """Held-out evaluation tasks for NeurIPS 2023 competition.""" + from typing import List from nmmo.systems import skill as s from nmmo.systems import item as i -from nmmo.task.base_predicates import * +from nmmo.task.base_predicates import ( + AttainSkill, + ConsumeItem, + CountEvent, + DefeatEntity, + EarnGold, + EquipItem, + FullyArmed, + HarvestItem, + HoardGold, + MakeProfit, + OccupyTile, + TickGE, +) from nmmo.task.task_spec import TaskSpec, check_task_spec @@ -16,9 +30,7 @@ curriculum: List[TaskSpec] = [] # Survive to the end -curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) -) +curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})) # Kill 20 players/npcs curriculum.append( @@ -117,34 +129,20 @@ ) # Earn 100 gold (revenue), just by trading -curriculum.append( - TaskSpec( - eval_fn=EarnGold, - eval_fn_kwargs={"amount": GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": GOLD_GOAL})) # Own and protect 100 gold by any means (looting or trading) -curriculum.append( - TaskSpec( - eval_fn=HoardGold, - eval_fn_kwargs={"amount": GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=HoardGold, eval_fn_kwargs={"amount": GOLD_GOAL})) # Make profit of 100 gold by any means -curriculum.append( - TaskSpec( - eval_fn=MakeProfit, - eval_fn_kwargs={"amount": 
GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=MakeProfit, eval_fn_kwargs={"amount": GOLD_GOAL})) if __name__ == "__main__": # Import the custom curriculum print("------------------------------------------------------------") from neurips23_evaluation import heldout_evaluation_task # which is this file + CURRICULUM = heldout_evaluation_task.curriculum print("The number of training tasks in the curriculum:", len(CURRICULUM)) @@ -166,6 +164,7 @@ print("Checking if the training tasks are picklable ...") with open(CURRICULUM_FILE_PATH, "wb") as f: import dill + dill.dump(CURRICULUM, f) print("All training tasks are picklable.") @@ -175,6 +174,7 @@ print("------------------------------------------------------------") print("Generating the task spec with embedding file ...") from curriculum_generation.task_encoder import TaskEncoder + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" # Get the task embeddings for the training tasks and save to file diff --git a/neurips23_evaluation/sample_evaluation_task.py b/neurips23_evaluation/sample_evaluation_task.py index 6a06fa56..fce5e394 100644 --- a/neurips23_evaluation/sample_evaluation_task.py +++ b/neurips23_evaluation/sample_evaluation_task.py @@ -1,6 +1,5 @@ """Manual test for creating learning curriculum manually""" -# pylint: disable=invalid-name,redefined-outer-name,bad-builtin -# pylint: disable=wildcard-import,unused-wildcard-import + from typing import List from nmmo.systems import skill as s @@ -13,9 +12,7 @@ curriculum: List[TaskSpec] = [] # Stay alive as long as possible -curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) -) +curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})) # Perform these 10 times essential_skills = [ @@ -52,9 +49,7 @@ ) # Earn gold 50 -curriculum.append( - TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50}) -) +curriculum.append(TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50})) if __name__ == "__main__": from neurips23_evaluation import sample_evaluation_task as curriculum @@ -63,7 +58,4 @@ LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" with TaskEncoder(LLM_CHECKPOINT, curriculum, batch_size=6) as task_encoder: - task_encoder.get_task_embedding( - curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) + task_encoder.get_task_embedding(curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..7a4d30ed --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,121 @@ +[project] +name = "nmmo2-baselines" +version = "0.1.0" +description = "Neural MMO 2.1 baselines" +keywords = [] +classifiers = [ + "Natural Language :: English", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", +] +dependencies = [ + "accelerate==0.27.2", + "nmmo@git+https://github.com/kywch/nmmo-environment", # WIP nmmo 2.1 + "polars", + "pufferlib[nmmo]==0.7.3", + "psutil", + "torch>=2.1", + "transformers==4.37.2", + "wandb", +] + +[tool.setuptools.packages.find] +where = ["."] +exclude = ["tests"] + +[project.optional-dependencies] +monitoring = [ + "nvitop" +] +dev = [ + "pre-commit", + "ruff" +] + +[tool.distutils.bdist_wheel] +universal = true + +[tool.ruff] +# Exclude a variety of commonly ignored directories. 
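# --- Editor's aside (usage note, not part of the patch) --------------------------------
# With the optional "dev" extra declared above, the expected local workflow is roughly:
# install with `pip install -e '.[dev]'`, run `pre-commit install` once so the ruff hooks
# run on every commit, and use `ruff format .` / `ruff check .` to apply the formatter and
# linter configured by the [tool.ruff] tables that follow. (These commands are an
# assumption based on the declared dev dependencies, not taken from this patch.)
# ----------------------------------------------------------------------------------------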
+exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 100 +indent-width = 4 + +# Assume Python 3.10 +target-version = "py310" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] # Ignore imported but unused + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" diff --git a/reinforcement_learning/clean_pufferl.py b/reinforcement_learning/clean_pufferl.py index 5824483f..f72e76c7 100644 --- a/reinforcement_learning/clean_pufferl.py +++ b/reinforcement_learning/clean_pufferl.py @@ -1,5 +1,7 @@ +# ruff: noqa: E722, F841 + # Copied from: https://github.com/PufferAI/PufferLib/blob/0.7/clean_pufferl.py -from pdb import set_trace as T +# from pdb import set_trace as T import os import random import time @@ -21,7 +23,7 @@ import pufferlib.frameworks.cleanrl import pufferlib.policy_pool -SKIP_LOG_KEYS = ['curriculum/Task_', 'env_id'] +SKIP_LOG_KEYS = ["curriculum/Task_", "env_id"] @pufferlib.dataclass @@ -42,6 +44,7 @@ class Performance: train_pytorch_memory = 0 misc_time = 0 + @pufferlib.dataclass class Losses: policy_loss = 0 @@ -52,6 +55,7 @@ class Losses: clipfrac = 0 explained_variance = 0 + @pufferlib.dataclass class Charts: global_step = 0 @@ -60,29 +64,26 @@ class Charts: agent_step = 0 agent_SPS = 0 + def create( - self: object = None, - config: pufferlib.namespace = None, - exp_name: str = None, - track: bool = False, - - # Agent - agent: nn.Module = None, - agent_creator: callable = None, - agent_kwargs: dict = None, - - # Environment - env_creator: callable = None, - env_creator_kwargs: dict = None, - vectorization: ... 
= pufferlib.vectorization.Serial, - - # Evaluation or replay mode - eval_mode: bool = False, - eval_model_path: str = None, - - # Policy Pool options - policy_selector: callable = None, - ): + self: object = None, + config: pufferlib.namespace = None, + exp_name: str = None, + track: bool = False, + # Agent + agent: nn.Module = None, + agent_creator: callable = None, + agent_kwargs: dict = None, + # Environment + env_creator: callable = None, + env_creator_kwargs: dict = None, + vectorization: ... = pufferlib.vectorization.Serial, + # Evaluation or replay mode + eval_mode: bool = False, + eval_model_path: str = None, + # Policy Pool options + policy_selector: callable = None, +): if config is None: config = pufferlib.args.CleanPuffeRL() @@ -121,24 +122,26 @@ def create( resume_state = {} path = os.path.join(config.data_dir, exp_name) if os.path.exists(path): - trainer_path = os.path.join(path, 'trainer_state.pt') + trainer_path = os.path.join(path, "trainer_state.pt") resume_state = torch.load(trainer_path) model_path = os.path.join(path, resume_state["model_name"]) agent = torch.load(model_path, map_location=device) - print(f'Resumed from update {resume_state["update"]} ' - f'with policy {resume_state["model_name"]}') + print( + f'Resumed from update {resume_state["update"]} ' + f'with policy {resume_state["model_name"]}' + ) else: agent = pufferlib.emulation.make_object( - agent, agent_creator, [pool.driver_env], agent_kwargs) + agent, agent_creator, [pool.driver_env], agent_kwargs + ) global_step = resume_state.get("global_step", 0) agent_step = resume_state.get("agent_step", 0) update = resume_state.get("update", 0) - optimizer = optim.Adam(agent.parameters(), - lr=config.learning_rate, eps=1e-5) + optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate, eps=1e-5) - uncompiled_agent = agent # Needed to save the model + uncompiled_agent = agent # Needed to save the model if config.compile: agent = torch.compile(agent, mode=config.compile_mode) @@ -157,8 +160,13 @@ def create( if policy_selector is None: policy_selector = pufferlib.policy_pool.RandomPolicySelector(config.seed) policy_pool = pufferlib.policy_pool.PolicyPool( - agent, pool_agents, atn_shape, device, pool_path, - config.pool_kernel, policy_selector, + agent, + pool_agents, + atn_shape, + device, + pool_path, + config.pool_kernel, + policy_selector, ) # Allocate Storage @@ -166,19 +174,19 @@ def create( next_lstm_state = [] pool.async_reset(config.seed) next_lstm_state = None - if hasattr(agent, 'lstm'): + if hasattr(agent, "lstm"): shape = (agent.lstm.num_layers, total_agents, agent.lstm.hidden_size) next_lstm_state = ( torch.zeros(shape).to(device), torch.zeros(shape).to(device), ) - obs=torch.zeros(config.batch_size + 1, *obs_shape) - actions=torch.zeros(config.batch_size + 1, *atn_shape, dtype=int) - logprobs=torch.zeros(config.batch_size + 1) - rewards=torch.zeros(config.batch_size + 1) - dones=torch.zeros(config.batch_size + 1) - truncateds=torch.zeros(config.batch_size + 1) - values=torch.zeros(config.batch_size + 1) + obs = torch.zeros(config.batch_size + 1, *obs_shape) + actions = torch.zeros(config.batch_size + 1, *atn_shape, dtype=int) + logprobs = torch.zeros(config.batch_size + 1) + rewards = torch.zeros(config.batch_size + 1) + dones = torch.zeros(config.batch_size + 1) + truncateds = torch.zeros(config.batch_size + 1) + values = torch.zeros(config.batch_size + 1) obs_ary = np.asarray(obs) actions_ary = np.asarray(actions) @@ -190,83 +198,85 @@ def create( storage_profiler.stop() - #"charts/actions": 
wandb.Histogram(b_actions.cpu().numpy()), + # "charts/actions": wandb.Histogram(b_actions.cpu().numpy()), init_performance = pufferlib.namespace( - init_time = time.time() - start_time, - init_env_time = init_profiler.elapsed, - init_env_memory = init_profiler.memory, - tensor_memory = storage_profiler.memory, - tensor_pytorch_memory = storage_profiler.pytorch_memory, + init_time=time.time() - start_time, + init_env_time=init_profiler.elapsed, + init_env_memory=init_profiler.memory, + tensor_memory=storage_profiler.memory, + tensor_pytorch_memory=storage_profiler.pytorch_memory, ) - - return pufferlib.namespace(self, + + return pufferlib.namespace( + self, # Agent, Optimizer, and Environment config=config, - pool = pool, - agent = agent, - uncompiled_agent = uncompiled_agent, - optimizer = optimizer, - policy_pool = policy_pool, - + pool=pool, + agent=agent, + uncompiled_agent=uncompiled_agent, + optimizer=optimizer, + policy_pool=policy_pool, # Logging - exp_name = exp_name, - wandb = wandb, + exp_name=exp_name, + wandb=wandb, learning_rate=config.learning_rate, - losses = Losses(), - init_performance = init_performance, - performance = Performance(), - + losses=Losses(), + init_performance=init_performance, + performance=Performance(), # Storage - sort_keys = [], - next_lstm_state = next_lstm_state, - obs = obs, - actions = actions, - logprobs = logprobs, - rewards = rewards, - dones = dones, - values = values, - obs_ary = obs_ary, - actions_ary = actions_ary, - logprobs_ary = logprobs_ary, - rewards_ary = rewards_ary, - dones_ary = dones_ary, - truncateds_ary = truncateds_ary, - values_ary = values_ary, - + sort_keys=[], + next_lstm_state=next_lstm_state, + obs=obs, + actions=actions, + logprobs=logprobs, + rewards=rewards, + dones=dones, + values=values, + obs_ary=obs_ary, + actions_ary=actions_ary, + logprobs_ary=logprobs_ary, + rewards_ary=rewards_ary, + dones_ary=dones_ary, + truncateds_ary=truncateds_ary, + values_ary=values_ary, # Misc - total_updates = total_updates, - update = update, - global_step = global_step, - agent_step = agent_step, - device = device, - start_time = start_time, - eval_mode = eval_mode, + total_updates=total_updates, + update=update, + global_step=global_step, + agent_step=agent_step, + device=device, + start_time=start_time, + eval_mode=eval_mode, ) + @pufferlib.utils.profile def evaluate(data): config = data.config # TODO: Handle update on resume if data.wandb is not None and data.performance.total_uptime > 0: - data.wandb.log({ - 'agent_SPS': data.agent_SPS, - 'agent_step': data.agent_step, - 'global_step': data.global_step, - 'learning_rate': data.optimizer.param_groups[0]["lr"], - **{f'losses/{k}': v for k, v in data.losses.items()}, - **{f'performance/{k}': v - for k, v in data.performance.items()}, - **{f'{k}': v for k, v in data.stats.items()}, # comes with stats/ prefix - **{f'skillrank/{policy}': elo - for policy, elo in data.policy_pool.ranker.ratings.items()}, - }) + data.wandb.log( + { + "agent_SPS": data.agent_SPS, + "agent_step": data.agent_step, + "global_step": data.global_step, + "learning_rate": data.optimizer.param_groups[0]["lr"], + **{f"losses/{k}": v for k, v in data.losses.items()}, + **{f"performance/{k}": v for k, v in data.performance.items()}, + **{f"{k}": v for k, v in data.stats.items()}, # comes with stats/ prefix + **{ + f"skillrank/{policy}": elo + for policy, elo in data.policy_pool.ranker.ratings.items() + }, + } + ) # update_policies() changes the policy id (in kernel) - policy mapping # It's good for training but not 
wanted for replay or evaluation, so we skip it if not data.eval_mode: data.policy_pool.update_policies() - performance = defaultdict(list) + # performance = defaultdict(list) env_profiler = pufferlib.utils.Profiler() inference_profiler = pufferlib.utils.Profiler() eval_profiler = pufferlib.utils.Profiler(memory=True, pytorch_memory=True).start() @@ -285,9 +295,8 @@ def evaluate(data): with misc_profiler: i = data.policy_pool.update_scores(i, "return") # TODO: Update this for policy pool - for ii, ee in zip(i['learner'], env_id): - ii['env_id'] = ee - + for ii, ee in zip(i["learner"], env_id): + ii["env_id"] = ee with inference_profiler, torch.no_grad(): o = torch.as_tensor(o) @@ -306,7 +315,8 @@ def evaluate(data): ) actions, logprob, value, next_lstm_state = data.policy_pool.forwards( - o.to(data.device), next_lstm_state) + o.to(data.device), next_lstm_state + ) if next_lstm_state is not None: h, c = next_lstm_state @@ -315,16 +325,15 @@ def evaluate(data): value = value.flatten() - with misc_profiler: actions = actions.cpu().numpy() - + # Index alive mask with policy pool idxs... # TODO: Find a way to avoid having to do this learner_mask = torch.Tensor(mask * data.policy_pool.mask) # Ensure indices do not exceed batch size - indices = torch.where(learner_mask)[0][:config.batch_size - ptr + 1].numpy() + indices = torch.where(learner_mask)[0][: config.batch_size - ptr + 1].numpy() end = ptr + len(indices) # Batch indexing @@ -372,8 +381,8 @@ def evaluate(data): data.stats = {} # Get stats only from the learner - for k, v in infos['learner'].items(): - try: # TODO: Better checks on log data types + for k, v in infos["learner"].items(): + try: # TODO: Better checks on log data types # Skip the unnecessary info from the stats if not any(skip in k for skip in SKIP_LOG_KEYS): data.stats[k] = np.mean(v) @@ -385,11 +394,11 @@ def evaluate(data): return data.stats, infos + @pufferlib.utils.profile def train(data): if done_training(data): - raise RuntimeError( - f"Max training updates {data.total_updates} already reached") + raise RuntimeError(f"Max training updates {data.total_updates} already reached") config = data.config # assert data.num_steps % bptt_horizon == 0, "num_steps must be divisible by bptt_horizon" @@ -420,9 +429,7 @@ def train(data): nextnonterminal = 1.0 - data.dones[i_nxt] nextvalues = data.values[i_nxt] delta = ( - data.rewards[i_nxt] - + config.gamma * nextvalues * nextnonterminal - - data.values[i] + data.rewards[i_nxt] + config.gamma * nextvalues * nextnonterminal - data.values[i] ) advantages[t] = lastgaelam = ( delta + config.gamma * config.gae_lambda * nextnonterminal * lastgaelam @@ -430,14 +437,10 @@ def train(data): # Flatten the batch data.b_obs = b_obs = torch.Tensor(data.obs_ary[b_idxs]) - b_actions = torch.Tensor(data.actions_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_logprobs = torch.Tensor(data.logprobs_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_dones = torch.Tensor(data.dones_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_values = torch.Tensor(data.values_ary[b_idxs] - ).to(data.device, non_blocking=True) + b_actions = torch.Tensor(data.actions_ary[b_idxs]).to(data.device, non_blocking=True) + b_logprobs = torch.Tensor(data.logprobs_ary[b_idxs]).to(data.device, non_blocking=True) + b_dones = torch.Tensor(data.dones_ary[b_idxs]).to(data.device, non_blocking=True) + b_values = torch.Tensor(data.values_ary[b_idxs]).to(data.device, non_blocking=True) b_advantages = advantages.reshape( config.batch_rows, num_minibatches, 
config.bptt_horizon ).transpose(0, 1) @@ -446,22 +449,23 @@ def train(data): # Optimizing the policy and value network train_time = time.time() pg_losses, entropy_losses, v_losses, clipfracs, old_kls, kls = [], [], [], [], [], [] - mb_obs_buffer = torch.zeros_like(b_obs[0], pin_memory=(data.device=="cuda")) + mb_obs_buffer = torch.zeros_like(b_obs[0], pin_memory=(data.device == "cuda")) for epoch in range(config.update_epochs): lstm_state = None for mb in range(num_minibatches): mb_obs_buffer.copy_(b_obs[mb], non_blocking=True) mb_obs = mb_obs_buffer.to(data.device, non_blocking=True) - #mb_obs = b_obs[mb].to(data.device, non_blocking=True) + # mb_obs = b_obs[mb].to(data.device, non_blocking=True) mb_actions = b_actions[mb].contiguous() mb_values = b_values[mb].reshape(-1) mb_advantages = b_advantages[mb].reshape(-1) mb_returns = b_returns[mb].reshape(-1) - if hasattr(data.agent, 'lstm'): + if hasattr(data.agent, "lstm"): _, newlogprob, entropy, newvalue, lstm_state = data.agent( - mb_obs, state=lstm_state, action=mb_actions) + mb_obs, state=lstm_state, action=mb_actions + ) lstm_state = (lstm_state[0].detach(), lstm_state[1].detach()) else: _, newlogprob, entropy, newvalue = data.agent( @@ -478,9 +482,7 @@ def train(data): old_kls.append(old_approx_kl.item()) approx_kl = ((ratio - 1) - logratio).mean() kls.append(approx_kl.item()) - clipfracs += [ - ((ratio - 1.0).abs() > config.clip_coef).float().mean().item() - ] + clipfracs += [((ratio - 1.0).abs() > config.clip_coef).float().mean().item()] mb_advantages = mb_advantages.reshape(-1) if config.norm_adv: @@ -554,7 +556,8 @@ def train(data): data.update += 1 if data.update % config.checkpoint_interval == 0 or done_training(data): - save_checkpoint(data) + save_checkpoint(data) + def close(data): data.pool.close() @@ -567,15 +570,17 @@ def close(data): data.wandb.run.log_artifact(artifact) data.wandb.finish() + def done_training(data): return data.update >= data.total_updates + def save_checkpoint(data): path = os.path.join(data.config.data_dir, data.exp_name) if not os.path.exists(path): os.makedirs(path) - model_name = f'model_{data.update:06d}.pt' + model_name = f"model_{data.update:06d}.pt" model_path = os.path.join(path, model_name) # Already saved @@ -593,17 +598,18 @@ def save_checkpoint(data): } if data.wandb: - state['exp_name'] = data.exp_name + state["exp_name"] = data.exp_name - state_path = os.path.join(path, 'trainer_state.pt') - torch.save(state, state_path + '.tmp') - os.rename(state_path + '.tmp', state_path) + state_path = os.path.join(path, "trainer_state.pt") + torch.save(state, state_path + ".tmp") + os.rename(state_path + ".tmp", state_path) if data.config.verbose: print(f"Model saved to {model_path}") return model_path + def seed_everything(seed, torch_deterministic): random.seed(seed) np.random.seed(seed) @@ -620,6 +626,7 @@ def seed_everything(seed, torch_deterministic): # With torch >= 2.2, check also https://pytorch.org/docs/2.2/deterministic.html # torch.utils.deterministic.fill_uninitialized_memory = torch_deterministic + def unroll_nested_dict(d): if not isinstance(d, dict): return d @@ -631,36 +638,37 @@ def unroll_nested_dict(d): else: yield k, v + def print_dashboard(stats, init_performance, performance): output = [] data = {**init_performance, **performance} # Only show these stats in the dashboard - if 'length' in stats: - data['length'] = stats['length'] + if "length" in stats: + data["length"] = stats["length"] grouped_data = defaultdict(dict) for k, v in data.items(): - if k == 'total_uptime': + if k == 
"total_uptime": v = timedelta(seconds=v) - if 'memory' in k: + if "memory" in k: v = pufferlib.utils.format_bytes(v) - elif 'time' in k: + elif "time" in k: try: v = f"{v:.2f} s" except: pass - - first_word, *rest_words = k.split('_') - rest_words = ' '.join(rest_words).title() - + + first_word, *rest_words = k.split("_") + rest_words = " ".join(rest_words).title() + grouped_data[first_word][rest_words] = v - + for main_key, sub_dict in grouped_data.items(): output.append(f"{main_key.title()}") for sub_key, sub_value in sub_dict.items(): output.append(f" {sub_key}: {sub_value}") - + print("\033c", end="") - print('\n'.join(output)) - time.sleep(1/20) + print("\n".join(output)) + time.sleep(1 / 20) diff --git a/reinforcement_learning/environment.py b/reinforcement_learning/environment.py index 5426cbc9..d1720148 100644 --- a/reinforcement_learning/environment.py +++ b/reinforcement_learning/environment.py @@ -5,7 +5,7 @@ from pettingzoo.utils.wrappers.base_parallel import BaseParallelWrapper -import nmmo +import nmmo import nmmo.core.config as nc import nmmo.core.game_api as ng @@ -13,9 +13,21 @@ def alt_combat_damage_formula(offense, defense, multiplier, minimum_proportion): return int(max(multiplier * offense - defense, offense * minimum_proportion)) -class Config(nc.Medium, nc.Terrain, nc.Resource, nc.Combat, nc.NPC, nc.Progression, - nc.Item, nc.Equipment, nc.Profession, nc.Exchange): - '''Configuration for Neural MMO.''' + +class Config( + nc.Medium, + nc.Terrain, + nc.Resource, + nc.Combat, + nc.NPC, + nc.Progression, + nc.Item, + nc.Equipment, + nc.Profession, + nc.Exchange, +): + """Configuration for Neural MMO.""" + def __init__(self, env_args: Namespace): super().__init__() @@ -25,8 +37,11 @@ def __init__(self, env_args: Namespace): self.set("PLAYER_N", env_args.num_agents) self.set("HORIZON", env_args.max_episode_length) self.set("MAP_N", env_args.num_maps) - self.set("PLAYER_DEATH_FOG", env_args.death_fog_tick if isinstance(env_args.death_fog_tick, int) else None) - self.set("PATH_MAPS", f'{env_args.maps_path}/{env_args.map_size}/') + self.set( + "PLAYER_DEATH_FOG", + env_args.death_fog_tick if isinstance(env_args.death_fog_tick, int) else None, + ) + self.set("PATH_MAPS", f"{env_args.maps_path}/{env_args.map_size}/") self.set("MAP_CENTER", env_args.map_size) self.set("NPC_N", env_args.num_npcs) self.set("TASK_EMBED_DIM", env_args.task_size) @@ -66,8 +81,9 @@ def __init__(self, env_args: Namespace): def make_env_creator(reward_wrapper_cls: BaseParallelWrapper): def env_creator(*args, **kwargs): """Create an environment.""" - env = nmmo.Env(Config(kwargs['env'])) # args.env is provided as kwargs - env = reward_wrapper_cls(env, **kwargs['reward_wrapper']) + env = nmmo.Env(Config(kwargs["env"])) # args.env is provided as kwargs + env = reward_wrapper_cls(env, **kwargs["reward_wrapper"]) env = pufferlib.emulation.PettingZooPufferEnv(env) return env + return env_creator diff --git a/reinforcement_learning/stat_wrapper.py b/reinforcement_learning/stat_wrapper.py index 0ae4a475..0a8f09fd 100644 --- a/reinforcement_learning/stat_wrapper.py +++ b/reinforcement_learning/stat_wrapper.py @@ -22,19 +22,19 @@ def __init__( self._stat_prefix = stat_prefix def observation(self, agent_id, agent_obs): - '''Called before observations are returned from the environment - Use this to define custom featurizers. Changing the space itself requires you to - define the observation space again (i.e. 
Gym.spaces.Dict(gym.spaces....))''' + """Called before observations are returned from the environment + Use this to define custom featurizers. Changing the space itself requires you to + define the observation space again (i.e. Gym.spaces.Dict(gym.spaces....))""" # NOTE: nmmo minigame obs is mapping proxy, which doesn't work with pufferlib flatten return agent_obs def action(self, agent_id, agent_atn): - '''Called before actions are passed from the model to the environment''' + """Called before actions are passed from the model to the environment""" return agent_atn def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncated, info): - '''Called on reward, terminated, truncated, and info before they are returned from the environment - Use this to define custom reward shaping.''' + """Called on reward, terminated, truncated, and info before they are returned from the environment + Use this to define custom reward shaping.""" return reward, terminated, truncated, info @property @@ -42,7 +42,7 @@ def agents(self): return [] if self.env_done else self.env.agents def reset(self, **kwargs): - '''Called at the start of each episode''' + """Called at the start of each episode""" self._reset_episode_stats() obs, info = self.env.reset(**kwargs) @@ -51,7 +51,7 @@ def reset(self, **kwargs): return obs, info def step(self, action): - assert len(self.env.agents) > 0, 'No agents in the environment' # xcxc: sanity check + assert len(self.env.agents) > 0, "No agents in the environment" # xcxc: sanity check # Modify actions before they are passed to the environment for agent_id in self.env.agents: @@ -81,7 +81,7 @@ def step(self, action): if self.env_done: # To mark the end of the episode. Only one agent's done flag is enough. - infos[agent_id]['episode_done'] = True + infos[agent_id]["episode_done"] = True return obs, rewards, terms, truncs, infos @@ -91,17 +91,17 @@ def _reset_episode_stats(self): self.cum_rewards = {agent_id: 0 for agent_id in self.env.possible_agents} self._unique_events = { agent_id: { - 'experienced': set(), - 'prev_count': 0, - 'curr_count': 0, + "experienced": set(), + "prev_count": 0, + "curr_count": 0, } for agent_id in self.env.possible_agents } def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, info): - '''Update stats + info and save replays.''' + """Update stats + info and save replays.""" # Remove the task from info. Curriculum info is processed in _update_stats() - info.pop('task', None) + info.pop("task", None) # Handle early stopping if self.env_done and not terminated: @@ -111,56 +111,54 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, realm = self.env.realm tick_log = realm.event_log.get_data(agents=[agent_id], tick=-1) # get only the last tick uniq = self._unique_events[agent_id] - uniq['prev_count'] = uniq['curr_count'] - uniq['curr_count'] += count_unique_events(tick_log, uniq['experienced']) + uniq["prev_count"] = uniq["curr_count"] + uniq["curr_count"] += count_unique_events(tick_log, uniq["experienced"]) if not (terminated or truncated): self.cum_rewards[agent_id] += reward return truncated, info # The agent is terminated or truncated, so recoding the episode stats - if 'stats' not in info: - info['stats'] = {} + if "stats" not in info: + info["stats"] = {} agent = realm.players.dead_this_tick.get(agent_id, realm.players.get(agent_id)) assert agent is not None # NOTE: this may not be true when players can be resurrected. 
Check back later - info['length'] = realm.tick + info["length"] = realm.tick - info['return'] = self.cum_rewards[agent_id] + info["return"] = self.cum_rewards[agent_id] # Cause of Deaths if terminated: - info['stats']['cod/attacked'] = 1.0 if agent.damage.val > 0 else 0.0 - info['stats']['cod/starved'] = 1.0 if agent.food.val == 0 else 0.0 - info['stats']['cod/dehydrated'] = 1.0 if agent.water.val == 0 else 0.0 + info["stats"]["cod/attacked"] = 1.0 if agent.damage.val > 0 else 0.0 + info["stats"]["cod/starved"] = 1.0 if agent.food.val == 0 else 0.0 + info["stats"]["cod/dehydrated"] = 1.0 if agent.water.val == 0 else 0.0 else: - info['stats']['cod/attacked'] = 0 - info['stats']['cod/starved'] = 0 - info['stats']['cod/dehydrated'] = 0 + info["stats"]["cod/attacked"] = 0 + info["stats"]["cod/starved"] = 0 + info["stats"]["cod/dehydrated"] = 0 # Task-related stats task = self.env.agent_task_map[agent_id][0] # consider only the first task - info['stats']['task/completed'] = 1.0 if task.completed else 0.0 - info['stats']['task/pcnt_2_reward_signal'] = 1.0 if task.reward_signal_count >= 2 else 0.0 - info['stats']['task/pcnt_0p2_max_progress'] = 1.0 if task._max_progress >= 0.2 else 0.0 - info['curriculum'] = { - task.spec_name: (task._max_progress, task.reward_signal_count) - } + info["stats"]["task/completed"] = 1.0 if task.completed else 0.0 + info["stats"]["task/pcnt_2_reward_signal"] = 1.0 if task.reward_signal_count >= 2 else 0.0 + info["stats"]["task/pcnt_0p2_max_progress"] = 1.0 if task._max_progress >= 0.2 else 0.0 + info["curriculum"] = {task.spec_name: (task._max_progress, task.reward_signal_count)} if self.eval_mode: # 'return' is used for ranking in the eval mode, so put the task progress here - info['return'] = task._max_progress # this is 1 if done + info["return"] = task._max_progress # this is 1 if done # Max combat/harvest level achieved - info['stats']['achieved/max_combat_level'] = agent.attack_level - info['stats']['achieved/max_harvest_skill_ammo'] = max( + info["stats"]["achieved/max_combat_level"] = agent.attack_level + info["stats"]["achieved/max_harvest_skill_ammo"] = max( agent.prospecting_level.val, agent.carving_level.val, agent.alchemy_level.val, ) - info['stats']['achieved/max_harvest_skill_consum'] = max( + info["stats"]["achieved/max_harvest_skill_consum"] = max( agent.fishing_level.val, agent.herbalism_level.val, ) @@ -168,7 +166,7 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, # Event-based stats achieved, performed, _ = process_event_log(realm, [agent_id]) for key, val in list(achieved.items()) + list(performed.items()): - info['stats'][key] = float(val) + info["stats"][key] = float(val) if self._stat_prefix: info = {self._stat_prefix: info} @@ -180,33 +178,32 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, # Event processing utilities for Neural MMO. 
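# --- Editor's aside (illustrative sketch, not part of the patch) -----------------------
# The info["curriculum"] entry recorded above, {spec_name: (max_progress, reward_signal_count)},
# is the kind of signal that LearnableTaskSampler.update() in
# curriculum_generation/task_sampler.py consumes (keyed as "curriculum/<spec_name>") to
# estimate per-task completion rates. A toy version of that bookkeeping, using a made-up
# spec name and a hand-written batch of (progress, reward_count) tuples:
import numpy as np

episode_infos = {"curriculum/Task_HypotheticalSpec": [[(1.0, 3), (0.1, 1), (0.4, 2)]]}
for key, batches in episode_infos.items():
    completed = [float(prog >= 1) for sublist in batches for prog, _ in sublist]
    rewarded_twice = [float(rcnt >= 2) for sublist in batches for _, rcnt in sublist]
    print(key.replace("curriculum/", ""), np.mean(completed), np.mean(rewarded_twice))
# ----------------------------------------------------------------------------------------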
INFO_KEY_TO_EVENT_CODE = { - 'event/' + evt.lower(): val - for evt, val in EventCode.__dict__.items() - if isinstance(val, int) + "event/" + evt.lower(): val for evt, val in EventCode.__dict__.items() if isinstance(val, int) } # convert the numbers into binary (performed or not) for the key events KEY_EVENT = [ - 'eat_food', - 'drink_water', - 'score_hit', - 'player_kill', - 'consume_item', - 'harvest_item', - 'list_item', - 'buy_item', + "eat_food", + "drink_water", + "score_hit", + "player_kill", + "consume_item", + "harvest_item", + "list_item", + "buy_item", ] ITEM_TYPE = { - 'armor': [item.ITEM_TYPE_ID for item in Item.ARMOR], - 'weapon': [item.ITEM_TYPE_ID for item in Item.WEAPON], - 'tool': [item.ITEM_TYPE_ID for item in Item.TOOL], - 'ammo': [item.ITEM_TYPE_ID for item in Item.AMMUNITION], - 'consumable': [item.ITEM_TYPE_ID for item in Item.CONSUMABLE], + "armor": [item.ITEM_TYPE_ID for item in Item.ARMOR], + "weapon": [item.ITEM_TYPE_ID for item in Item.WEAPON], + "tool": [item.ITEM_TYPE_ID for item in Item.TOOL], + "ammo": [item.ITEM_TYPE_ID for item in Item.AMMUNITION], + "consumable": [item.ITEM_TYPE_ID for item in Item.CONSUMABLE], } + def process_event_log(realm, agent_list): - '''Process the event log and extract performed actions and achievements.''' + """Process the event log and extract performed actions and achievements.""" log = realm.event_log.get_data(agents=agent_list) attr_to_col = realm.event_log.attr_to_col @@ -214,60 +211,68 @@ def process_event_log(realm, agent_list): event_cnt = {} for key, code in INFO_KEY_TO_EVENT_CODE.items(): # count the freq of each event - event_cnt[key] = int(sum(log[:, attr_to_col['event']] == code)) + event_cnt[key] = int(sum(log[:, attr_to_col["event"]] == code)) # record true or false for each event performed = {} for evt in KEY_EVENT: - key = 'event/' + evt + key = "event/" + evt performed[key] = event_cnt[key] > 0 # check if tools, weapons, ammos, ammos were equipped for item_type, item_ids in ITEM_TYPE.items(): - if item_type == 'consumable': + if item_type == "consumable": continue - key = 'event/equip_' + item_type - idx = (log[:, attr_to_col['event']] == EventCode.EQUIP_ITEM) & \ - np.in1d(log[:, attr_to_col['item_type']], item_ids) + key = "event/equip_" + item_type + idx = (log[:, attr_to_col["event"]] == EventCode.EQUIP_ITEM) & np.in1d( + log[:, attr_to_col["item_type"]], item_ids + ) performed[key] = sum(idx) > 0 # check if weapon was harvested - key = 'event/harvest_weapon' - idx = (log[:, attr_to_col['event']] == EventCode.HARVEST_ITEM) & \ - np.in1d(log[:, attr_to_col['item_type']], ITEM_TYPE['weapon']) + key = "event/harvest_weapon" + idx = (log[:, attr_to_col["event"]] == EventCode.HARVEST_ITEM) & np.in1d( + log[:, attr_to_col["item_type"]], ITEM_TYPE["weapon"] + ) performed[key] = sum(idx) > 0 # record important achievements achieved = {} # get progress to center - idx = log[:, attr_to_col['event']] == EventCode.GO_FARTHEST - achieved['achieved/max_progress_to_center'] = \ - int(max(log[idx, attr_to_col['distance']])) if sum(idx) > 0 else 0 + idx = log[:, attr_to_col["event"]] == EventCode.GO_FARTHEST + achieved["achieved/max_progress_to_center"] = ( + int(max(log[idx, attr_to_col["distance"]])) if sum(idx) > 0 else 0 + ) # get earned gold - idx = log[:, attr_to_col['event']] == EventCode.EARN_GOLD - achieved['achieved/earned_gold'] = int(sum(log[idx, attr_to_col['gold']])) + idx = log[:, attr_to_col["event"]] == EventCode.EARN_GOLD + achieved["achieved/earned_gold"] = int(sum(log[idx, attr_to_col["gold"]])) # get max 
damage - idx = log[:, attr_to_col['event']] == EventCode.SCORE_HIT - achieved['achieved/max_damage'] = int(max(log[idx, attr_to_col['damage']])) if sum(idx) > 0 else 0 + idx = log[:, attr_to_col["event"]] == EventCode.SCORE_HIT + achieved["achieved/max_damage"] = ( + int(max(log[idx, attr_to_col["damage"]])) if sum(idx) > 0 else 0 + ) # get max possessed item levels: from harvesting, looting, buying - idx = np.in1d(log[:, attr_to_col['event']], - [EventCode.HARVEST_ITEM, EventCode.LOOT_ITEM, EventCode.BUY_ITEM]) + idx = np.in1d( + log[:, attr_to_col["event"]], + [EventCode.HARVEST_ITEM, EventCode.LOOT_ITEM, EventCode.BUY_ITEM], + ) if sum(idx) > 0: for item_type, item_ids in ITEM_TYPE.items(): - idx_item = np.in1d(log[idx, attr_to_col['item_type']], item_ids) + idx_item = np.in1d(log[idx, attr_to_col["item_type"]], item_ids) if sum(idx_item) > 0: - achieved['achieved/max_' + item_type + '_level'] = \ - int(max(log[idx][idx_item, attr_to_col['level']])) + achieved["achieved/max_" + item_type + "_level"] = int( + max(log[idx][idx_item, attr_to_col["level"]]) + ) # other notable achievements - idx = (log[:, attr_to_col['event']] == EventCode.PLAYER_KILL) - achieved['achieved/agent_kill_count'] = int(sum(idx & (log[:, attr_to_col['target_ent']] > 0))) - achieved['achieved/npc_kill_count'] = int(sum(idx & (log[:, attr_to_col['target_ent']] < 0))) - achieved['achieved/unique_events'] = count_unique_events(log, set()) + idx = log[:, attr_to_col["event"]] == EventCode.PLAYER_KILL + achieved["achieved/agent_kill_count"] = int(sum(idx & (log[:, attr_to_col["target_ent"]] > 0))) + achieved["achieved/npc_kill_count"] = int(sum(idx & (log[:, attr_to_col["target_ent"]] < 0))) + achieved["achieved/unique_events"] = count_unique_events(log, set()) return achieved, performed, event_cnt @@ -275,8 +280,8 @@ def process_event_log(realm, agent_list): # These events are important, so count them even though they are not unique EVERY_EVENT_TO_COUNT = set([EventCode.PLAYER_KILL, EventCode.EARN_GOLD]) -def count_unique_events(tick_log, experienced, - every_event_to_count=EVERY_EVENT_TO_COUNT): + +def count_unique_events(tick_log, experienced, every_event_to_count=EVERY_EVENT_TO_COUNT): cnt_unique = 0 if len(tick_log) == 0: return cnt_unique diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index eb79acb6..00000000 --- a/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -accelerate==0.27.2 -gymnasium==0.29.1 -#nmmo # https://github.com/kywch/nmmo-environment -polars -pufferlib==0.7.3 -psutil -torch==2.1.0 -transformers==4.37.2 -wandb diff --git a/tests/test_task_encoder.py b/tests/test_task_encoder.py index 7ef20f5c..97bacd0a 100644 --- a/tests/test_task_encoder.py +++ b/tests/test_task_encoder.py @@ -10,50 +10,48 @@ # NOTE: different LLMs will give different embedding dimensions EMBEDDING_DIM = 4096 + class TestTaskEncoder(unittest.TestCase): - # pylint: disable=protected-access,bad-builtin - @classmethod - def setUpClass(cls): - cls.task_encoder = TaskEncoder( - LLM_CHECKPOINT, curriculum_generation.manual_curriculum, batch_size=4 - ) - - @classmethod - def tearDownClass(cls): - cls.task_encoder.close() - - def test_embed_dim(self): - self.assertEqual(self.task_encoder.embed_dim, EMBEDDING_DIM) - - def test_task_encoder_api(self): - task_spec_with_embedding = self.task_encoder.get_task_embedding( - curriculum_generation.manual_curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) - - for single_spec in task_spec_with_embedding: - self.assertFalse(sum(single_spec.embedding) == 0) - - 
def test_get_task_deps_src(self): - custom_fn = curriculum_generation.manual_curriculum.PracticeInventoryManagement - fn_src, deps_src = self.task_encoder._get_task_deps_src(custom_fn) - - self.assertEqual( - fn_src, - "def PracticeInventoryManagement(gs, subject, space, num_tick):\n " - + - "return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)\n", - ) - self.assertTrue("def InventorySpaceGE(" in deps_src) - self.assertTrue("def TickGE(" in deps_src) - - def test_contruct_prompt(self): - single_spec = random.choice(curriculum_generation.manual_curriculum.curriculum) - prompt = self.task_encoder._construct_prompt( - single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs - ) - print(prompt) + @classmethod + def setUpClass(cls): + cls.task_encoder = TaskEncoder( + LLM_CHECKPOINT, curriculum_generation.manual_curriculum, batch_size=4 + ) + + @classmethod + def tearDownClass(cls): + cls.task_encoder.close() + + def test_embed_dim(self): + self.assertEqual(self.task_encoder.embed_dim, EMBEDDING_DIM) + + def test_task_encoder_api(self): + task_spec_with_embedding = self.task_encoder.get_task_embedding( + curriculum_generation.manual_curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH + ) + + for single_spec in task_spec_with_embedding: + self.assertFalse(sum(single_spec.embedding) == 0) + + def test_get_task_deps_src(self): + custom_fn = curriculum_generation.manual_curriculum.PracticeInventoryManagement + fn_src, deps_src = self.task_encoder._get_task_deps_src(custom_fn) + + self.assertEqual( + fn_src, + "def PracticeInventoryManagement(gs, subject, space, num_tick):\n " + + "return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)\n", + ) + self.assertTrue("def InventorySpaceGE(" in deps_src) + self.assertTrue("def TickGE(" in deps_src) + + def test_contruct_prompt(self): + single_spec = random.choice(curriculum_generation.manual_curriculum.curriculum) + prompt = self.task_encoder._construct_prompt( + single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs + ) + print(prompt) if __name__ == "__main__": - unittest.main() + unittest.main() diff --git a/train.py b/train.py index 7ffe135c..f697d934 100644 --- a/train.py +++ b/train.py @@ -1,4 +1,4 @@ -from pdb import set_trace as T +# from pdb import set_trace as T import importlib import argparse import inspect @@ -19,12 +19,12 @@ def load_from_config(agent, debug=False): - with open('config.yaml') as f: + with open("config.yaml") as f: config = yaml.safe_load(f) - default_keys = 'agent_zoo env train policy recurrent sweep_metadata sweep_metric sweep wandb reward_wrapper'.split() + default_keys = "agent_zoo env train policy recurrent sweep_metadata sweep_metric sweep wandb reward_wrapper".split() defaults = {key: config.get(key, {}) for key in default_keys} - debug_config = config.get('debug', {}) if debug else {} + debug_config = config.get("debug", {}) if debug else {} agent_config = config[agent] combined_config = {} @@ -35,15 +35,16 @@ def load_from_config(agent, debug=False): return pufferlib.namespace(**combined_config) + def get_init_args(fn): if fn is None: return {} sig = inspect.signature(fn) args = {} for name, param in sig.parameters.items(): - if name in ('self', 'env', 'policy'): + if name in ("self", "env", "policy"): continue - if name in ('agent_id', 'is_multiagent'): # Postprocessor args + if name in ("agent_id", "is_multiagent"): # Postprocessor args continue if param.kind == inspect.Parameter.VAR_POSITIONAL: continue @@ -53,12 +54,13 @@ def 
get_init_args(fn): args[name] = param.default if param.default is not inspect.Parameter.empty else None return args + # Return env_creator, agent_creator def setup_agent(module_name): try: - agent_module = importlib.import_module(f'agent_zoo.{module_name}') + agent_module = importlib.import_module(f"agent_zoo.{module_name}") except ModuleNotFoundError: - raise ValueError(f'Agent module {module_name} not found under the agent_zoo directory.') + raise ValueError(f"Agent module {module_name} not found under the agent_zoo directory.") env_creator = environment.make_env_creator(reward_wrapper_cls=agent_module.RewardWrapper) @@ -72,31 +74,32 @@ def agent_creator(env, args): return policy.to(args.train.device) init_args = { - 'policy': get_init_args(agent_module.Policy.__init__), - 'recurrent': get_init_args(agent_module.Recurrent.__init__), - 'reward_wrapper': get_init_args(agent_module.RewardWrapper.__init__), + "policy": get_init_args(agent_module.Policy.__init__), + "recurrent": get_init_args(agent_module.Recurrent.__init__), + "reward_wrapper": get_init_args(agent_module.RewardWrapper.__init__), } return agent_module, env_creator, agent_creator, init_args + def combine_config_args(parser, args, config): clean_parser = argparse.ArgumentParser(parents=[parser]) for name, sub_config in config.items(): args[name] = {} for key, value in sub_config.items(): - data_key = f'{name}.{key}' - cli_key = f'--{data_key}'.replace('_', '-') + data_key = f"{name}.{key}" + cli_key = f"--{data_key}".replace("_", "-") if isinstance(value, bool) and value is False: - parser.add_argument(cli_key, default=value, action='store_true') - clean_parser.add_argument(cli_key, default=value, action='store_true') + parser.add_argument(cli_key, default=value, action="store_true") + clean_parser.add_argument(cli_key, default=value, action="store_true") elif isinstance(value, bool) and value is True: - data_key = f'{name}.no_{key}' - cli_key = f'--{data_key}'.replace('_', '-') - parser.add_argument(cli_key, default=value, action='store_false') - clean_parser.add_argument(cli_key, default=value, action='store_false') + data_key = f"{name}.no_{key}" + cli_key = f"--{data_key}".replace("_", "-") + parser.add_argument(cli_key, default=value, action="store_false") + clean_parser.add_argument(cli_key, default=value, action="store_false") else: parser.add_argument(cli_key, default=value, type=type(value)) - clean_parser.add_argument(cli_key, default=value, metavar='', type=type(value)) + clean_parser.add_argument(cli_key, default=value, metavar="", type=type(value)) args[name][key] = getattr(parser.parse_known_args()[0], data_key) args[name] = pufferlib.namespace(**args[name]) @@ -104,6 +107,7 @@ def combine_config_args(parser, args, config): clean_parser.parse_args(sys.argv[1:]) return args + def update_args(args, mode=None): args = pufferlib.namespace(**args) @@ -111,14 +115,14 @@ def update_args(args, mode=None): args.env.curriculum_file_path = args.curriculum vec = args.vectorization - if vec == 'serial' or args.debug: + if vec == "serial" or args.debug: args.vectorization = pufferlib.vectorization.Serial - elif vec == 'multiprocessing': + elif vec == "multiprocessing": args.vectorization = pufferlib.vectorization.Multiprocessing - elif vec == 'ray': + elif vec == "ray": args.vectorization = pufferlib.vectorization.Ray else: - raise ValueError('Invalid --vectorization (serial/multiprocessing/ray).') + raise ValueError("Invalid --vectorization (serial/multiprocessing/ray).") # TODO: load the trained baseline from wandb # elif 
args.baseline: @@ -135,8 +139,8 @@ def update_args(args, mode=None): # model_file = max(os.listdir(data_dir)) # args.eval_model_path = os.path.join(data_dir, model_file) - if mode in ['evaluate', 'replay']: - assert args.eval_model_path is not None, 'Eval mode requires a path to checkpoints' + if mode in ["evaluate", "replay"]: + assert args.eval_model_path is not None, "Eval mode requires a path to checkpoints" args.track = False # Disable env pool - see the comment about next_lstm_state in clean_pufferl.evaluate() args.train.env_pool = False @@ -144,63 +148,86 @@ def update_args(args, mode=None): args.reward_wrapper.eval_mode = True args.reward_wrapper.early_stop_agent_num = 0 - if mode == 'replay': + if mode == "replay": args.train.num_envs = args.train.envs_per_worker = args.train.envs_per_batch = 1 args.vectorization = pufferlib.vectorization.Serial return args -if __name__ == '__main__': + +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Parse environment argument', add_help=False) - parser.add_argument('-m', '--mode', type=str, default='train', choices='train sweep replay'.split()) - parser.add_argument('-a', '--agent', type=str, default='neurips23_start_kit', help='Agent module to use') - parser.add_argument('-n', '--exp-name', type=str, default=None, help="Need exp name to resume the experiment") - parser.add_argument('-p', '--eval-model-path', type=str, default=None, help='Path to model to evaluate') - parser.add_argument('-c', '--curriculum', type=str, default=BASELINE_CURRICULUM, help='Path to curriculum file') - parser.add_argument('-t', '--task-to-assign', type=int, default=None, - help='The index of the task to assign in the curriculum file') - #parser.add_argument('--baseline', action='store_true', help='Baseline run') - parser.add_argument('--vectorization', type=str, default='multiprocessing', choices='serial multiprocessing ray'.split()) - parser.add_argument('--no-recurrence', action='store_true', help='Do not use recurrence') + parser = argparse.ArgumentParser(description="Parse environment argument", add_help=False) + parser.add_argument( + "-m", "--mode", type=str, default="train", choices="train sweep replay".split() + ) + parser.add_argument( + "-a", "--agent", type=str, default="neurips23_start_kit", help="Agent module to use" + ) + parser.add_argument( + "-n", "--exp-name", type=str, default=None, help="Need exp name to resume the experiment" + ) + parser.add_argument( + "-p", "--eval-model-path", type=str, default=None, help="Path to model to evaluate" + ) + parser.add_argument( + "-c", "--curriculum", type=str, default=BASELINE_CURRICULUM, help="Path to curriculum file" + ) + parser.add_argument( + "-t", + "--task-to-assign", + type=int, + default=None, + help="The index of the task to assign in the curriculum file", + ) + # parser.add_argument('--baseline', action='store_true', help='Baseline run') + parser.add_argument( + "--vectorization", + type=str, + default="multiprocessing", + choices="serial multiprocessing ray".split(), + ) + parser.add_argument("--no-recurrence", action="store_true", help="Do not use recurrence") if DEBUG: - parser.add_argument('--no-track', default=True, help='Do NOT track on WandB') - parser.add_argument('--debug', default=True, help='Debug mode') + parser.add_argument("--no-track", default=True, help="Do NOT track on WandB") + parser.add_argument("--debug", default=True, help="Debug mode") else: - parser.add_argument('--no-track', action='store_true', help='Do NOT 
track on WandB') - parser.add_argument('--debug', action='store_true', help='Debug mode') + parser.add_argument("--no-track", action="store_true", help="Do NOT track on WandB") + parser.add_argument("--debug", action="store_true", help="Debug mode") args = parser.parse_known_args()[0].__dict__ - config = load_from_config(args['agent'], debug=args.get('debug', False)) - agent_module, env_creator, agent_creator, init_args = setup_agent(args['agent']) + config = load_from_config(args["agent"], debug=args.get("debug", False)) + agent_module, env_creator, agent_creator, init_args = setup_agent(args["agent"]) # Update config with environment defaults - config.policy = {**init_args['policy'], **config.policy} - config.recurrent = {**init_args['recurrent'], **config.recurrent} - config.reward_wrapper = {**init_args['reward_wrapper'], **config.reward_wrapper} + config.policy = {**init_args["policy"], **config.policy} + config.recurrent = {**init_args["recurrent"], **config.recurrent} + config.reward_wrapper = {**init_args["reward_wrapper"], **config.reward_wrapper} # Generate argparse menu from config args = combine_config_args(parser, args, config) # Perform mode-specific updates - args = update_args(args, mode=args['mode']) + args = update_args(args, mode=args["mode"]) if args.train.env_pool is True: - logging.warning('Env_pool is enabled. This may increase training speed but break determinism.') + logging.warning( + "Env_pool is enabled. This may increase training speed but break determinism." + ) if args.track: args.exp_name = init_wandb(args).id else: args.exp_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" - if args.mode == 'train': + if args.mode == "train": train(args, env_creator, agent_creator) exit(0) - elif args.mode == 'sweep': + elif args.mode == "sweep": sweep(args, env_creator, agent_creator) exit(0) - elif args.mode == 'replay': + elif args.mode == "replay": generate_replay(args, env_creator, agent_creator) exit(0) else: - raise ValueError('Mode must be one of train, sweep, or evaluate') + raise ValueError("Mode must be one of train, sweep, or evaluate") diff --git a/train_helper.py b/train_helper.py index 813defc6..20ce4530 100644 --- a/train_helper.py +++ b/train_helper.py @@ -5,14 +5,9 @@ import dill import wandb -# Related to torch.use_deterministic_algorithms(True) -# See also https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility -os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' - import torch import numpy as np -import pufferlib import pufferlib.policy_pool as pp from nmmo.render.replay_helper import FileReplayHelper @@ -20,40 +15,45 @@ from reinforcement_learning import clean_pufferl +# Related to torch.use_deterministic_algorithms(True) +# See also https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + def init_wandb(args, resume=True): if args.no_track: return None - assert args.wandb.project is not None, 'Please set the wandb project in config.yaml' - assert args.wandb.entity is not None, 'Please set the wandb entity in config.yaml' + assert args.wandb.project is not None, "Please set the wandb project in config.yaml" + assert args.wandb.entity is not None, "Please set the wandb entity in config.yaml" wandb_kwargs = { - 'id': args.exp_name or wandb.util.generate_id(), - 'project': args.wandb.project, - 'entity': args.wandb.entity, - 'config': { - 'cleanrl': args.train, - 'env': args.env, - 'agent_zoo': args.agent, - 'policy': args.policy, - 'recurrent': args.recurrent, - 
'reward_wrapper': args.reward_wrapper, + "id": args.exp_name or wandb.util.generate_id(), + "project": args.wandb.project, + "entity": args.wandb.entity, + "config": { + "cleanrl": args.train, + "env": args.env, + "agent_zoo": args.agent, + "policy": args.policy, + "recurrent": args.recurrent, + "reward_wrapper": args.reward_wrapper, }, - 'name': args.exp_name, - 'monitor_gym': True, - 'save_code': True, - 'resume': resume, + "name": args.exp_name, + "monitor_gym": True, + "save_code": True, + "resume": resume, } if args.wandb.group is not None: - wandb_kwargs['group'] = args.wandb.group + wandb_kwargs["group"] = args.wandb.group return wandb.init(**wandb_kwargs) + def train(args, env_creator, agent_creator): data = clean_pufferl.create( config=args.train, agent_creator=agent_creator, - agent_kwargs={'args': args}, + agent_kwargs={"args": args}, env_creator=env_creator, - env_creator_kwargs={'env': args.env, 'reward_wrapper': args.reward_wrapper}, + env_creator_kwargs={"env": args.env, "reward_wrapper": args.reward_wrapper}, vectorization=args.vectorization, exp_name=args.exp_name, track=args.track, @@ -63,9 +63,10 @@ def train(args, env_creator, agent_creator): clean_pufferl.evaluate(data) clean_pufferl.train(data) - print('Done training. Saving data...') + print("Done training. Saving data...") clean_pufferl.close(data) - print('Run complete.') + print("Run complete.") + def sweep(args, env_creator, agent_creator): sweep_id = wandb.sweep(sweep=args.sweep, project=args.wandb.project) @@ -73,32 +74,32 @@ def sweep(args, env_creator, agent_creator): def main(): try: args.exp_name = init_wandb(args).id - if hasattr(wandb.config, 'train'): + if hasattr(wandb.config, "train"): # TODO: Add update method to namespace print(args.train.__dict__) print(wandb.config.train) args.train.__dict__.update(dict(wandb.config.train)) train(args, env_creator, agent_creator) - except Exception as e: + except Exception as e: # noqa: F841 import traceback + traceback.print_exc() wandb.agent(sweep_id, main, count=20) -def generate_replay(args, env_creator, agent_creator, - stop_when_all_complete_task=True, - seed=None): - assert args.eval_model_path is not None, 'eval_model_path must be set for replay generation' + +def generate_replay(args, env_creator, agent_creator, stop_when_all_complete_task=True, seed=None): + assert args.eval_model_path is not None, "eval_model_path must be set for replay generation" policies = pp.get_policy_names(args.eval_model_path) - assert len(policies) > 0, 'No policies found in eval_model_path' - logging.info(f'Policies to generate replay: {policies}') + assert len(policies) > 0, "No policies found in eval_model_path" + logging.info(f"Policies to generate replay: {policies}") save_dir = args.eval_model_path - logging.info('Replays will be saved to %s', save_dir) + logging.info("Replays will be saved to %s", save_dir) if seed is not None: args.train.seed = seed - logging.info('Seed: %d', args.train.seed) + logging.info("Seed: %d", args.train.seed) # Set the train config for replay args.train.num_envs = 1 @@ -110,15 +111,16 @@ def generate_replay(args, env_creator, agent_creator, args.reward_wrapper.early_stop_agent_num = 0 # Use the policy pool helper functions to create kernel (policy-agent mapping) - args.train.pool_kernel = pp.create_kernel(args.env.num_agents, len(policies), - shuffle_with_seed=args.train.seed) + args.train.pool_kernel = pp.create_kernel( + args.env.num_agents, len(policies), shuffle_with_seed=args.train.seed + ) data = clean_pufferl.create( config=args.train, 
agent_creator=agent_creator, - agent_kwargs={'args': args}, + agent_kwargs={"args": args}, env_creator=env_creator, - env_creator_kwargs={'env': args.env, 'reward_wrapper': args.reward_wrapper}, + env_creator_kwargs={"env": args.env, "reward_wrapper": args.reward_wrapper}, eval_mode=True, eval_model_path=args.eval_model_path, policy_selector=pp.AllPolicySelector(args.train.seed), @@ -131,25 +133,26 @@ def generate_replay(args, env_creator, agent_creator, nmmo_env.realm.record_replay(replay_helper) # Sanity checks for replay generation - assert len(policies) == len(data.policy_pool.current_policies), 'Policy count mismatch' - assert len(data.policy_pool.kernel) == nmmo_env.max_num_agents, 'Agent count mismatch' + assert len(policies) == len(data.policy_pool.current_policies), "Policy count mismatch" + assert len(data.policy_pool.kernel) == nmmo_env.max_num_agents, "Agent count mismatch" # Add the policy names to agent names if len(policies) > 1: for policy_id, samp in data.policy_pool.sample_idxs.items(): - policy_name = data.policy_pool.current_policies[policy_id]['name'] + policy_name = data.policy_pool.current_policies[policy_id]["name"] for idx in samp: agent_id = idx + 1 # agents are 0-indexed in policy_pool, but 1-indexed in nmmo - nmmo_env.realm.players[agent_id].name = f'{policy_name}_{agent_id}' + nmmo_env.realm.players[agent_id].name = f"{policy_name}_{agent_id}" # Assign the specified task to the agents, if provided if args.task_to_assign is not None: - with open(args.curriculum, 'rb') as f: - task_with_embedding = dill.load(f) # a list of TaskSpec - assert 0 <= args.task_to_assign < len(task_with_embedding), 'Task index out of range' + with open(args.curriculum, "rb") as f: + task_with_embedding = dill.load(f) # a list of TaskSpec + assert 0 <= args.task_to_assign < len(task_with_embedding), "Task index out of range" select_task = task_with_embedding[args.task_to_assign] - tasks = make_task_from_spec(nmmo_env.possible_agents, - [select_task] * len(nmmo_env.possible_agents)) + tasks = make_task_from_spec( + nmmo_env.possible_agents, [select_task] * len(nmmo_env.possible_agents) + ) # Reassign the task to the agents nmmo_env.tasks = tasks @@ -159,7 +162,7 @@ def generate_replay(args, env_creator, agent_creator, task_embedding = nmmo_env.agent_task_map[agent_id][0].embedding nmmo_env.obs[agent_id].gym_obs.reset(task_embedding) - print(f'All agents are assigned: {nmmo_env.tasks[0].spec_name}\n') + print(f"All agents are assigned: {nmmo_env.tasks[0].spec_name}\n") # Generate the replay replay_helper.reset() @@ -178,7 +181,8 @@ def generate_replay(args, env_creator, agent_creator, ) actions, logprob, value, next_lstm_state = data.policy_pool.forwards( - o.to(data.device), next_lstm_state) + o.to(data.device), next_lstm_state + ) if next_lstm_state is not None: h, c = next_lstm_state @@ -192,35 +196,39 @@ def generate_replay(args, env_creator, agent_creator, num_alive = len(nmmo_env.agents) task_done = sum(1 for task in nmmo_env.tasks if task.completed) - alive_done = sum(1 for task in nmmo_env.tasks - if task.completed and task.assignee[0] in nmmo_env.realm.players) - print('Tick:', nmmo_env.realm.tick, ', alive agents:', num_alive, ', task done:', task_done) + alive_done = sum( + 1 + for task in nmmo_env.tasks + if task.completed and task.assignee[0] in nmmo_env.realm.players + ) + print("Tick:", nmmo_env.realm.tick, ", alive agents:", num_alive, ", task done:", task_done) if num_alive == alive_done: - print('All alive agents completed the task.') + print("All alive agents completed 
the task.") break if num_alive == 0 or nmmo_env.realm.tick == args.env.max_episode_length: - print('All agents died or reached the max episode length.') + print("All agents died or reached the max episode length.") break # Count how many agents completed the task - print('--------------------------------------------------') - print('Task:', nmmo_env.tasks[0].spec_name) + print("--------------------------------------------------") + print("Task:", nmmo_env.tasks[0].spec_name) num_completed = sum(1 for task in nmmo_env.tasks if task.completed) - print('Number of agents completed the task:', num_completed) - avg_progress = np.mean([task.progress_info['max_progress'] for task in nmmo_env.tasks]) - print(f'Average maximum progress (max=1): {avg_progress:.3f}') - avg_completed_tick = 0 + print("Number of agents completed the task:", num_completed) + avg_progress = np.mean([task.progress_info["max_progress"] for task in nmmo_env.tasks]) + print(f"Average maximum progress (max=1): {avg_progress:.3f}") + avg_completed_tick = 0 if num_completed > 0: - avg_completed_tick = np.mean([task.progress_info['completed_tick'] - for task in nmmo_env.tasks if task.completed]) - print(f'Average completed tick: {avg_completed_tick:.1f}') + avg_completed_tick = np.mean( + [task.progress_info["completed_tick"] for task in nmmo_env.tasks if task.completed] + ) + print(f"Average completed tick: {avg_completed_tick:.1f}") # Save the replay file - replay_file = f'replay_seed_{args.train.seed}_' + replay_file = f"replay_seed_{args.train.seed}_" if args.task_to_assign is not None: - replay_file += f'task_{args.task_to_assign}_' - replay_file = os.path.join(save_dir, replay_file + time.strftime('%Y%m%d_%H%M%S')) - print(f'Saving replay to {replay_file}') + replay_file += f"task_{args.task_to_assign}_" + replay_file = os.path.join(save_dir, replay_file + time.strftime("%Y%m%d_%H%M%S")) + print(f"Saving replay to {replay_file}") replay_helper.save(replay_file, compress=True) clean_pufferl.close(data)