From 47eccbe0f812b6f6b88bf649edae768f5dbec1d0 Mon Sep 17 00:00:00 2001 From: kywch Date: Thu, 28 Mar 2024 15:30:06 -0700 Subject: [PATCH] switched to pyproject.toml, applied ruff --- .github/workflows/workflow.yml | 24 + .pre-commit-config.yaml | 9 + agent_zoo/neurips23_start_kit/__init__.py | 2 +- .../neurips23_start_kit/baseline_policy.py | 455 +++++++++--------- .../neurips23_start_kit/reward_wrapper.py | 23 +- analysis/proc_eval_result.py | 191 ++++---- analysis/proc_task_cond_result.py | 91 ++-- analysis/run_task_conditioning.py | 55 ++- curriculum_generation/curriculum_tutorial.py | 7 + curriculum_generation/manual_curriculum.py | 383 +++++++-------- curriculum_generation/task_encoder.py | 69 ++- curriculum_generation/task_sampler.py | 38 +- evaluate.py | 182 ++++--- neurips23_evaluation/export_embeddings.py | 32 +- .../heldout_evaluation_task.py | 44 +- .../sample_evaluation_task.py | 16 +- pyproject.toml | 121 +++++ reinforcement_learning/clean_pufferl.py | 298 ++++++------ reinforcement_learning/environment.py | 32 +- reinforcement_learning/stat_wrapper.py | 167 +++---- requirements.txt | 9 - tests/test_task_encoder.py | 84 ++-- train.py | 135 +++--- train_helper.py | 146 +++--- 24 files changed, 1460 insertions(+), 1153 deletions(-) create mode 100644 .github/workflows/workflow.yml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml delete mode 100644 requirements.txt diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml new file mode 100644 index 00000000..8601682d --- /dev/null +++ b/.github/workflows/workflow.yml @@ -0,0 +1,24 @@ +name: tox +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + py: ["3.8", "3.9", "3.10"] + steps: + - name: Setup python for test ${{ matrix.py }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.py }} + - uses: actions/checkout@v3 + - name: Upgrade pip + run: python -m pip install -U pip setuptools wheel cython + - name: Install + run: python -m pip install -e '.[dev]' + - name: Check formatting + run: ruff format . + - name: Check lint + run: ruff check . \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..8f03df78 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.3.2 + hooks: + # Run the linter. + - id: ruff + # Run the formatter. + - id: ruff-format \ No newline at end of file diff --git a/agent_zoo/neurips23_start_kit/__init__.py b/agent_zoo/neurips23_start_kit/__init__.py index 6bce0e48..2f7ed092 100644 --- a/agent_zoo/neurips23_start_kit/__init__.py +++ b/agent_zoo/neurips23_start_kit/__init__.py @@ -1,3 +1,3 @@ from .baseline_policy import Baseline as Policy from .baseline_policy import Recurrent -from .reward_wrapper import RewardWrapper \ No newline at end of file +from .reward_wrapper import RewardWrapper diff --git a/agent_zoo/neurips23_start_kit/baseline_policy.py b/agent_zoo/neurips23_start_kit/baseline_policy.py index 933f9116..8525250f 100644 --- a/agent_zoo/neurips23_start_kit/baseline_policy.py +++ b/agent_zoo/neurips23_start_kit/baseline_policy.py @@ -11,264 +11,271 @@ EntityId = EntityState.State.attr_name_to_col["id"] # NOTE: a workaround for the torch.complier problem. 
TODO: try torch 2.2 -#unpack_batched_obs = torch.compiler.disable(unpack_batched_obs) +# unpack_batched_obs = torch.compiler.disable(unpack_batched_obs) class Recurrent(pufferlib.models.RecurrentWrapper): def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1): super().__init__(env, policy, input_size, hidden_size, num_layers) + class Baseline(pufferlib.models.Policy): - '''Improved baseline policy by JimyhZhu''' - def __init__(self, env, input_size=256, hidden_size=256, task_size=2048): - super().__init__(env) - - self.unflatten_context = env.unflatten_context - - self.tile_encoder = TileEncoder(input_size) - self.player_encoder = PlayerEncoder(input_size, hidden_size) - self.item_encoder = ItemEncoder(input_size, hidden_size) - self.inventory_encoder = InventoryEncoder(input_size, hidden_size) - self.market_encoder = MarketEncoder(input_size, hidden_size) - self.task_encoder = TaskEncoder(input_size, hidden_size, task_size) - self.proj_fc = torch.nn.Linear(6 * input_size, input_size) - self.action_decoder = ActionDecoder(input_size, hidden_size) - self.value_head = torch.nn.Linear(hidden_size, 1) - - def encode_observations(self, flat_observations): - env_outputs = unpack_batched_obs(flat_observations, self.unflatten_context) - tile = self.tile_encoder(env_outputs["Tile"]) - player_embeddings, my_agent = self.player_encoder( - env_outputs["Entity"], env_outputs["AgentId"][:, 0] - ) - - item_embeddings = self.item_encoder(env_outputs["Inventory"]) - market_embeddings = self.item_encoder(env_outputs["Market"]) # no_pooling - market = self.market_encoder(market_embeddings) # fc +mean pooling already applied - task = self.task_encoder(env_outputs["Task"]) - pooled_item_embeddings = item_embeddings.mean(dim=1) - pooled_player_embeddings = player_embeddings.mean(dim=1) - obs = torch.cat([tile, my_agent, pooled_player_embeddings, pooled_item_embeddings, market, task], dim=-1) - obs = self.proj_fc(obs) - - # Pad the embeddings to make them the same size to the action_decoder - # This is a workaround for the fact that the action_decoder expects the same number of actions including no-op - embeddings = [player_embeddings, item_embeddings, market_embeddings] - padded_embeddings = [] - for embedding in embeddings: - padding_size = 1 # The size of padding to be added - padding = torch.zeros(embedding.size(0), padding_size, embedding.size(2), device=embedding.device) - padded_embedding = torch.cat([embedding, padding], dim=1) - padded_embeddings.append(padded_embedding) - # Replace the original embeddings with the padded versions - player_embeddings, item_embeddings, market_embeddings = padded_embeddings - - return obs, ( - player_embeddings, - item_embeddings, - market_embeddings, - env_outputs["ActionTargets"], - ) - - def decode_actions(self, hidden, lookup): - actions = self.action_decoder(hidden, lookup) - value = self.value_head(hidden) - return actions, value + """Improved baseline policy by JimyhZhu""" + + def __init__(self, env, input_size=256, hidden_size=256, task_size=2048): + super().__init__(env) + + self.unflatten_context = env.unflatten_context + + self.tile_encoder = TileEncoder(input_size) + self.player_encoder = PlayerEncoder(input_size, hidden_size) + self.item_encoder = ItemEncoder(input_size, hidden_size) + self.inventory_encoder = InventoryEncoder(input_size, hidden_size) + self.market_encoder = MarketEncoder(input_size, hidden_size) + self.task_encoder = TaskEncoder(input_size, hidden_size, task_size) + self.proj_fc = torch.nn.Linear(6 * input_size, 
input_size) + self.action_decoder = ActionDecoder(input_size, hidden_size) + self.value_head = torch.nn.Linear(hidden_size, 1) + + def encode_observations(self, flat_observations): + env_outputs = unpack_batched_obs(flat_observations, self.unflatten_context) + tile = self.tile_encoder(env_outputs["Tile"]) + player_embeddings, my_agent = self.player_encoder( + env_outputs["Entity"], env_outputs["AgentId"][:, 0] + ) + + item_embeddings = self.item_encoder(env_outputs["Inventory"]) + market_embeddings = self.item_encoder(env_outputs["Market"]) # no_pooling + market = self.market_encoder(market_embeddings) # fc +mean pooling already applied + task = self.task_encoder(env_outputs["Task"]) + pooled_item_embeddings = item_embeddings.mean(dim=1) + pooled_player_embeddings = player_embeddings.mean(dim=1) + obs = torch.cat( + [tile, my_agent, pooled_player_embeddings, pooled_item_embeddings, market, task], dim=-1 + ) + obs = self.proj_fc(obs) + + # Pad the embeddings to make them the same size to the action_decoder + # This is a workaround for the fact that the action_decoder expects the same number of actions including no-op + embeddings = [player_embeddings, item_embeddings, market_embeddings] + padded_embeddings = [] + for embedding in embeddings: + padding_size = 1 # The size of padding to be added + padding = torch.zeros( + embedding.size(0), padding_size, embedding.size(2), device=embedding.device + ) + padded_embedding = torch.cat([embedding, padding], dim=1) + padded_embeddings.append(padded_embedding) + # Replace the original embeddings with the padded versions + player_embeddings, item_embeddings, market_embeddings = padded_embeddings + + return obs, ( + player_embeddings, + item_embeddings, + market_embeddings, + env_outputs["ActionTargets"], + ) + + def decode_actions(self, hidden, lookup): + actions = self.action_decoder(hidden, lookup) + value = self.value_head(hidden) + return actions, value class TileEncoder(torch.nn.Module): - def __init__(self, input_size): - super().__init__() - self.tile_offset = torch.tensor([i * 256 for i in range(3)]) - self.embedding = torch.nn.Embedding(3 * 256, 32) + def __init__(self, input_size): + super().__init__() + self.tile_offset = torch.tensor([i * 256 for i in range(3)]) + self.embedding = torch.nn.Embedding(3 * 256, 32) - self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) - self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) - self.tile_fc = torch.nn.Linear(8 * 11 * 11, input_size) + self.tile_conv_1 = torch.nn.Conv2d(96, 32, 3) + self.tile_conv_2 = torch.nn.Conv2d(32, 8, 3) + self.tile_fc = torch.nn.Linear(8 * 11 * 11, input_size) - def forward(self, tile): - tile[:, :, :2] -= tile[:, 112:113, :2].clone() - tile[:, :, :2] += 7 - tile = self.embedding( - tile.long().clip(0, 255) + self.tile_offset.to(tile.device) - ) + def forward(self, tile): + tile[:, :, :2] -= tile[:, 112:113, :2].clone() + tile[:, :, :2] += 7 + tile = self.embedding(tile.long().clip(0, 255) + self.tile_offset.to(tile.device)) - agents, tiles, features, embed = tile.shape - tile = ( - tile.view(agents, tiles, features * embed) - .transpose(1, 2) - .view(agents, features * embed, 15, 15) - ) + agents, tiles, features, embed = tile.shape + tile = ( + tile.view(agents, tiles, features * embed) + .transpose(1, 2) + .view(agents, features * embed, 15, 15) + ) - tile = F.relu(self.tile_conv_1(tile)) - tile = F.relu(self.tile_conv_2(tile)) - tile = tile.contiguous().view(agents, -1) - tile = F.relu(self.tile_fc(tile)) + tile = F.relu(self.tile_conv_1(tile)) + tile = 
F.relu(self.tile_conv_2(tile)) + tile = tile.contiguous().view(agents, -1) + tile = F.relu(self.tile_fc(tile)) - return tile + return tile class PlayerEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.entity_dim = 31 - self.num_classes_npc_type = 5 # only using npc_type for one hot (0,4) - self.agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, hidden_size) - self.my_agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, input_size) - - def forward(self, agents, my_id): - npc_type = agents[:, :, 1] - one_hot_npc_type = F.one_hot(npc_type.long(), - num_classes=self.num_classes_npc_type).float() # Subtract 1 if npc_type starts from 1 - one_hot_agents = torch.cat([agents[:, :, :1], one_hot_npc_type, agents[:, :, 2:]], dim=-1).float() - - agent_ids = one_hot_agents[:, :, EntityId] - mask = (agent_ids == my_id.unsqueeze(1)) & (agent_ids != 0) - mask = mask.int() - row_indices = torch.where( - mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1)) - ) - - # batch, agent, attrs, embed = agent_embeddings.shape - my_agent_embeddings = one_hot_agents[ - torch.arange(one_hot_agents.shape[0]), row_indices - ] - agent_embeddings = self.agent_fc(one_hot_agents.cuda()) - my_agent_embeddings = self.my_agent_fc(my_agent_embeddings) - my_agent_embeddings = F.relu(my_agent_embeddings) - - return agent_embeddings, my_agent_embeddings + def __init__(self, input_size, hidden_size): + super().__init__() + self.entity_dim = 31 + self.num_classes_npc_type = 5 # only using npc_type for one hot (0,4) + self.agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, hidden_size) + self.my_agent_fc = torch.nn.Linear(self.entity_dim + 5 - 1, input_size) + + def forward(self, agents, my_id): + npc_type = agents[:, :, 1] + one_hot_npc_type = F.one_hot( + npc_type.long(), num_classes=self.num_classes_npc_type + ).float() # Subtract 1 if npc_type starts from 1 + one_hot_agents = torch.cat( + [agents[:, :, :1], one_hot_npc_type, agents[:, :, 2:]], dim=-1 + ).float() + + agent_ids = one_hot_agents[:, :, EntityId] + mask = (agent_ids == my_id.unsqueeze(1)) & (agent_ids != 0) + mask = mask.int() + row_indices = torch.where( + mask.any(dim=1), mask.argmax(dim=1), torch.zeros_like(mask.sum(dim=1)) + ) + + # batch, agent, attrs, embed = agent_embeddings.shape + my_agent_embeddings = one_hot_agents[torch.arange(one_hot_agents.shape[0]), row_indices] + agent_embeddings = self.agent_fc(one_hot_agents.cuda()) + my_agent_embeddings = self.my_agent_fc(my_agent_embeddings) + my_agent_embeddings = F.relu(my_agent_embeddings) + + return agent_embeddings, my_agent_embeddings class ItemEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(18 + 2 + 14 - 2, hidden_size) - self.discrete_idxs = [1, 14] - self.discrete_offset = torch.Tensor([2, 0]) - self.continuous_idxs = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] - self.continuous_scale = torch.Tensor([1/100] * 12) - - def forward(self, items): - if self.discrete_offset.device != items.device: - self.discrete_offset = self.discrete_offset.to(items.device) - self.continuous_scale = self.continuous_scale.to(items.device) - - one_hot_discrete_equipped = F.one_hot(items[:, :, 14].long(), num_classes=2).float() - one_hot_discrete_type_id = F.one_hot(items[:, :, 1].long(), num_classes=18).float() - one_hot_discrete = torch.concat([one_hot_discrete_type_id, one_hot_discrete_equipped], dim=-1) # - continuous = items[:, :, self.continuous_idxs] * self.continuous_scale - 
item_embeddings = torch.cat([one_hot_discrete, continuous], dim=-1).float() - item_embeddings = self.fc(item_embeddings) - return item_embeddings + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(18 + 2 + 14 - 2, hidden_size) + self.discrete_idxs = [1, 14] + self.discrete_offset = torch.Tensor([2, 0]) + self.continuous_idxs = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15] + self.continuous_scale = torch.Tensor([1 / 100] * 12) + + def forward(self, items): + if self.discrete_offset.device != items.device: + self.discrete_offset = self.discrete_offset.to(items.device) + self.continuous_scale = self.continuous_scale.to(items.device) + + one_hot_discrete_equipped = F.one_hot(items[:, :, 14].long(), num_classes=2).float() + one_hot_discrete_type_id = F.one_hot(items[:, :, 1].long(), num_classes=18).float() + one_hot_discrete = torch.concat( + [one_hot_discrete_type_id, one_hot_discrete_equipped], dim=-1 + ) # + continuous = items[:, :, self.continuous_idxs] * self.continuous_scale + item_embeddings = torch.cat([one_hot_discrete, continuous], dim=-1).float() + item_embeddings = self.fc(item_embeddings) + return item_embeddings class InventoryEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(12 * hidden_size, input_size) + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(12 * hidden_size, input_size) - def forward(self, inventory): - agents, items, hidden = inventory.shape - inventory = inventory.view(agents, items * hidden) - return self.fc(inventory) + def forward(self, inventory): + agents, items, hidden = inventory.shape + inventory = inventory.view(agents, items * hidden) + return self.fc(inventory) class MarketEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.fc = torch.nn.Linear(hidden_size, input_size) + def __init__(self, input_size, hidden_size): + super().__init__() + self.fc = torch.nn.Linear(hidden_size, input_size) - def forward(self, market): - return self.fc(market).mean(-2) + def forward(self, market): + return self.fc(market).mean(-2) class TaskEncoder(torch.nn.Module): - def __init__(self, input_size, hidden_size, task_size): - super().__init__() - self.fc = torch.nn.Linear(task_size, input_size) + def __init__(self, input_size, hidden_size, task_size): + super().__init__() + self.fc = torch.nn.Linear(task_size, input_size) - def forward(self, task): - return self.fc(task.clone().float()) + def forward(self, task): + return self.fc(task.clone().float()) class ActionDecoder(torch.nn.Module): - def __init__(self, input_size, hidden_size): - super().__init__() - self.layers = torch.nn.ModuleDict( - { - "attack_style": torch.nn.Linear(hidden_size, 3), - "attack_target": torch.nn.Linear(hidden_size, hidden_size), - "market_buy": torch.nn.Linear(hidden_size, hidden_size), - "inventory_destroy": torch.nn.Linear(hidden_size, hidden_size), - "inventory_give_item": torch.nn.Linear(hidden_size, hidden_size), - "inventory_give_player": torch.nn.Linear(hidden_size, hidden_size), - "gold_quantity": torch.nn.Linear(hidden_size, 99), - "gold_target": torch.nn.Linear(hidden_size, hidden_size), - "move": torch.nn.Linear(hidden_size, 5), - "inventory_sell": torch.nn.Linear(hidden_size, hidden_size), - "inventory_price": torch.nn.Linear(hidden_size, 99), - "inventory_use": torch.nn.Linear(hidden_size, hidden_size), + def __init__(self, input_size, hidden_size): + super().__init__() + 
self.layers = torch.nn.ModuleDict( + { + "attack_style": torch.nn.Linear(hidden_size, 3), + "attack_target": torch.nn.Linear(hidden_size, hidden_size), + "market_buy": torch.nn.Linear(hidden_size, hidden_size), + "inventory_destroy": torch.nn.Linear(hidden_size, hidden_size), + "inventory_give_item": torch.nn.Linear(hidden_size, hidden_size), + "inventory_give_player": torch.nn.Linear(hidden_size, hidden_size), + "gold_quantity": torch.nn.Linear(hidden_size, 99), + "gold_target": torch.nn.Linear(hidden_size, hidden_size), + "move": torch.nn.Linear(hidden_size, 5), + "inventory_sell": torch.nn.Linear(hidden_size, hidden_size), + "inventory_price": torch.nn.Linear(hidden_size, 99), + "inventory_use": torch.nn.Linear(hidden_size, hidden_size), + } + ) + + def apply_layer(self, layer, embeddings, mask, hidden): + hidden = layer(hidden) + if hidden.dim() == 2 and embeddings is not None: + hidden = torch.matmul(embeddings, hidden.unsqueeze(-1)).squeeze(-1) + + if mask is not None: + hidden = hidden.masked_fill(mask == 0, -1e9) + + return hidden + + def forward(self, hidden, lookup): + ( + player_embeddings, + inventory_embeddings, + market_embeddings, + action_targets, + ) = lookup + + embeddings = { + "attack_target": player_embeddings, + "market_buy": market_embeddings, + "inventory_destroy": inventory_embeddings, + "inventory_give_item": inventory_embeddings, + "inventory_give_player": player_embeddings, + "gold_target": player_embeddings, + "inventory_sell": inventory_embeddings, + "inventory_use": inventory_embeddings, + } + + action_targets = { + "attack_style": action_targets["Attack"]["Style"], + "attack_target": action_targets["Attack"]["Target"], + "market_buy": action_targets["Buy"]["MarketItem"], + "inventory_destroy": action_targets["Destroy"]["InventoryItem"], + "inventory_give_item": action_targets["Give"]["InventoryItem"], + "inventory_give_player": action_targets["Give"]["Target"], + "gold_quantity": action_targets["GiveGold"]["Price"], + "gold_target": action_targets["GiveGold"]["Target"], + "move": action_targets["Move"]["Direction"], + "inventory_sell": action_targets["Sell"]["InventoryItem"], + "inventory_price": action_targets["Sell"]["Price"], + "inventory_use": action_targets["Use"]["InventoryItem"], } - ) - - def apply_layer(self, layer, embeddings, mask, hidden): - hidden = layer(hidden) - if hidden.dim() == 2 and embeddings is not None: - hidden = torch.matmul(embeddings, hidden.unsqueeze(-1)).squeeze(-1) - - if mask is not None: - hidden = hidden.masked_fill(mask == 0, -1e9) - - return hidden - - def forward(self, hidden, lookup): - ( - player_embeddings, - inventory_embeddings, - market_embeddings, - action_targets, - ) = lookup - - embeddings = { - "attack_target": player_embeddings, - "market_buy": market_embeddings, - "inventory_destroy": inventory_embeddings, - "inventory_give_item": inventory_embeddings, - "inventory_give_player": player_embeddings, - "gold_target": player_embeddings, - "inventory_sell": inventory_embeddings, - "inventory_use": inventory_embeddings, - } - - action_targets = { - "attack_style": action_targets["Attack"]["Style"], - "attack_target": action_targets["Attack"]["Target"], - "market_buy": action_targets["Buy"]["MarketItem"], - "inventory_destroy": action_targets["Destroy"]["InventoryItem"], - "inventory_give_item": action_targets["Give"]["InventoryItem"], - "inventory_give_player": action_targets["Give"]["Target"], - "gold_quantity": action_targets["GiveGold"]["Price"], - "gold_target": action_targets["GiveGold"]["Target"], - "move": 
action_targets["Move"]["Direction"], - "inventory_sell": action_targets["Sell"]["InventoryItem"], - "inventory_price": action_targets["Sell"]["Price"], - "inventory_use": action_targets["Use"]["InventoryItem"], - } - - actions = [] - for key, layer in self.layers.items(): - mask = None - mask = action_targets[key] - embs = embeddings.get(key) - - # NOTE: SHOULD not hit this - # if embs is not None and embs.shape[1] != mask.shape[1]: - # b, _, f = embs.shape - # zeros = torch.zeros([b, 1, f], dtype=embs.dtype, device=embs.device) - # embs = torch.cat([embs, zeros], dim=1) - - action = self.apply_layer(layer, embs, mask, hidden) - actions.append(action) - - return actions + + actions = [] + for key, layer in self.layers.items(): + mask = None + mask = action_targets[key] + embs = embeddings.get(key) + + # NOTE: SHOULD not hit this + # if embs is not None and embs.shape[1] != mask.shape[1]: + # b, _, f = embs.shape + # zeros = torch.zeros([b, 1, f], dtype=embs.dtype, device=embs.device) + # embs = torch.cat([embs, zeros], dim=1) + + action = self.apply_layer(layer, embs, mask, hidden) + actions.append(action) + + return actions diff --git a/agent_zoo/neurips23_start_kit/reward_wrapper.py b/agent_zoo/neurips23_start_kit/reward_wrapper.py index 7638ed50..b907740a 100644 --- a/agent_zoo/neurips23_start_kit/reward_wrapper.py +++ b/agent_zoo/neurips23_start_kit/reward_wrapper.py @@ -9,7 +9,6 @@ def __init__( eval_mode=False, early_stop_agent_num=0, stat_prefix=None, - # Custom reward wrapper args heal_bonus_weight=0, explore_bonus_weight=0, @@ -22,15 +21,15 @@ def __init__( self.clip_unique_event = clip_unique_event def reset(self, **kwargs): - '''Called at the start of each episode''' + """Called at the start of each episode""" self._reset_reward_vars() return super().reset(**kwargs) def _reset_reward_vars(self): self._history = { agent_id: { - 'prev_price': 0, - 'prev_moves': [], + "prev_price": 0, + "prev_moves": [], } for agent_id in self.env.possible_agents } @@ -43,20 +42,20 @@ def observation_space(self): """ def observation(self, agent_id, agent_obs): - '''Called before observations are returned from the environment + """Called before observations are returned from the environment Use this to define custom featurizers. Changing the space itself requires you to define the observation space again (i.e. 
Gym.spaces.Dict(gym.spaces....)) - ''' + """ # Mask the price of the previous action, to encourage agents to explore new prices - agent_obs['ActionTargets']['Sell']['Price'][self._history[agent_id]['prev_price']] = 0 + agent_obs["ActionTargets"]["Sell"]["Price"][self._history[agent_id]["prev_price"]] = 0 return agent_obs def action(self, agent_id, agent_atn): - '''Called before actions are passed from the model to the environment''' + """Called before actions are passed from the model to the environment""" # Keep track of the previous price and moves for each agent - self._history[agent_id]['prev_price'] = agent_atn['Sell']['Price'] - self._history[agent_id]['prev_moves'].append(agent_atn['Move']['Direction']) + self._history[agent_id]["prev_price"] = agent_atn["Sell"]["Price"] + self._history[agent_id]["prev_moves"].append(agent_atn["Move"]["Direction"]) return agent_atn def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncated, info): @@ -72,8 +71,8 @@ def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncat # The number of unique events are available in self._unique_events[agent_id] uniq = self._unique_events[agent_id] explore_bonus = 0 - if self.explore_bonus_weight > 0 and uniq['curr_count'] > uniq['prev_count']: - explore_bonus = min(self.clip_unique_event, uniq['curr_count'] - uniq['prev_count']) + if self.explore_bonus_weight > 0 and uniq["curr_count"] > uniq["prev_count"]: + explore_bonus = min(self.clip_unique_event, uniq["curr_count"] - uniq["prev_count"]) explore_bonus *= self.explore_bonus_weight reward += healing_bonus + explore_bonus diff --git a/analysis/proc_eval_result.py b/analysis/proc_eval_result.py index 815a2701..2531014b 100644 --- a/analysis/proc_eval_result.py +++ b/analysis/proc_eval_result.py @@ -8,85 +8,87 @@ # Make the table output simpler pl.Config.set_tbl_hide_dataframe_shape(True) -pl.Config.set_tbl_formatting('NOTHING') +pl.Config.set_tbl_formatting("NOTHING") pl.Config.set_tbl_hide_column_data_types(True) # string matching for task names WEIGHT_DICT = { - 'TickGE': ('survival', 100 / 6), # 1 survival task - 'PLAYER_KILL': ('combat', 100 / (6*3)), # 3 combat tasks - 'DefeatEntity': ('combat', 100 / (6*3)), - 'GO_FARTHEST': ('exploration', 100 / (6*2)), # 2 exploration tasks - 'OccupyTile': ('exploration', 100 / (6*2)), - 'AttainSkill': ('skill', 100 / (6*8)), # 8 skill tasks - 'HarvestItem': ('item', 100 / (6*44)), # 44 item tasks - 'ConsumeItem': ('item', 100 / (6*44)), - 'EquipItem': ('item', 100 / (6*44)), - 'FullyArmed': ('item', 100 / (6*44)), - 'EARN_GOLD': ('market', 100 / (6*5)), # 5 market tasks - 'BUY_ITEM': ('market', 100 / (6*5)), - 'EarnGold': ('market', 100 / (6*5)), - 'HoardGold': ('market', 100 / (6*5)), - 'MakeProfit': ('market', 100 / (6*5)), + "TickGE": ("survival", 100 / 6), # 1 survival task + "PLAYER_KILL": ("combat", 100 / (6 * 3)), # 3 combat tasks + "DefeatEntity": ("combat", 100 / (6 * 3)), + "GO_FARTHEST": ("exploration", 100 / (6 * 2)), # 2 exploration tasks + "OccupyTile": ("exploration", 100 / (6 * 2)), + "AttainSkill": ("skill", 100 / (6 * 8)), # 8 skill tasks + "HarvestItem": ("item", 100 / (6 * 44)), # 44 item tasks + "ConsumeItem": ("item", 100 / (6 * 44)), + "EquipItem": ("item", 100 / (6 * 44)), + "FullyArmed": ("item", 100 / (6 * 44)), + "EARN_GOLD": ("market", 100 / (6 * 5)), # 5 market tasks + "BUY_ITEM": ("market", 100 / (6 * 5)), + "EarnGold": ("market", 100 / (6 * 5)), + "HoardGold": ("market", 100 / (6 * 5)), + "MakeProfit": ("market", 100 / (6 * 5)), } + def 
get_task_weight(task_name): for key, val in WEIGHT_DICT.items(): if key in task_name: return val - logging.warning(f'Task name {task_name} not found in weight dict') - return 'etc', 0 + logging.warning(f"Task name {task_name} not found in weight dict") + return "etc", 0 + def get_summary_dict(vals, key): - progress = vals if key == 'length' else [v[0] for v in vals] - summ = { - 'count': len(progress), - 'mean': np.mean(progress), - 'median': np.median(progress) - } - - if key == 'length': + progress = vals if key == "length" else [v[0] for v in vals] + summ = {"count": len(progress), "mean": np.mean(progress), "median": np.median(progress)} + + if key == "length": progress = np.array(progress) / 1023 # full episode length - summ['completed'] = np.mean([1 if v >= 1 else 0 for v in progress]) - summ['over30pcnt'] = np.mean([1 if v >= 0.3 else 0 for v in progress]) + summ["completed"] = np.mean([1 if v >= 1 else 0 for v in progress]) + summ["over30pcnt"] = np.mean([1 if v >= 0.3 else 0 for v in progress]) return summ + def summarize_single_eval(data, weighted_score=False): summary = {} # task-level info for key, vals in data.items(): - if key.startswith('curriculum') or key == 'length': + if key.startswith("curriculum") or key == "length": summary[key] = get_summary_dict(vals, key) - if weighted_score and key.startswith('curriculum'): + if weighted_score and key.startswith("curriculum"): category, weight = get_task_weight(key) - summary[key]['category'] = category - summary[key]['weight'] = weight - summary[key]['weighted_score'] = summary[key]['mean'] * weight + summary[key]["category"] = category + summary[key]["weight"] = weight + summary[key]["weighted_score"] = summary[key]["mean"] * weight # meta info - summary['avg_progress'] = np.mean([v['mean'] for k, v in summary.items() - if k.startswith('curriculum')]) + summary["avg_progress"] = np.mean( + [v["mean"] for k, v in summary.items() if k.startswith("curriculum")] + ) if weighted_score: - summary['weighted_score'] = np.sum([v['weighted_score'] for k, v in summary.items() - if k.startswith('curriculum')]) + summary["weighted_score"] = np.sum( + [v["weighted_score"] for k, v in summary.items() if k.startswith("curriculum")] + ) return summary + def process_eval_files(policy_store_dir, eval_prefix): summ_policy = [] summ_task = [] for file in os.listdir(policy_store_dir): # NOTE: assumes the file naming convention is 'eval__.json' - if not file.startswith(eval_prefix) or not file.endswith('.json'): + if not file.startswith(eval_prefix) or not file.endswith(".json"): continue - mode = file.split('_')[1] - random_seed = file.split('_')[2].replace('.json', '') + mode = file.split("_")[1] + random_seed = file.split("_")[2].replace(".json", "") - with open(os.path.join(policy_store_dir, file), 'r') as f: + with open(os.path.join(policy_store_dir, file), "r") as f: data = json.load(f) for pol_name, pol_data in data.items(): @@ -94,59 +96,82 @@ def process_eval_files(policy_store_dir, eval_prefix): continue summary = summarize_single_eval(pol_data, weighted_score=True) - summ_policy.append({ - 'policy_name': pol_name, - 'mode': mode, - 'seed': random_seed, - 'count': summary['length']['count'], - 'length': summary['length']['mean'], - 'score': summary['avg_progress'], - 'weighted_score': summary['weighted_score'] - }) + summ_policy.append( + { + "policy_name": pol_name, + "mode": mode, + "seed": random_seed, + "count": summary["length"]["count"], + "length": summary["length"]["mean"], + "score": summary["avg_progress"], + "weighted_score": 
summary["weighted_score"], + } + ) # also gather the results across random seeds for each task, then average for task_name, task_data in summary.items(): - if not task_name.startswith('curriculum'): + if not task_name.startswith("curriculum"): continue - summ_task.append({ - 'category': task_data['category'], - 'task_name': task_name, - 'weight': task_data['weight'], - 'policy_name': pol_name, - 'mode': mode, - 'seed': random_seed, - 'count': task_data['count'], - 'score': task_data['mean'] - }) - - summ_df = pl.DataFrame(summ_policy).sort(['policy_name', 'mode', 'seed']) - summ_grp = summ_df.group_by(['policy_name', 'mode']).agg( - pl.col('score').mean(), - pl.col('weighted_score').mean(), + summ_task.append( + { + "category": task_data["category"], + "task_name": task_name, + "weight": task_data["weight"], + "policy_name": pol_name, + "mode": mode, + "seed": random_seed, + "count": task_data["count"], + "score": task_data["mean"], + } + ) + + summ_df = pl.DataFrame(summ_policy).sort(["policy_name", "mode", "seed"]) + summ_grp = summ_df.group_by(["policy_name", "mode"]).agg( + pl.col("score").mean(), + pl.col("weighted_score").mean(), ) - summ_grp = summ_grp.sort('weighted_score', descending=True) - summ_grp.write_csv(os.path.join(policy_store_dir, 'score_summary.tsv'), separator="\t", float_precision=6) - print('\nPolicy score summary, sorted by weighted_score:') + summ_grp = summ_grp.sort("weighted_score", descending=True) + summ_grp.write_csv( + os.path.join(policy_store_dir, "score_summary.tsv"), separator="\t", float_precision=6 + ) + print("\nPolicy score summary, sorted by weighted_score:") print(summ_grp) - task_df = pl.DataFrame(summ_task).sort(['mode', 'category', 'task_name', 'policy_name', 'seed']) - task_grp = task_df.group_by(['mode', 'category', 'task_name', 'policy_name']).agg(pl.col('score').mean()) - task_grp = task_grp.sort(['mode', 'category', 'task_name', 'policy_name']) - task_grp.write_csv(os.path.join(policy_store_dir, 'score_task_summary.tsv'), separator="\t", float_precision=6) - cate_grp = task_df.group_by(['mode', 'category', 'policy_name']).agg(pl.col('score').mean()) - cate_grp = cate_grp.sort(['mode', 'category', 'policy_name']) - cate_grp.write_csv(os.path.join(policy_store_dir, 'score_category_summary.tsv'), separator="\t", float_precision=6) + task_df = pl.DataFrame(summ_task).sort(["mode", "category", "task_name", "policy_name", "seed"]) + task_grp = task_df.group_by(["mode", "category", "task_name", "policy_name"]).agg( + pl.col("score").mean() + ) + task_grp = task_grp.sort(["mode", "category", "task_name", "policy_name"]) + task_grp.write_csv( + os.path.join(policy_store_dir, "score_task_summary.tsv"), separator="\t", float_precision=6 + ) + cate_grp = task_df.group_by(["mode", "category", "policy_name"]).agg(pl.col("score").mean()) + cate_grp = cate_grp.sort(["mode", "category", "policy_name"]) + cate_grp.write_csv( + os.path.join(policy_store_dir, "score_category_summary.tsv"), + separator="\t", + float_precision=6, + ) - if len(summ_df['seed'].unique()) > 1: - summ_df.write_csv(os.path.join(policy_store_dir, 'score_by_seed.tsv'), separator="\t", float_precision=6) - task_df.write_csv(os.path.join(policy_store_dir, 'score_by_task_seed.tsv'), separator="\t", float_precision=6) + if len(summ_df["seed"].unique()) > 1: + summ_df.write_csv( + os.path.join(policy_store_dir, "score_by_seed.tsv"), separator="\t", float_precision=6 + ) + task_df.write_csv( + os.path.join(policy_store_dir, "score_by_task_seed.tsv"), + separator="\t", + float_precision=6, + ) 
return summ_df, summ_grp, task_df, task_grp, cate_grp -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process the evaluation result files') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') - parser.add_argument('-p', '--prefix', type=str, default='eval_', help='Prefix of the evaluation result files') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process the evaluation result files") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") + parser.add_argument( + "-p", "--prefix", type=str, default="eval_", help="Prefix of the evaluation result files" + ) args = parser.parse_args() process_eval_files(args.policy_store_dir, args.prefix) diff --git a/analysis/proc_task_cond_result.py b/analysis/proc_task_cond_result.py index 53b10468..651ceba1 100644 --- a/analysis/proc_task_cond_result.py +++ b/analysis/proc_task_cond_result.py @@ -11,53 +11,55 @@ from nmmo.systems.item import ALL_ITEM from nmmo.systems.skill import COMBAT_SKILL, HARVEST_SKILL -CODE_TO_EVENT = { - v: k for k, v in EventCode.__dict__.items() if not k.startswith('_') -} +CODE_TO_EVENT = {v: k for k, v in EventCode.__dict__.items() if not k.startswith("_")} -ITEM_ID_TO_NAME = { - item.ITEM_TYPE_ID: item.__name__ for item in ALL_ITEM -} +ITEM_ID_TO_NAME = {item.ITEM_TYPE_ID: item.__name__ for item in ALL_ITEM} + +SKILL_ID_TO_NAME = {skill.SKILL_ID: skill.__name__ for skill in COMBAT_SKILL + HARVEST_SKILL} -SKILL_ID_TO_NAME = { - skill.SKILL_ID: skill.__name__ for skill in COMBAT_SKILL + HARVEST_SKILL -} # event tuple key to string def event_key_to_str(event_key): if event_key[0] == EventCode.LEVEL_UP: - return f'LEVEL_{SKILL_ID_TO_NAME[event_key[1]]}' + return f"LEVEL_{SKILL_ID_TO_NAME[event_key[1]]}" elif event_key[0] == EventCode.SCORE_HIT: - return f'ATTACK_NUM_{SKILL_ID_TO_NAME[event_key[1]]}' + return f"ATTACK_NUM_{SKILL_ID_TO_NAME[event_key[1]]}" - elif event_key[0] in [EventCode.HARVEST_ITEM, EventCode.CONSUME_ITEM, EventCode.EQUIP_ITEM, - EventCode.LIST_ITEM, EventCode.BUY_ITEM]: - return f'{CODE_TO_EVENT[event_key[0]]}_{ITEM_ID_TO_NAME[event_key[1]]}' + elif event_key[0] in [ + EventCode.HARVEST_ITEM, + EventCode.CONSUME_ITEM, + EventCode.EQUIP_ITEM, + EventCode.LIST_ITEM, + EventCode.BUY_ITEM, + ]: + return f"{CODE_TO_EVENT[event_key[0]]}_{ITEM_ID_TO_NAME[event_key[1]]}" elif event_key[0] == EventCode.GO_FARTHEST: - return '3_PROGRESS_TO_CENTER' + return "3_PROGRESS_TO_CENTER" elif event_key[0] == EventCode.AGENT_CULLED: - return '2_AGENT_LIFESPAN' + return "2_AGENT_LIFESPAN" else: return CODE_TO_EVENT[event_key[0]] + def extract_task_name(task_str): - name = task_str.split('Task_eval_fn:(')[1].split(')_assignee:')[0] + name = task_str.split("Task_eval_fn:(")[1].split(")_assignee:")[0] # then take out (agent_id,) - return name.split('_(')[0] + '_' + name.split(')_')[1] + return name.split("_(")[0] + "_" + name.split(")_")[1] + def gather_agent_events_by_task(data_dir): data_by_task = defaultdict(list) - file_list = [f for f in os.listdir(data_dir) if f.endswith('.metadata.pkl')] + file_list = [f for f in os.listdir(data_dir) if f.endswith(".metadata.pkl")] for file_name in tqdm(file_list): - data = dill.load(open(f'{data_dir}/{file_name}', 'rb')) - final_tick = data['tick'] + data = dill.load(open(f"{data_dir}/{file_name}", "rb")) + final_tick = data["tick"] - for agent_id, vals in data['event_stats'].items(): - task_name = extract_task_name(data['task'][agent_id]) + for agent_id, vals in 
data["event_stats"].items(): + task_name = extract_task_name(data["task"][agent_id]) # Agent survived until the end if EventCode.AGENT_CULLED not in vals: @@ -66,9 +68,10 @@ def gather_agent_events_by_task(data_dir): return data_by_task + def get_event_stats(task_name, task_data): num_agents = len(task_data) - assert num_agents > 0, 'There should be at least one agent' + assert num_agents > 0, "There should be at least one agent" cnt_attack = 0 cnt_buy = 0 @@ -77,7 +80,7 @@ def get_event_stats(task_name, task_data): cnt_harvest = 0 cnt_list = 0 - results = {'0_NAME': task_name, '1_COUNT': num_agents} + results = {"0_NAME": task_name, "1_COUNT": num_agents} event_data = defaultdict(list) for data in task_data: for event, val in data.items(): @@ -90,11 +93,11 @@ def get_event_stats(task_name, task_data): results[event_key_to_str(event)] = np.mean(vals) # AVG skill level elif event[0] == EventCode.AGENT_CULLED: life_span = np.mean(vals) - results['2_AGENT_LIFESPAN_AVG'] = life_span - results['2_AGENT_LIFESPAN_SD'] = np.std(vals) + results["2_AGENT_LIFESPAN_AVG"] = life_span + results["2_AGENT_LIFESPAN_SD"] = np.std(vals) elif event[0] == EventCode.GO_FARTHEST: - results['3_PROGRESS_TO_CENTER_AVG'] = np.mean(vals) - results['3_PROGRESS_TO_CENTER_SD'] = np.std(vals) + results["3_PROGRESS_TO_CENTER_AVG"] = np.mean(vals) + results["3_PROGRESS_TO_CENTER_SD"] = np.std(vals) else: results[event_key_to_str(event)] = sum(vals) / num_agents @@ -111,31 +114,31 @@ def get_event_stats(task_name, task_data): if event[0] == EventCode.LIST_ITEM: cnt_list += sum(vals) - results['4_NORM_ATTACK'] = cnt_attack / life_span - results['4_NORM_BUY'] = cnt_buy / life_span - results['4_NORM_CONSUME'] = cnt_consume / life_span - results['4_NORM_EQUIP'] = cnt_equip / life_span - results['4_NORM_HARVEST'] = cnt_harvest / life_span - results['4_NORM_LIST'] = cnt_list / life_span + results["4_NORM_ATTACK"] = cnt_attack / life_span + results["4_NORM_BUY"] = cnt_buy / life_span + results["4_NORM_CONSUME"] = cnt_consume / life_span + results["4_NORM_EQUIP"] = cnt_equip / life_span + results["4_NORM_HARVEST"] = cnt_harvest / life_span + results["4_NORM_LIST"] = cnt_list / life_span return results -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process replay data') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process replay data") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") args = parser.parse_args() # Gather the event data by tasks, across multiple replays data_by_task = gather_agent_events_by_task(args.policy_store_dir) task_results = [ - get_event_stats(task_name, task_data) - for task_name, task_data in data_by_task.items() + get_event_stats(task_name, task_data) for task_name, task_data in data_by_task.items() ] - task_df = pl.DataFrame(task_results).fill_null(0).sort('0_NAME') + task_df = pl.DataFrame(task_results).fill_null(0).sort("0_NAME") task_df = task_df.select(sorted(task_df.columns)) - task_df.write_csv('task_conditioning.tsv', separator='\t', float_precision=5) + task_df.write_csv("task_conditioning.tsv", separator="\t", float_precision=5) - print('Result file saved as task_conditioning.tsv') - print('Done.') + print("Result file saved as task_conditioning.tsv") + print("Done.") diff --git a/analysis/run_task_conditioning.py b/analysis/run_task_conditioning.py index fb324b54..6d889719 100644 --- 
a/analysis/run_task_conditioning.py +++ b/analysis/run_task_conditioning.py @@ -8,39 +8,52 @@ from evaluate import make_env_creator, make_agent_creator -CURRICULUM_FILE = 'neurips23_evaluation/heldout_task_with_embedding.pkl' +CURRICULUM_FILE = "neurips23_evaluation/heldout_task_with_embedding.pkl" -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Parse environment argument', add_help=False) - parser.add_argument('eval_model_path', type=str, default=None, help='Path to model to evaluate') - parser.add_argument('-c', '--curriculum', type=str, default=CURRICULUM_FILE, help='Path to curriculum file') - parser.add_argument('-t', '--task-to-assign', type=int, default=None, - help='The index of the task to assign in the curriculum file') - parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of times to repeat the evaluation') + parser = argparse.ArgumentParser(description="Parse environment argument", add_help=False) + parser.add_argument("eval_model_path", type=str, default=None, help="Path to model to evaluate") + parser.add_argument( + "-c", "--curriculum", type=str, default=CURRICULUM_FILE, help="Path to curriculum file" + ) + parser.add_argument( + "-t", + "--task-to-assign", + type=int, + default=None, + help="The index of the task to assign in the curriculum file", + ) + parser.add_argument( + "-r", "--repeat", type=int, default=1, help="Number of times to repeat the evaluation" + ) clean_parser = argparse.ArgumentParser(parents=[parser]) args = parser.parse_known_args()[0].__dict__ # required args when using train.py's helper functions - args['no_track'] = True - args['no_recurrence'] = False - args['vectorization'] = 'serial' - args['debug'] = False + args["no_track"] = True + args["no_recurrence"] = False + args["vectorization"] = "serial" + args["debug"] = False # Generate argparse menu from config - config = load_from_config('neurips23_start_kit') # dummy learner + config = load_from_config("neurips23_start_kit") # dummy learner args = combine_config_args(parser, args, config) - args = update_args(args, mode='replay') + args = update_args(args, mode="replay") agent_creator = make_agent_creator() - env_creator = make_env_creator(args['curriculum'], 'pvp') + env_creator = make_env_creator(args["curriculum"], "pvp") # Generate replay for i in range(args.repeat): - generate_replay(args, env_creator, agent_creator, - stop_when_all_complete_task=False, - seed=random.randint(10000000, 99999999)) - print(f'Generated replay for task {i+1}/{args.repeat}...') - - print('Done!') + generate_replay( + args, + env_creator, + agent_creator, + stop_when_all_complete_task=False, + seed=random.randint(10000000, 99999999), + ) + print(f"Generated replay for task {i+1}/{args.repeat}...") + + print("Done!") diff --git a/curriculum_generation/curriculum_tutorial.py b/curriculum_generation/curriculum_tutorial.py index 40b5e441..cc96dcf0 100644 --- a/curriculum_generation/curriculum_tutorial.py +++ b/curriculum_generation/curriculum_tutorial.py @@ -41,6 +41,7 @@ ############################################################################## # Create training tasks using custom evaluation functions + def PracticeEating(gs, subject): """The progress, the max of which is 1, should * increase small for each eating @@ -55,12 +56,15 @@ def PracticeEating(gs, subject): progress += 0.3 return norm(progress) # norm is a helper function to normalize the value to [0, 1] + 
curriculum.append(TaskSpec(eval_fn=PracticeEating, eval_fn_kwargs={})) + # You can also use pre-built eval functions to define your own eval functions def PracticeInventoryManagement(gs, subject, space, num_tick): return norm(InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)) + for space in [2, 4, 8]: curriculum.append( TaskSpec( @@ -74,6 +78,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): # Import the custom curriculum print("------------------------------------------------------------") import curriculum_tutorial # which is this file + CURRICULUM = curriculum_tutorial.curriculum print("The number of training tasks in the curriculum:", len(CURRICULUM)) @@ -96,6 +101,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): CURRICULUM_FILE_PATH = "custom_curriculum_with_embedding.pkl" with open(CURRICULUM_FILE_PATH, "wb") as f: import dill + dill.dump(CURRICULUM, f) print("All training tasks are picklable.") @@ -105,6 +111,7 @@ def PracticeInventoryManagement(gs, subject, space, num_tick): print("------------------------------------------------------------") print("Generating the task spec with embedding file ...") from task_encoder import TaskEncoder + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" # Get the task embeddings for the training tasks and save to file diff --git a/curriculum_generation/manual_curriculum.py b/curriculum_generation/manual_curriculum.py index 37fe9e11..0f207573 100644 --- a/curriculum_generation/manual_curriculum.py +++ b/curriculum_generation/manual_curriculum.py @@ -1,9 +1,7 @@ """Manual test for creating learning curriculum manually""" -# pylint: disable=invalid-name,redefined-outer-name,bad-builtin -# pylint: disable=wildcard-import,unused-wildcard-import + from typing import List -import nmmo import nmmo.lib.material as m import nmmo.systems.item as Item import nmmo.systems.skill as Skill @@ -64,14 +62,14 @@ "DRINK_WATER", ] for event_code in most_essentials: - for cnt in range(1, 10): - curriculum.append( - TaskSpec( - eval_fn=CountEvent, - eval_fn_kwargs={"event": event_code, "N": cnt}, - sampling_weight=100, + for cnt in range(1, 10): + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": cnt}, + sampling_weight=100, + ) ) - ) essential_skills = [ "SCORE_HIT", @@ -85,14 +83,14 @@ "BUY_ITEM", ] for event_code in essential_skills: - for cnt in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=CountEvent, - eval_fn_kwargs={"event": event_code, "N": cnt}, - sampling_weight=20, + for cnt in EVENT_NUMBER_GOAL: + curriculum.append( + TaskSpec( + eval_fn=CountEvent, + eval_fn_kwargs={"event": event_code, "N": cnt}, + sampling_weight=20, + ) ) - ) # item/market skills, which happen less frequently or should not do too much item_skills = [ @@ -101,248 +99,241 @@ "GIVE_GOLD", ] for event_code in item_skills: - curriculum += [ - TaskSpec(eval_fn=CountEvent, eval_fn_kwargs={ - "event": event_code, "N": cnt}) - for cnt in INFREQUENT_GOAL - ] # less than 10 + curriculum += [ + TaskSpec(eval_fn=CountEvent, eval_fn_kwargs={"event": event_code, "N": cnt}) + for cnt in INFREQUENT_GOAL + ] # less than 10 # find resource tiles for resource in m.Harvestable: - curriculum.append( - TaskSpec( - eval_fn=CanSeeTile, - eval_fn_kwargs={"tile_type": resource}, - sampling_weight=10, - ) - ) + curriculum.append( + TaskSpec( + eval_fn=CanSeeTile, + eval_fn_kwargs={"tile_type": resource}, + sampling_weight=10, + ) + ) + # practice particular skill with a tool def 
PracticeSkillWithTool(gs, subject, skill, exp): - return 0.3 * EquipItem(gs, subject, item=TOOL_FOR_SKILL[skill], level=1, num_agent=1) + \ - 0.7 * GainExperience(gs, subject, skill, exp, num_agent=1) + return 0.3 * EquipItem( + gs, subject, item=TOOL_FOR_SKILL[skill], level=1, num_agent=1 + ) + 0.7 * GainExperience(gs, subject, skill, exp, num_agent=1) + for skill in SKILLS: - # level up a skill - for level in LEVEL_GOAL[1:]: - # since this is an agent task, num_agent must be 1 - curriculum.append( - TaskSpec( - eval_fn=AttainSkill, - eval_fn_kwargs={"skill": skill, "level": level, "num_agent": 1}, - sampling_weight=10*(6 - level) if level < 6 else 5, + # level up a skill + for level in LEVEL_GOAL[1:]: + # since this is an agent task, num_agent must be 1 + curriculum.append( + TaskSpec( + eval_fn=AttainSkill, + eval_fn_kwargs={"skill": skill, "level": level, "num_agent": 1}, + sampling_weight=10 * (6 - level) if level < 6 else 5, + ) ) - ) - - # gain experience on particular skill - for exp in EXP_GOAL: - curriculum.append( - TaskSpec( - eval_fn=PracticeSkillWithTool, - eval_fn_kwargs={"skill": skill, "exp": exp}, - sampling_weight=50, + + # gain experience on particular skill + for exp in EXP_GOAL: + curriculum.append( + TaskSpec( + eval_fn=PracticeSkillWithTool, + eval_fn_kwargs={"skill": skill, "exp": exp}, + sampling_weight=50, + ) ) - ) # stay alive ... like ... for 300 ticks # i.e., getting incremental reward for each tick alive as an individual or a team for num_tick in STAY_ALIVE_GOAL: - curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": num_tick})) + curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": num_tick})) # occupy the center tile, assuming the Medium map size # TODO: it'd be better to have some intermediate targets toward the center -curriculum.append(TaskSpec(eval_fn=OccupyTile, - eval_fn_kwargs={"row": 80, "col": 80})) +curriculum.append(TaskSpec(eval_fn=OccupyTile, eval_fn_kwargs={"row": 80, "col": 80})) # find the other team leader for target in ["left_team_leader", "right_team_leader"]: - curriculum.append(TaskSpec(eval_fn=CanSeeAgent, - eval_fn_kwargs={"target": target})) + curriculum.append(TaskSpec(eval_fn=CanSeeAgent, eval_fn_kwargs={"target": target})) # find the other team (any agent) for target in ["left_team", "right_team"]: - curriculum.append(TaskSpec(eval_fn=CanSeeGroup, - eval_fn_kwargs={"target": target})) + curriculum.append(TaskSpec(eval_fn=CanSeeGroup, eval_fn_kwargs={"target": target})) # practice specific combat style for style in COMBAT_STYLE: - for cnt in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=ScoreHit, - eval_fn_kwargs={"combat_style": style, "N": cnt}, - sampling_weight=5, + for cnt in EVENT_NUMBER_GOAL: + curriculum.append( + TaskSpec( + eval_fn=ScoreHit, + eval_fn_kwargs={"combat_style": style, "N": cnt}, + sampling_weight=5, + ) ) - ) # hoarding gold -- evaluated on the current gold for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=HoardGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10 - ) - ) + curriculum.append( + TaskSpec(eval_fn=HoardGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10) + ) # earning gold -- evaluated on the total gold earned by selling items for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={ - "amount": amount}, sampling_weight=10) - ) + curriculum.append( + TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": amount}, sampling_weight=10) + ) # spending gold, by 
buying items for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=SpendGold, eval_fn_kwargs={"amount": amount}, sampling_weight=5 - ) - ) + curriculum.append( + TaskSpec(eval_fn=SpendGold, eval_fn_kwargs={"amount": amount}, sampling_weight=5) + ) # making profits by trading -- only buying and selling are counted for amount in EVENT_NUMBER_GOAL: - curriculum.append( - TaskSpec( - eval_fn=MakeProfit, eval_fn_kwargs={"amount": amount}, sampling_weight=3 - ) - ) + curriculum.append( + TaskSpec(eval_fn=MakeProfit, eval_fn_kwargs={"amount": amount}, sampling_weight=3) + ) + # managing inventory space def PracticeInventoryManagement(gs, subject, space, num_tick): - return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick) + return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick) + for space in [2, 4, 8]: - curriculum += [ - TaskSpec( - eval_fn=PracticeInventoryManagement, - eval_fn_kwargs={"space": space, "num_tick": num_tick}, - ) - for num_tick in STAY_ALIVE_GOAL - ] + curriculum += [ + TaskSpec( + eval_fn=PracticeInventoryManagement, + eval_fn_kwargs={"space": space, "num_tick": num_tick}, + ) + for num_tick in STAY_ALIVE_GOAL + ] # own item, evaluated on the current inventory for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=OwnItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=OwnItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # equip item, evaluated on the current inventory and equipment status for item in EQUIP_ITEM: - for level in LEVEL_GOAL: - # agent task - curriculum.append( - TaskSpec( - eval_fn=EquipItem, - eval_fn_kwargs={"item": item, "level": level, "num_agent": 1}, - sampling_weight=4 - level if level < 4 else 1, - ) - ) - -# consume items (ration, potion), evaluated based on the event log -for item in Item.CONSUMABLE: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune + for level in LEVEL_GOAL: + # agent task curriculum.append( TaskSpec( - eval_fn=ConsumeItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, + eval_fn=EquipItem, + eval_fn_kwargs={"item": item, "level": level, "num_agent": 1}, sampling_weight=4 - level if level < 4 else 1, ) ) +# consume items (ration, potion), evaluated based on the event log +for item in Item.CONSUMABLE: + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=ConsumeItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) + # harvest items, evaluated based on the event log for item in HARVEST_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=HarvestItem, - eval_fn_kwargs={ - "item": item, - "level": level, - 
"quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=HarvestItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # list items, evaluated based on the event log for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=ListItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=ListItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) # buy items, evaluated based on the event log for item in ALL_ITEM: - for level in LEVEL_GOAL: - # agent task - for quantity in ITEM_NUM_GOAL: - if level + quantity <= 6 or quantity == 1: # heuristic prune - curriculum.append( - TaskSpec( - eval_fn=BuyItem, - eval_fn_kwargs={ - "item": item, - "level": level, - "quantity": quantity, - }, - sampling_weight=4 - level if level < 4 else 1, - ) - ) + for level in LEVEL_GOAL: + # agent task + for quantity in ITEM_NUM_GOAL: + if level + quantity <= 6 or quantity == 1: # heuristic prune + curriculum.append( + TaskSpec( + eval_fn=BuyItem, + eval_fn_kwargs={ + "item": item, + "level": level, + "quantity": quantity, + }, + sampling_weight=4 - level if level < 4 else 1, + ) + ) if __name__ == "__main__": - import multiprocessing as mp - from contextlib import contextmanager - - import dill - import numpy as np - import psutil - - @contextmanager - def create_pool(num_proc): - pool = mp.Pool(processes=num_proc) - yield pool - pool.close() - pool.join() - - # 1609 task specs: divide the specs into chunks - num_workers = round(psutil.cpu_count(logical=False)*0.7) - spec_chunks = np.array_split(curriculum, num_workers) - with create_pool(num_workers) as pool: - pool.map(check_task_spec, spec_chunks) - - # test if the task spec is pickalable - with open("pickle_test.pkl", "wb") as f: - dill.dump(curriculum, f) \ No newline at end of file + import multiprocessing as mp + from contextlib import contextmanager + + import dill + import numpy as np + import psutil + + @contextmanager + def create_pool(num_proc): + pool = mp.Pool(processes=num_proc) + yield pool + pool.close() + pool.join() + + # 1609 task specs: divide the specs into chunks + num_workers = round(psutil.cpu_count(logical=False) * 0.7) + spec_chunks = np.array_split(curriculum, num_workers) + with create_pool(num_workers) as pool: + pool.map(check_task_spec, spec_chunks) + + # test if the task spec is pickalable + with open("pickle_test.pkl", "wb") as f: + dill.dump(curriculum, f) diff --git a/curriculum_generation/task_encoder.py b/curriculum_generation/task_encoder.py index 7e6f1409..4a2ff489 100644 --- a/curriculum_generation/task_encoder.py +++ b/curriculum_generation/task_encoder.py @@ -14,17 +14,23 @@ def extract_module_fn(module: ModuleType): - fn_dict = {} - for name, fn in module.__dict__.items(): - if inspect.isfunction(fn) and not inspect.isbuiltin(fn) and not 
name.startswith("_"): - fn_dict[name] = fn - return fn_dict + fn_dict = {} + for name, fn in module.__dict__.items(): + if inspect.isfunction(fn) and not inspect.isbuiltin(fn) and not name.startswith("_"): + fn_dict[name] = fn + return fn_dict class TaskEncoder: """A class for encoding tasks into embeddings using a pretrained model.""" - - def __init__(self, checkpoint: str, context: ModuleType, batch_size=2, tmp_file_path="tmp_task_encoder.pkl"): + + def __init__( + self, + checkpoint: str, + context: ModuleType, + batch_size=2, + tmp_file_path="tmp_task_encoder.pkl", + ): """ Initialize the TaskEncoder. @@ -38,13 +44,13 @@ def __init__(self, checkpoint: str, context: ModuleType, batch_size=2, tmp_file_ self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) self.tokenizer.pad_token = self.tokenizer.eos_token if self.device == "cuda": - self.model = AutoModelForCausalLM.from_pretrained(checkpoint, - trust_remote_code=True, - device_map="auto", - torch_dtype=torch.bfloat16) + self.model = AutoModelForCausalLM.from_pretrained( + checkpoint, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16 + ) else: - self.model = AutoModelForCausalLM.from_pretrained(checkpoint, - trust_remote_code=True).to(self.device) + self.model = AutoModelForCausalLM.from_pretrained( + checkpoint, trust_remote_code=True + ).to(self.device) self.model.eval() self.batch_size = batch_size self.temp_file_path = tmp_file_path @@ -70,10 +76,14 @@ def _get_embedding(self, prompts: List[str]) -> list: all_embeddings = [] with torch.no_grad(): for i in tqdm(range(0, len(prompts), self.batch_size)): - batch = prompts[i: i + self.batch_size] - tokens = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(self.device) + batch = prompts[i : i + self.batch_size] + tokens = self.tokenizer( + batch, return_tensors="pt", padding=True, truncation=True + ).to(self.device) outputs = self.model(**tokens, output_hidden_states=True) - embeddings = outputs.hidden_states[-1].mean(dim=1).detach().cpu().to(torch.float32).numpy() + embeddings = ( + outputs.hidden_states[-1].mean(dim=1).detach().cpu().to(torch.float32).numpy() + ) all_embeddings.extend(embeddings.astype(np.float16)) return all_embeddings @@ -88,8 +98,18 @@ def _get_task_deps_src(self, eval_fn) -> tuple: A tuple with source code and dependencies of eval_fn. """ eval_src = inspect.getsource(eval_fn) - deps_fns = [node.func.id for node in ast.walk(ast.parse(eval_src)) if isinstance(node, ast.Call) and isinstance(node.func, ast.Name)] - deps_src = "\n".join([inspect.getsource(self._fn_dict[fn_name]) for fn_name in deps_fns if fn_name in self._fn_dict]) + deps_fns = [ + node.func.id + for node in ast.walk(ast.parse(eval_src)) + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) + ] + deps_src = "\n".join( + [ + inspect.getsource(self._fn_dict[fn_name]) + for fn_name in deps_fns + if fn_name in self._fn_dict + ] + ) return eval_src, deps_src def _construct_prompt(self, reward_to, eval_fn, eval_fn_kwargs) -> str: @@ -126,7 +146,12 @@ def get_task_embedding(self, task_spec_list: List[ts.TaskSpec], save_to_file: st Updated task specifications with embeddings. """ assert self.model is not None, "Model has been unloaded. Re-initialize the TaskEncoder." 
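# --- Editor's aside (illustrative sketch, not part of the patch) -----------------------
# The embedding scheme in TaskEncoder._get_embedding boils down to: tokenize a batch of
# prompts, run the causal LM with output_hidden_states=True, and mean-pool the final
# hidden state over token positions (padding included). A minimal standalone version,
# CPU-only for brevity (the real class also handles CUDA / bfloat16), with the checkpoint
# named elsewhere in this patch used as a placeholder default:
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def embed_prompts(prompts, checkpoint="deepseek-ai/deepseek-coder-1.3b-instruct", batch_size=2):
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # required to pad batched prompts
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i in range(0, len(prompts), batch_size):
            tokens = tokenizer(
                prompts[i : i + batch_size], return_tensors="pt", padding=True, truncation=True
            )
            outputs = model(**tokens, output_hidden_states=True)
            # (batch, seq_len, hidden) -> (batch, hidden): mean over token positions
            pooled = outputs.hidden_states[-1].mean(dim=1).to(torch.float32).numpy()
            embeddings.extend(pooled.astype(np.float16))
    return embeddings
# ----------------------------------------------------------------------------------------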
- prompts = [self._construct_prompt(single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs) for single_spec in task_spec_list] + prompts = [ + self._construct_prompt( + single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs + ) + for single_spec in task_spec_list + ] embeddings = self._get_embedding(prompts) for single_spec, embedding in zip(task_spec_list, embeddings): @@ -155,11 +180,9 @@ def __exit__(self, exc_type, exc_value, traceback): if __name__ == "__main__": import curriculum_generation.manual_curriculum as curriculum + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" CURRICULUM_FILE_PATH = "curriculum_generation/curriculum_with_embedding.pkl" with TaskEncoder(LLM_CHECKPOINT, curriculum, batch_size=6) as task_encoder: - task_encoder.get_task_embedding( - curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) + task_encoder.get_task_embedding(curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH) diff --git a/curriculum_generation/task_sampler.py b/curriculum_generation/task_sampler.py index eba88def..b6ad2603 100644 --- a/curriculum_generation/task_sampler.py +++ b/curriculum_generation/task_sampler.py @@ -4,10 +4,9 @@ from nmmo.task import task_spec as ts + class LearnableTaskSampler: - def __init__(self, - task_spec: List[ts.TaskSpec], - average_window = 50): + def __init__(self, task_spec: List[ts.TaskSpec], average_window=50): self.task_spec = task_spec self.name_to_spec = {single_spec.name: single_spec for single_spec in self.task_spec} self.task_stats = {} @@ -27,12 +26,12 @@ def update(self, infos, prefix="curriculum/"): for key, val in infos.items(): # Process the new infos if key.startswith(prefix): - spec_name = key.replace(prefix,"") - completed, prog_over_10pcnt, rcnt_over_2 = [], [], [] + spec_name = key.replace(prefix, "") + completed, _, rcnt_over_2 = [], [], [] for sublist in val: for prog, rcnt in sublist: completed.append(float(prog >= 1)) - rcnt_over_2.append(float(rcnt >= 2)) # rewarded >= 2 times + rcnt_over_2.append(float(rcnt >= 2)) # rewarded >= 2 times # Add to the task_stats if spec_name not in self.task_stats: @@ -42,32 +41,37 @@ def update(self, infos, prefix="curriculum/"): # Keep only the recent values -- self.average_window (50) for key, vals in self.task_stats[spec_name].items(): - self.task_stats[spec_name][key] = vals[-self.average_window:] + self.task_stats[spec_name][key] = vals[-self.average_window :] - def get_learnable_tasks(self, num_tasks, - max_completed = 0.8, # filter out easy tasks - min_completed = 0.05, # filter out harder tasks - min_rcnt_rate = 0.1, # reward signal generating + def get_learnable_tasks( + self, + num_tasks, + max_completed=0.8, # filter out easy tasks + min_completed=0.05, # filter out harder tasks + min_rcnt_rate=0.1, # reward signal generating ) -> List[ts.TaskSpec]: learnable = [] for spec_name, stat in self.task_stats.items(): completion_rate = np.mean(stat["completed"]) rcnt_over2_rate = np.mean(stat["rcnt_over_2"]) - if completion_rate < max_completed and\ - (completion_rate >= min_completed or rcnt_over2_rate >= min_rcnt_rate): + if completion_rate < max_completed and ( + completion_rate >= min_completed or rcnt_over2_rate >= min_rcnt_rate + ): learnable.append(self.name_to_spec[spec_name]) if len(learnable) > num_tasks: return list(np.random.choice(learnable, num_tasks)) return learnable - def sample_tasks(self, num_tasks, - random_ratio = 0.5, - reset_sampling_weight = True, + def sample_tasks( + self, + num_tasks, + random_ratio=0.5, + 
reset_sampling_weight=True, ) -> List[ts.TaskSpec]: task_spec = [] if 0 <= random_ratio < 1: - num_learnable = round(num_tasks * (1-random_ratio)) + num_learnable = round(num_tasks * (1 - random_ratio)) task_spec = self.get_learnable_tasks(num_learnable) # fill in with the randomly-sampled tasks diff --git a/evaluate.py b/evaluate.py index c3ac9880..7455410f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -17,75 +17,97 @@ from train import get_init_args NUM_AGENTS = 128 -EVAL_TASK_FILE = 'neurips23_evaluation/sample_eval_task_with_embedding.pkl' +EVAL_TASK_FILE = "neurips23_evaluation/sample_eval_task_with_embedding.pkl" NUM_PVE_EVAL_EPISODE = 32 NUM_PVP_EVAL_EPISODE = 200 # TODO: cannot do more due to memory leak + def get_eval_config(debug=False): return { - 'device': 'cuda', - 'num_envs': 6 if not debug else 1, - 'batch_size': 2**15 if not debug else 2**12, + "device": "cuda", + "num_envs": 6 if not debug else 1, + "batch_size": 2**15 if not debug else 2**12, } -class EvalConfig(nc.Medium, nc.Terrain, nc.Resource, nc.Combat, nc.NPC, nc.Progression, - nc.Item, nc.Equipment, nc.Profession, nc.Exchange): - '''NMMO config for NeurIPS 2023 competition evaluation. - Hardcoded to keep the eval config independent from the training config. - ''' +class EvalConfig( + nc.Medium, + nc.Terrain, + nc.Resource, + nc.Combat, + nc.NPC, + nc.Progression, + nc.Item, + nc.Equipment, + nc.Profession, + nc.Exchange, +): + """NMMO config for NeurIPS 2023 competition evaluation. + Hardcoded to keep the eval config independent from the training config. + """ + def __init__(self, task_file, mode): super().__init__() - self.set('GAME_PACKS', [(nmmo.core.game_api.AgentTraining, 1)]) - self.set('CURRICULUM_FILE_PATH', task_file) - self.set('TASK_EMBED_DIM', 2048) # must match the task file + self.set("GAME_PACKS", [(nmmo.core.game_api.AgentTraining, 1)]) + self.set("CURRICULUM_FILE_PATH", task_file) + self.set("TASK_EMBED_DIM", 2048) # must match the task file # Eval constants - self.set('PROVIDE_ACTION_TARGETS', True) - self.set('PROVIDE_NOOP_ACTION_TARGET', True) - self.set('PLAYER_N', NUM_AGENTS) - self.set('HORIZON', 1024) - self.set('PLAYER_DEATH_FOG', None) - self.set('NPC_N', 256) - self.set('RESOURCE_RESILIENT_POPULATION', 0) - self.set('COMBAT_SPAWN_IMMUNITY', 20) + self.set("PROVIDE_ACTION_TARGETS", True) + self.set("PROVIDE_NOOP_ACTION_TARGET", True) + self.set("PLAYER_N", NUM_AGENTS) + self.set("HORIZON", 1024) + self.set("PLAYER_DEATH_FOG", None) + self.set("NPC_N", 256) + self.set("RESOURCE_RESILIENT_POPULATION", 0) + self.set("COMBAT_SPAWN_IMMUNITY", 20) # Map related - self.set('TERRAIN_FLIP_SEED', True) - self.set('MAP_CENTER', 128) - self.set('MAP_FORCE_GENERATION', False) - self.set('MAP_GENERATE_PREVIEWS', True) - if mode not in ['pve', 'pvp']: - raise ValueError(f'Invalid eval_mode: {mode}') - if mode == 'pve': - self.set('MAP_N', 4) - self.set('PATH_MAPS', 'maps/pve_eval/') + self.set("TERRAIN_FLIP_SEED", True) + self.set("MAP_CENTER", 128) + self.set("MAP_FORCE_GENERATION", False) + self.set("MAP_GENERATE_PREVIEWS", True) + if mode not in ["pve", "pvp"]: + raise ValueError(f"Invalid eval_mode: {mode}") + if mode == "pve": + self.set("MAP_N", 4) + self.set("PATH_MAPS", "maps/pve_eval/") else: - self.set('MAP_N', 256) - self.set('PATH_MAPS', 'maps/pvp_eval/') + self.set("MAP_N", 256) + self.set("PATH_MAPS", "maps/pvp_eval/") + def make_env_creator(task_file, mode): def env_creator(*args, **kwargs): # dummy args env = nmmo.Env(EvalConfig(task_file, mode)) # Reward wrapper is for the learner, which 
is not used in evaluation env = default_learner.RewardWrapper( - env, **{'eval_mode': True, 'early_stop_agent_num': 0,} + env, + **{ + "eval_mode": True, + "early_stop_agent_num": 0, + }, ) env = pufferlib.emulation.PettingZooPufferEnv(env) return env + return env_creator + def make_agent_creator(): # NOTE: Assuming all policies are recurrent, which may not be true policy_args = get_init_args(default_learner.Policy.__init__) recurrent_args = get_init_args(default_learner.Recurrent.__init__) + def agent_creator(env, args=None): policy = default_learner.Policy(env, **policy_args) policy = default_learner.Recurrent(env, policy, **recurrent_args) policy = pufferlib.frameworks.cleanrl.RecurrentPolicy(policy) - return policy.to(get_eval_config()['device']) + return policy.to(get_eval_config()["device"]) + return agent_creator + class EvalRunner: def __init__(self, policy_store_dir, debug=False): self.policy_store_dir = policy_store_dir @@ -96,22 +118,24 @@ def set_debug(self, debug): def setup_evaluator(self, mode, task_file, seed): policies = pp.get_policy_names(self.policy_store_dir) - assert len(policies) > 0, 'No policies found in eval_model_path' - if mode == 'pve': - assert len(policies) == 1, 'PvE mode requires only one policy' - logging.info(f'Policies to evaluate: {policies}') + assert len(policies) > 0, "No policies found in eval_model_path" + if mode == "pve": + assert len(policies) == 1, "PvE mode requires only one policy" + logging.info(f"Policies to evaluate: {policies}") # pool_kernel determines policy-agent mapping - pool_kernel = pp.create_kernel(NUM_AGENTS, len(policies), - shuffle_with_seed=seed) + pool_kernel = pp.create_kernel(NUM_AGENTS, len(policies), shuffle_with_seed=seed) config = self.get_pufferl_config(self._debug) config.seed = seed config.data_dir = self.policy_store_dir config.pool_kernel = pool_kernel - vectorization = pufferlib.vectorization.Serial if self._debug \ + vectorization = ( + pufferlib.vectorization.Serial + if self._debug else pufferlib.vectorization.Multiprocessing + ) return clean_pufferl.create( config=config, @@ -127,14 +151,14 @@ def setup_evaluator(self, mode, task_file, seed): def get_pufferl_config(debug=False): config = get_eval_config(debug) # add required configs - config['torch_deterministic'] = True - config['total_timesteps'] = 100_000_000 # arbitrarily large, but will end much earlier - config['envs_per_batch'] = config['num_envs'] - config['envs_per_worker'] = 1 - config['env_pool'] = False - config['learning_rate'] = 1e-4 - config['compile'] = False - config['verbose'] = True # not debug + config["torch_deterministic"] = True + config["total_timesteps"] = 100_000_000 # arbitrarily large, but will end much earlier + config["envs_per_batch"] = config["num_envs"] + config["envs_per_worker"] = 1 + config["env_pool"] = False + config["learning_rate"] = 1e-4 + config["compile"] = False + config["verbose"] = True # not debug return pufferlib.namespace(**config) def perform_eval(self, mode, task_file, seed, num_eval_episode, save_file_prefix): @@ -148,40 +172,38 @@ def perform_eval(self, mode, task_file, seed, num_eval_episode, save_file_prefix _, infos = clean_pufferl.evaluate(pufferl_data) for pol, vals in infos.items(): - cnt_episode += sum(infos[pol]['episode_done']) + cnt_episode += sum(infos[pol]["episode_done"]) if pol not in eval_results: eval_results[pol] = defaultdict(list) for k, v in vals.items(): - if k == 'length': + if k == "length": eval_results[pol][k] += v # length is a plain list - if k.startswith('curriculum'): + if 
k.startswith("curriculum"): eval_results[pol][k] += [vv[0] for vv in v] pufferl_data.sort_keys = [] # TODO: check if this solves memory leak - print(f'\nSeed: {seed}, evaluated {cnt_episode} episodes.\n') + print(f"\nSeed: {seed}, evaluated {cnt_episode} episodes.\n") - file_name = f'{save_file_prefix}_{seed}.json' + file_name = f"{save_file_prefix}_{seed}.json" self._save_results(eval_results, file_name) clean_pufferl.close(pufferl_data) return eval_results, file_name def _save_results(self, results, file_name): - with open(os.path.join(self.policy_store_dir, file_name), 'w') as f: + with open(os.path.join(self.policy_store_dir, file_name), "w") as f: json.dump(results, f) - def run(self, mode, - task_file=EVAL_TASK_FILE, - seed=None, - num_episode=None, - save_file_prefix=None): - assert mode in ['pve', 'pvp'], f'Invalid mode: {mode}' - if mode == 'pve': + def run( + self, mode, task_file=EVAL_TASK_FILE, seed=None, num_episode=None, save_file_prefix=None + ): + assert mode in ["pve", "pvp"], f"Invalid mode: {mode}" + if mode == "pve": num_episode = num_episode or NUM_PVE_EVAL_EPISODE - save_file_prefix = save_file_prefix or 'eval_pve' + save_file_prefix = save_file_prefix or "eval_pve" else: num_episode = num_episode or NUM_PVP_EVAL_EPISODE - save_file_prefix = save_file_prefix or 'eval_pvp' + save_file_prefix = save_file_prefix or "eval_pvp" if self._debug: num_episode = 4 @@ -189,25 +211,33 @@ def run(self, mode, if seed is None: seed = random.randint(10000000, 99999999) - logging.info(f'Evaluating {self.policy_store_dir} in the {mode} mode with seed: {seed}') - logging.info(f'Using the task file: {task_file}') + logging.info(f"Evaluating {self.policy_store_dir} in the {mode} mode with seed: {seed}") + logging.info(f"Using the task file: {task_file}") _, file_name = self.perform_eval(mode, task_file, seed, num_episode, save_file_prefix) - print(f'Saved the result file to: {file_name}.') + print(f"Saved the result file to: {file_name}.") -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Evaluate a policy store') - parser.add_argument('policy_store_dir', type=str, help='Path to the policy directory') - parser.add_argument('mode', type=str, choices=['pve', 'pvp'], help='Evaluation mode') - parser.add_argument('-t', '--task-file', type=str, default=EVAL_TASK_FILE, help='Path to the task file') - parser.add_argument('-s', '--seed', type=int, default=1, help='Random seed') - parser.add_argument('-n', '--num-episode', type=int, default=None, help='Number of episodes to evaluate') - parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of times to repeat the evaluation') - parser.add_argument('--save-file-prefix', type=str, default=None, help='Prefix for the save file') - parser.add_argument('--debug', action='store_true', help='Debug mode') + parser = argparse.ArgumentParser(description="Evaluate a policy store") + parser.add_argument("policy_store_dir", type=str, help="Path to the policy directory") + parser.add_argument("mode", type=str, choices=["pve", "pvp"], help="Evaluation mode") + parser.add_argument( + "-t", "--task-file", type=str, default=EVAL_TASK_FILE, help="Path to the task file" + ) + parser.add_argument("-s", "--seed", type=int, default=1, help="Random seed") + parser.add_argument( + "-n", "--num-episode", type=int, default=None, help="Number of episodes to evaluate" + ) + parser.add_argument( + "-r", "--repeat", type=int, default=1, help="Number of times to repeat 
the evaluation" + ) + parser.add_argument( + "--save-file-prefix", type=str, default=None, help="Prefix for the save file" + ) + parser.add_argument("--debug", action="store_true", help="Debug mode") args = parser.parse_args() runner = EvalRunner(args.policy_store_dir, args.debug) diff --git a/neurips23_evaluation/export_embeddings.py b/neurips23_evaluation/export_embeddings.py index 55d1ae39..28b33a25 100644 --- a/neurips23_evaluation/export_embeddings.py +++ b/neurips23_evaluation/export_embeddings.py @@ -8,7 +8,7 @@ eval_tasks = dill.load(f) # metadata: task name (including full info), predicate, kwargs, sampling weights, training vs evaluation -# group by +# group by # - train vs eval # - predicate @@ -17,29 +17,35 @@ embeddings = [] metadata = [] + def get_task_predicate(spec): name = spec.name.split("_")[1] if name == "CountEvent": return name + "=" + spec.eval_fn_kwargs["event"] return name + for spec in curriculum: embeddings.append(spec.embedding) - metadata.append({ - "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), - "predicate": get_task_predicate(spec), - "used_for": "train", - "sampling_weight": spec.sampling_weight, - }) + metadata.append( + { + "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), + "predicate": get_task_predicate(spec), + "used_for": "train", + "sampling_weight": spec.sampling_weight, + } + ) for spec in eval_tasks: embeddings.append(spec.embedding) - metadata.append({ - "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), - "predicate": get_task_predicate(spec), - "used_for": "eval", - "sampling_weight": spec.sampling_weight, - }) + metadata.append( + { + "task_name": spec.name.replace("Task_", "").replace("_reward_to:agent", ""), + "predicate": get_task_predicate(spec), + "used_for": "eval", + "sampling_weight": spec.sampling_weight, + } + ) embed_df = pl.DataFrame(embeddings) diff --git a/neurips23_evaluation/heldout_evaluation_task.py b/neurips23_evaluation/heldout_evaluation_task.py index 1fbc2bcc..1d407419 100644 --- a/neurips23_evaluation/heldout_evaluation_task.py +++ b/neurips23_evaluation/heldout_evaluation_task.py @@ -1,9 +1,23 @@ """Held-out evaluation tasks for NeurIPS 2023 competition.""" + from typing import List from nmmo.systems import skill as s from nmmo.systems import item as i -from nmmo.task.base_predicates import * +from nmmo.task.base_predicates import ( + AttainSkill, + ConsumeItem, + CountEvent, + DefeatEntity, + EarnGold, + EquipItem, + FullyArmed, + HarvestItem, + HoardGold, + MakeProfit, + OccupyTile, + TickGE, +) from nmmo.task.task_spec import TaskSpec, check_task_spec @@ -16,9 +30,7 @@ curriculum: List[TaskSpec] = [] # Survive to the end -curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) -) +curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})) # Kill 20 players/npcs curriculum.append( @@ -117,34 +129,20 @@ ) # Earn 100 gold (revenue), just by trading -curriculum.append( - TaskSpec( - eval_fn=EarnGold, - eval_fn_kwargs={"amount": GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": GOLD_GOAL})) # Own and protect 100 gold by any means (looting or trading) -curriculum.append( - TaskSpec( - eval_fn=HoardGold, - eval_fn_kwargs={"amount": GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=HoardGold, eval_fn_kwargs={"amount": GOLD_GOAL})) # Make profit of 100 gold by any means -curriculum.append( - TaskSpec( - eval_fn=MakeProfit, - eval_fn_kwargs={"amount": 
GOLD_GOAL} - ) -) +curriculum.append(TaskSpec(eval_fn=MakeProfit, eval_fn_kwargs={"amount": GOLD_GOAL})) if __name__ == "__main__": # Import the custom curriculum print("------------------------------------------------------------") from neurips23_evaluation import heldout_evaluation_task # which is this file + CURRICULUM = heldout_evaluation_task.curriculum print("The number of training tasks in the curriculum:", len(CURRICULUM)) @@ -166,6 +164,7 @@ print("Checking if the training tasks are picklable ...") with open(CURRICULUM_FILE_PATH, "wb") as f: import dill + dill.dump(CURRICULUM, f) print("All training tasks are picklable.") @@ -175,6 +174,7 @@ print("------------------------------------------------------------") print("Generating the task spec with embedding file ...") from curriculum_generation.task_encoder import TaskEncoder + LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" # Get the task embeddings for the training tasks and save to file diff --git a/neurips23_evaluation/sample_evaluation_task.py b/neurips23_evaluation/sample_evaluation_task.py index 6a06fa56..fce5e394 100644 --- a/neurips23_evaluation/sample_evaluation_task.py +++ b/neurips23_evaluation/sample_evaluation_task.py @@ -1,6 +1,5 @@ """Manual test for creating learning curriculum manually""" -# pylint: disable=invalid-name,redefined-outer-name,bad-builtin -# pylint: disable=wildcard-import,unused-wildcard-import + from typing import List from nmmo.systems import skill as s @@ -13,9 +12,7 @@ curriculum: List[TaskSpec] = [] # Stay alive as long as possible -curriculum.append( - TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024}) -) +curriculum.append(TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})) # Perform these 10 times essential_skills = [ @@ -52,9 +49,7 @@ ) # Earn gold 50 -curriculum.append( - TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50}) -) +curriculum.append(TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50})) if __name__ == "__main__": from neurips23_evaluation import sample_evaluation_task as curriculum @@ -63,7 +58,4 @@ LLM_CHECKPOINT = "deepseek-ai/deepseek-coder-1.3b-instruct" with TaskEncoder(LLM_CHECKPOINT, curriculum, batch_size=6) as task_encoder: - task_encoder.get_task_embedding( - curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) + task_encoder.get_task_embedding(curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..7a4d30ed --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,121 @@ +[project] +name = "nmmo2-baselines" +version = "0.1.0" +description = "Neural MMO 2.1 baselines" +keywords = [] +classifiers = [ + "Natural Language :: English", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", +] +dependencies = [ + "accelerate==0.27.2", + "nmmo@git+https://github.com/kywch/nmmo-environment", # WIP nmmo 2.1 + "polars", + "pufferlib[nmmo]==0.7.3", + "psutil", + "torch>=2.1", + "transformers==4.37.2", + "wandb", +] + +[tool.setuptools.packages.find] +where = ["."] +exclude = ["tests"] + +[project.optional-dependencies] +monitoring = [ + "nvitop" +] +dev = [ + "pre-commit", + "ruff" +] + +[tool.distutils.bdist_wheel] +universal = true + +[tool.ruff] +# Exclude a variety of commonly ignored directories. 
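# --- Editor's aside (usage note, not part of the patch) --------------------------------
# With the optional "dev" extra declared above, the expected local workflow is roughly:
# install with `pip install -e '.[dev]'`, run `pre-commit install` once so the ruff hooks
# run on every commit, and use `ruff format .` / `ruff check .` to apply the formatter and
# linter configured by the [tool.ruff] tables that follow. (These commands are an
# assumption based on the declared dev dependencies, not taken from this patch.)
# ----------------------------------------------------------------------------------------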
+exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 100 +indent-width = 4 + +# Assume Python 3.10 +target-version = "py310" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] # Ignore imported but unused + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" diff --git a/reinforcement_learning/clean_pufferl.py b/reinforcement_learning/clean_pufferl.py index 5824483f..f72e76c7 100644 --- a/reinforcement_learning/clean_pufferl.py +++ b/reinforcement_learning/clean_pufferl.py @@ -1,5 +1,7 @@ +# ruff: noqa: E722, F841 + # Copied from: https://github.com/PufferAI/PufferLib/blob/0.7/clean_pufferl.py -from pdb import set_trace as T +# from pdb import set_trace as T import os import random import time @@ -21,7 +23,7 @@ import pufferlib.frameworks.cleanrl import pufferlib.policy_pool -SKIP_LOG_KEYS = ['curriculum/Task_', 'env_id'] +SKIP_LOG_KEYS = ["curriculum/Task_", "env_id"] @pufferlib.dataclass @@ -42,6 +44,7 @@ class Performance: train_pytorch_memory = 0 misc_time = 0 + @pufferlib.dataclass class Losses: policy_loss = 0 @@ -52,6 +55,7 @@ class Losses: clipfrac = 0 explained_variance = 0 + @pufferlib.dataclass class Charts: global_step = 0 @@ -60,29 +64,26 @@ class Charts: agent_step = 0 agent_SPS = 0 + def create( - self: object = None, - config: pufferlib.namespace = None, - exp_name: str = None, - track: bool = False, - - # Agent - agent: nn.Module = None, - agent_creator: callable = None, - agent_kwargs: dict = None, - - # Environment - env_creator: callable = None, - env_creator_kwargs: dict = None, - vectorization: ... 
= pufferlib.vectorization.Serial, - - # Evaluation or replay mode - eval_mode: bool = False, - eval_model_path: str = None, - - # Policy Pool options - policy_selector: callable = None, - ): + self: object = None, + config: pufferlib.namespace = None, + exp_name: str = None, + track: bool = False, + # Agent + agent: nn.Module = None, + agent_creator: callable = None, + agent_kwargs: dict = None, + # Environment + env_creator: callable = None, + env_creator_kwargs: dict = None, + vectorization: ... = pufferlib.vectorization.Serial, + # Evaluation or replay mode + eval_mode: bool = False, + eval_model_path: str = None, + # Policy Pool options + policy_selector: callable = None, +): if config is None: config = pufferlib.args.CleanPuffeRL() @@ -121,24 +122,26 @@ def create( resume_state = {} path = os.path.join(config.data_dir, exp_name) if os.path.exists(path): - trainer_path = os.path.join(path, 'trainer_state.pt') + trainer_path = os.path.join(path, "trainer_state.pt") resume_state = torch.load(trainer_path) model_path = os.path.join(path, resume_state["model_name"]) agent = torch.load(model_path, map_location=device) - print(f'Resumed from update {resume_state["update"]} ' - f'with policy {resume_state["model_name"]}') + print( + f'Resumed from update {resume_state["update"]} ' + f'with policy {resume_state["model_name"]}' + ) else: agent = pufferlib.emulation.make_object( - agent, agent_creator, [pool.driver_env], agent_kwargs) + agent, agent_creator, [pool.driver_env], agent_kwargs + ) global_step = resume_state.get("global_step", 0) agent_step = resume_state.get("agent_step", 0) update = resume_state.get("update", 0) - optimizer = optim.Adam(agent.parameters(), - lr=config.learning_rate, eps=1e-5) + optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate, eps=1e-5) - uncompiled_agent = agent # Needed to save the model + uncompiled_agent = agent # Needed to save the model if config.compile: agent = torch.compile(agent, mode=config.compile_mode) @@ -157,8 +160,13 @@ def create( if policy_selector is None: policy_selector = pufferlib.policy_pool.RandomPolicySelector(config.seed) policy_pool = pufferlib.policy_pool.PolicyPool( - agent, pool_agents, atn_shape, device, pool_path, - config.pool_kernel, policy_selector, + agent, + pool_agents, + atn_shape, + device, + pool_path, + config.pool_kernel, + policy_selector, ) # Allocate Storage @@ -166,19 +174,19 @@ def create( next_lstm_state = [] pool.async_reset(config.seed) next_lstm_state = None - if hasattr(agent, 'lstm'): + if hasattr(agent, "lstm"): shape = (agent.lstm.num_layers, total_agents, agent.lstm.hidden_size) next_lstm_state = ( torch.zeros(shape).to(device), torch.zeros(shape).to(device), ) - obs=torch.zeros(config.batch_size + 1, *obs_shape) - actions=torch.zeros(config.batch_size + 1, *atn_shape, dtype=int) - logprobs=torch.zeros(config.batch_size + 1) - rewards=torch.zeros(config.batch_size + 1) - dones=torch.zeros(config.batch_size + 1) - truncateds=torch.zeros(config.batch_size + 1) - values=torch.zeros(config.batch_size + 1) + obs = torch.zeros(config.batch_size + 1, *obs_shape) + actions = torch.zeros(config.batch_size + 1, *atn_shape, dtype=int) + logprobs = torch.zeros(config.batch_size + 1) + rewards = torch.zeros(config.batch_size + 1) + dones = torch.zeros(config.batch_size + 1) + truncateds = torch.zeros(config.batch_size + 1) + values = torch.zeros(config.batch_size + 1) obs_ary = np.asarray(obs) actions_ary = np.asarray(actions) @@ -190,83 +198,85 @@ def create( storage_profiler.stop() - #"charts/actions": 
wandb.Histogram(b_actions.cpu().numpy()), + # "charts/actions": wandb.Histogram(b_actions.cpu().numpy()), init_performance = pufferlib.namespace( - init_time = time.time() - start_time, - init_env_time = init_profiler.elapsed, - init_env_memory = init_profiler.memory, - tensor_memory = storage_profiler.memory, - tensor_pytorch_memory = storage_profiler.pytorch_memory, + init_time=time.time() - start_time, + init_env_time=init_profiler.elapsed, + init_env_memory=init_profiler.memory, + tensor_memory=storage_profiler.memory, + tensor_pytorch_memory=storage_profiler.pytorch_memory, ) - - return pufferlib.namespace(self, + + return pufferlib.namespace( + self, # Agent, Optimizer, and Environment config=config, - pool = pool, - agent = agent, - uncompiled_agent = uncompiled_agent, - optimizer = optimizer, - policy_pool = policy_pool, - + pool=pool, + agent=agent, + uncompiled_agent=uncompiled_agent, + optimizer=optimizer, + policy_pool=policy_pool, # Logging - exp_name = exp_name, - wandb = wandb, + exp_name=exp_name, + wandb=wandb, learning_rate=config.learning_rate, - losses = Losses(), - init_performance = init_performance, - performance = Performance(), - + losses=Losses(), + init_performance=init_performance, + performance=Performance(), # Storage - sort_keys = [], - next_lstm_state = next_lstm_state, - obs = obs, - actions = actions, - logprobs = logprobs, - rewards = rewards, - dones = dones, - values = values, - obs_ary = obs_ary, - actions_ary = actions_ary, - logprobs_ary = logprobs_ary, - rewards_ary = rewards_ary, - dones_ary = dones_ary, - truncateds_ary = truncateds_ary, - values_ary = values_ary, - + sort_keys=[], + next_lstm_state=next_lstm_state, + obs=obs, + actions=actions, + logprobs=logprobs, + rewards=rewards, + dones=dones, + values=values, + obs_ary=obs_ary, + actions_ary=actions_ary, + logprobs_ary=logprobs_ary, + rewards_ary=rewards_ary, + dones_ary=dones_ary, + truncateds_ary=truncateds_ary, + values_ary=values_ary, # Misc - total_updates = total_updates, - update = update, - global_step = global_step, - agent_step = agent_step, - device = device, - start_time = start_time, - eval_mode = eval_mode, + total_updates=total_updates, + update=update, + global_step=global_step, + agent_step=agent_step, + device=device, + start_time=start_time, + eval_mode=eval_mode, ) + @pufferlib.utils.profile def evaluate(data): config = data.config # TODO: Handle update on resume if data.wandb is not None and data.performance.total_uptime > 0: - data.wandb.log({ - 'agent_SPS': data.agent_SPS, - 'agent_step': data.agent_step, - 'global_step': data.global_step, - 'learning_rate': data.optimizer.param_groups[0]["lr"], - **{f'losses/{k}': v for k, v in data.losses.items()}, - **{f'performance/{k}': v - for k, v in data.performance.items()}, - **{f'{k}': v for k, v in data.stats.items()}, # comes with stats/ prefix - **{f'skillrank/{policy}': elo - for policy, elo in data.policy_pool.ranker.ratings.items()}, - }) + data.wandb.log( + { + "agent_SPS": data.agent_SPS, + "agent_step": data.agent_step, + "global_step": data.global_step, + "learning_rate": data.optimizer.param_groups[0]["lr"], + **{f"losses/{k}": v for k, v in data.losses.items()}, + **{f"performance/{k}": v for k, v in data.performance.items()}, + **{f"{k}": v for k, v in data.stats.items()}, # comes with stats/ prefix + **{ + f"skillrank/{policy}": elo + for policy, elo in data.policy_pool.ranker.ratings.items() + }, + } + ) # update_policies() changes the policy id (in kernel) - policy mapping # It's good for training but not 
wanted for replay or evaluation, so we skip it if not data.eval_mode: data.policy_pool.update_policies() - performance = defaultdict(list) + # performance = defaultdict(list) env_profiler = pufferlib.utils.Profiler() inference_profiler = pufferlib.utils.Profiler() eval_profiler = pufferlib.utils.Profiler(memory=True, pytorch_memory=True).start() @@ -285,9 +295,8 @@ def evaluate(data): with misc_profiler: i = data.policy_pool.update_scores(i, "return") # TODO: Update this for policy pool - for ii, ee in zip(i['learner'], env_id): - ii['env_id'] = ee - + for ii, ee in zip(i["learner"], env_id): + ii["env_id"] = ee with inference_profiler, torch.no_grad(): o = torch.as_tensor(o) @@ -306,7 +315,8 @@ def evaluate(data): ) actions, logprob, value, next_lstm_state = data.policy_pool.forwards( - o.to(data.device), next_lstm_state) + o.to(data.device), next_lstm_state + ) if next_lstm_state is not None: h, c = next_lstm_state @@ -315,16 +325,15 @@ def evaluate(data): value = value.flatten() - with misc_profiler: actions = actions.cpu().numpy() - + # Index alive mask with policy pool idxs... # TODO: Find a way to avoid having to do this learner_mask = torch.Tensor(mask * data.policy_pool.mask) # Ensure indices do not exceed batch size - indices = torch.where(learner_mask)[0][:config.batch_size - ptr + 1].numpy() + indices = torch.where(learner_mask)[0][: config.batch_size - ptr + 1].numpy() end = ptr + len(indices) # Batch indexing @@ -372,8 +381,8 @@ def evaluate(data): data.stats = {} # Get stats only from the learner - for k, v in infos['learner'].items(): - try: # TODO: Better checks on log data types + for k, v in infos["learner"].items(): + try: # TODO: Better checks on log data types # Skip the unnecessary info from the stats if not any(skip in k for skip in SKIP_LOG_KEYS): data.stats[k] = np.mean(v) @@ -385,11 +394,11 @@ def evaluate(data): return data.stats, infos + @pufferlib.utils.profile def train(data): if done_training(data): - raise RuntimeError( - f"Max training updates {data.total_updates} already reached") + raise RuntimeError(f"Max training updates {data.total_updates} already reached") config = data.config # assert data.num_steps % bptt_horizon == 0, "num_steps must be divisible by bptt_horizon" @@ -420,9 +429,7 @@ def train(data): nextnonterminal = 1.0 - data.dones[i_nxt] nextvalues = data.values[i_nxt] delta = ( - data.rewards[i_nxt] - + config.gamma * nextvalues * nextnonterminal - - data.values[i] + data.rewards[i_nxt] + config.gamma * nextvalues * nextnonterminal - data.values[i] ) advantages[t] = lastgaelam = ( delta + config.gamma * config.gae_lambda * nextnonterminal * lastgaelam @@ -430,14 +437,10 @@ def train(data): # Flatten the batch data.b_obs = b_obs = torch.Tensor(data.obs_ary[b_idxs]) - b_actions = torch.Tensor(data.actions_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_logprobs = torch.Tensor(data.logprobs_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_dones = torch.Tensor(data.dones_ary[b_idxs] - ).to(data.device, non_blocking=True) - b_values = torch.Tensor(data.values_ary[b_idxs] - ).to(data.device, non_blocking=True) + b_actions = torch.Tensor(data.actions_ary[b_idxs]).to(data.device, non_blocking=True) + b_logprobs = torch.Tensor(data.logprobs_ary[b_idxs]).to(data.device, non_blocking=True) + b_dones = torch.Tensor(data.dones_ary[b_idxs]).to(data.device, non_blocking=True) + b_values = torch.Tensor(data.values_ary[b_idxs]).to(data.device, non_blocking=True) b_advantages = advantages.reshape( config.batch_rows, num_minibatches, 
config.bptt_horizon ).transpose(0, 1) @@ -446,22 +449,23 @@ def train(data): # Optimizing the policy and value network train_time = time.time() pg_losses, entropy_losses, v_losses, clipfracs, old_kls, kls = [], [], [], [], [], [] - mb_obs_buffer = torch.zeros_like(b_obs[0], pin_memory=(data.device=="cuda")) + mb_obs_buffer = torch.zeros_like(b_obs[0], pin_memory=(data.device == "cuda")) for epoch in range(config.update_epochs): lstm_state = None for mb in range(num_minibatches): mb_obs_buffer.copy_(b_obs[mb], non_blocking=True) mb_obs = mb_obs_buffer.to(data.device, non_blocking=True) - #mb_obs = b_obs[mb].to(data.device, non_blocking=True) + # mb_obs = b_obs[mb].to(data.device, non_blocking=True) mb_actions = b_actions[mb].contiguous() mb_values = b_values[mb].reshape(-1) mb_advantages = b_advantages[mb].reshape(-1) mb_returns = b_returns[mb].reshape(-1) - if hasattr(data.agent, 'lstm'): + if hasattr(data.agent, "lstm"): _, newlogprob, entropy, newvalue, lstm_state = data.agent( - mb_obs, state=lstm_state, action=mb_actions) + mb_obs, state=lstm_state, action=mb_actions + ) lstm_state = (lstm_state[0].detach(), lstm_state[1].detach()) else: _, newlogprob, entropy, newvalue = data.agent( @@ -478,9 +482,7 @@ def train(data): old_kls.append(old_approx_kl.item()) approx_kl = ((ratio - 1) - logratio).mean() kls.append(approx_kl.item()) - clipfracs += [ - ((ratio - 1.0).abs() > config.clip_coef).float().mean().item() - ] + clipfracs += [((ratio - 1.0).abs() > config.clip_coef).float().mean().item()] mb_advantages = mb_advantages.reshape(-1) if config.norm_adv: @@ -554,7 +556,8 @@ def train(data): data.update += 1 if data.update % config.checkpoint_interval == 0 or done_training(data): - save_checkpoint(data) + save_checkpoint(data) + def close(data): data.pool.close() @@ -567,15 +570,17 @@ def close(data): data.wandb.run.log_artifact(artifact) data.wandb.finish() + def done_training(data): return data.update >= data.total_updates + def save_checkpoint(data): path = os.path.join(data.config.data_dir, data.exp_name) if not os.path.exists(path): os.makedirs(path) - model_name = f'model_{data.update:06d}.pt' + model_name = f"model_{data.update:06d}.pt" model_path = os.path.join(path, model_name) # Already saved @@ -593,17 +598,18 @@ def save_checkpoint(data): } if data.wandb: - state['exp_name'] = data.exp_name + state["exp_name"] = data.exp_name - state_path = os.path.join(path, 'trainer_state.pt') - torch.save(state, state_path + '.tmp') - os.rename(state_path + '.tmp', state_path) + state_path = os.path.join(path, "trainer_state.pt") + torch.save(state, state_path + ".tmp") + os.rename(state_path + ".tmp", state_path) if data.config.verbose: print(f"Model saved to {model_path}") return model_path + def seed_everything(seed, torch_deterministic): random.seed(seed) np.random.seed(seed) @@ -620,6 +626,7 @@ def seed_everything(seed, torch_deterministic): # With torch >= 2.2, check also https://pytorch.org/docs/2.2/deterministic.html # torch.utils.deterministic.fill_uninitialized_memory = torch_deterministic + def unroll_nested_dict(d): if not isinstance(d, dict): return d @@ -631,36 +638,37 @@ def unroll_nested_dict(d): else: yield k, v + def print_dashboard(stats, init_performance, performance): output = [] data = {**init_performance, **performance} # Only show these stats in the dashboard - if 'length' in stats: - data['length'] = stats['length'] + if "length" in stats: + data["length"] = stats["length"] grouped_data = defaultdict(dict) for k, v in data.items(): - if k == 'total_uptime': + if k == 
"total_uptime": v = timedelta(seconds=v) - if 'memory' in k: + if "memory" in k: v = pufferlib.utils.format_bytes(v) - elif 'time' in k: + elif "time" in k: try: v = f"{v:.2f} s" except: pass - - first_word, *rest_words = k.split('_') - rest_words = ' '.join(rest_words).title() - + + first_word, *rest_words = k.split("_") + rest_words = " ".join(rest_words).title() + grouped_data[first_word][rest_words] = v - + for main_key, sub_dict in grouped_data.items(): output.append(f"{main_key.title()}") for sub_key, sub_value in sub_dict.items(): output.append(f" {sub_key}: {sub_value}") - + print("\033c", end="") - print('\n'.join(output)) - time.sleep(1/20) + print("\n".join(output)) + time.sleep(1 / 20) diff --git a/reinforcement_learning/environment.py b/reinforcement_learning/environment.py index 5426cbc9..d1720148 100644 --- a/reinforcement_learning/environment.py +++ b/reinforcement_learning/environment.py @@ -5,7 +5,7 @@ from pettingzoo.utils.wrappers.base_parallel import BaseParallelWrapper -import nmmo +import nmmo import nmmo.core.config as nc import nmmo.core.game_api as ng @@ -13,9 +13,21 @@ def alt_combat_damage_formula(offense, defense, multiplier, minimum_proportion): return int(max(multiplier * offense - defense, offense * minimum_proportion)) -class Config(nc.Medium, nc.Terrain, nc.Resource, nc.Combat, nc.NPC, nc.Progression, - nc.Item, nc.Equipment, nc.Profession, nc.Exchange): - '''Configuration for Neural MMO.''' + +class Config( + nc.Medium, + nc.Terrain, + nc.Resource, + nc.Combat, + nc.NPC, + nc.Progression, + nc.Item, + nc.Equipment, + nc.Profession, + nc.Exchange, +): + """Configuration for Neural MMO.""" + def __init__(self, env_args: Namespace): super().__init__() @@ -25,8 +37,11 @@ def __init__(self, env_args: Namespace): self.set("PLAYER_N", env_args.num_agents) self.set("HORIZON", env_args.max_episode_length) self.set("MAP_N", env_args.num_maps) - self.set("PLAYER_DEATH_FOG", env_args.death_fog_tick if isinstance(env_args.death_fog_tick, int) else None) - self.set("PATH_MAPS", f'{env_args.maps_path}/{env_args.map_size}/') + self.set( + "PLAYER_DEATH_FOG", + env_args.death_fog_tick if isinstance(env_args.death_fog_tick, int) else None, + ) + self.set("PATH_MAPS", f"{env_args.maps_path}/{env_args.map_size}/") self.set("MAP_CENTER", env_args.map_size) self.set("NPC_N", env_args.num_npcs) self.set("TASK_EMBED_DIM", env_args.task_size) @@ -66,8 +81,9 @@ def __init__(self, env_args: Namespace): def make_env_creator(reward_wrapper_cls: BaseParallelWrapper): def env_creator(*args, **kwargs): """Create an environment.""" - env = nmmo.Env(Config(kwargs['env'])) # args.env is provided as kwargs - env = reward_wrapper_cls(env, **kwargs['reward_wrapper']) + env = nmmo.Env(Config(kwargs["env"])) # args.env is provided as kwargs + env = reward_wrapper_cls(env, **kwargs["reward_wrapper"]) env = pufferlib.emulation.PettingZooPufferEnv(env) return env + return env_creator diff --git a/reinforcement_learning/stat_wrapper.py b/reinforcement_learning/stat_wrapper.py index 0ae4a475..0a8f09fd 100644 --- a/reinforcement_learning/stat_wrapper.py +++ b/reinforcement_learning/stat_wrapper.py @@ -22,19 +22,19 @@ def __init__( self._stat_prefix = stat_prefix def observation(self, agent_id, agent_obs): - '''Called before observations are returned from the environment - Use this to define custom featurizers. Changing the space itself requires you to - define the observation space again (i.e. 
Gym.spaces.Dict(gym.spaces....))''' + """Called before observations are returned from the environment + Use this to define custom featurizers. Changing the space itself requires you to + define the observation space again (i.e. Gym.spaces.Dict(gym.spaces....))""" # NOTE: nmmo minigame obs is mapping proxy, which doesn't work with pufferlib flatten return agent_obs def action(self, agent_id, agent_atn): - '''Called before actions are passed from the model to the environment''' + """Called before actions are passed from the model to the environment""" return agent_atn def reward_terminated_truncated_info(self, agent_id, reward, terminated, truncated, info): - '''Called on reward, terminated, truncated, and info before they are returned from the environment - Use this to define custom reward shaping.''' + """Called on reward, terminated, truncated, and info before they are returned from the environment + Use this to define custom reward shaping.""" return reward, terminated, truncated, info @property @@ -42,7 +42,7 @@ def agents(self): return [] if self.env_done else self.env.agents def reset(self, **kwargs): - '''Called at the start of each episode''' + """Called at the start of each episode""" self._reset_episode_stats() obs, info = self.env.reset(**kwargs) @@ -51,7 +51,7 @@ def reset(self, **kwargs): return obs, info def step(self, action): - assert len(self.env.agents) > 0, 'No agents in the environment' # xcxc: sanity check + assert len(self.env.agents) > 0, "No agents in the environment" # xcxc: sanity check # Modify actions before they are passed to the environment for agent_id in self.env.agents: @@ -81,7 +81,7 @@ def step(self, action): if self.env_done: # To mark the end of the episode. Only one agent's done flag is enough. - infos[agent_id]['episode_done'] = True + infos[agent_id]["episode_done"] = True return obs, rewards, terms, truncs, infos @@ -91,17 +91,17 @@ def _reset_episode_stats(self): self.cum_rewards = {agent_id: 0 for agent_id in self.env.possible_agents} self._unique_events = { agent_id: { - 'experienced': set(), - 'prev_count': 0, - 'curr_count': 0, + "experienced": set(), + "prev_count": 0, + "curr_count": 0, } for agent_id in self.env.possible_agents } def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, info): - '''Update stats + info and save replays.''' + """Update stats + info and save replays.""" # Remove the task from info. Curriculum info is processed in _update_stats() - info.pop('task', None) + info.pop("task", None) # Handle early stopping if self.env_done and not terminated: @@ -111,56 +111,54 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, realm = self.env.realm tick_log = realm.event_log.get_data(agents=[agent_id], tick=-1) # get only the last tick uniq = self._unique_events[agent_id] - uniq['prev_count'] = uniq['curr_count'] - uniq['curr_count'] += count_unique_events(tick_log, uniq['experienced']) + uniq["prev_count"] = uniq["curr_count"] + uniq["curr_count"] += count_unique_events(tick_log, uniq["experienced"]) if not (terminated or truncated): self.cum_rewards[agent_id] += reward return truncated, info # The agent is terminated or truncated, so recoding the episode stats - if 'stats' not in info: - info['stats'] = {} + if "stats" not in info: + info["stats"] = {} agent = realm.players.dead_this_tick.get(agent_id, realm.players.get(agent_id)) assert agent is not None # NOTE: this may not be true when players can be resurrected. 
Check back later - info['length'] = realm.tick + info["length"] = realm.tick - info['return'] = self.cum_rewards[agent_id] + info["return"] = self.cum_rewards[agent_id] # Cause of Deaths if terminated: - info['stats']['cod/attacked'] = 1.0 if agent.damage.val > 0 else 0.0 - info['stats']['cod/starved'] = 1.0 if agent.food.val == 0 else 0.0 - info['stats']['cod/dehydrated'] = 1.0 if agent.water.val == 0 else 0.0 + info["stats"]["cod/attacked"] = 1.0 if agent.damage.val > 0 else 0.0 + info["stats"]["cod/starved"] = 1.0 if agent.food.val == 0 else 0.0 + info["stats"]["cod/dehydrated"] = 1.0 if agent.water.val == 0 else 0.0 else: - info['stats']['cod/attacked'] = 0 - info['stats']['cod/starved'] = 0 - info['stats']['cod/dehydrated'] = 0 + info["stats"]["cod/attacked"] = 0 + info["stats"]["cod/starved"] = 0 + info["stats"]["cod/dehydrated"] = 0 # Task-related stats task = self.env.agent_task_map[agent_id][0] # consider only the first task - info['stats']['task/completed'] = 1.0 if task.completed else 0.0 - info['stats']['task/pcnt_2_reward_signal'] = 1.0 if task.reward_signal_count >= 2 else 0.0 - info['stats']['task/pcnt_0p2_max_progress'] = 1.0 if task._max_progress >= 0.2 else 0.0 - info['curriculum'] = { - task.spec_name: (task._max_progress, task.reward_signal_count) - } + info["stats"]["task/completed"] = 1.0 if task.completed else 0.0 + info["stats"]["task/pcnt_2_reward_signal"] = 1.0 if task.reward_signal_count >= 2 else 0.0 + info["stats"]["task/pcnt_0p2_max_progress"] = 1.0 if task._max_progress >= 0.2 else 0.0 + info["curriculum"] = {task.spec_name: (task._max_progress, task.reward_signal_count)} if self.eval_mode: # 'return' is used for ranking in the eval mode, so put the task progress here - info['return'] = task._max_progress # this is 1 if done + info["return"] = task._max_progress # this is 1 if done # Max combat/harvest level achieved - info['stats']['achieved/max_combat_level'] = agent.attack_level - info['stats']['achieved/max_harvest_skill_ammo'] = max( + info["stats"]["achieved/max_combat_level"] = agent.attack_level + info["stats"]["achieved/max_harvest_skill_ammo"] = max( agent.prospecting_level.val, agent.carving_level.val, agent.alchemy_level.val, ) - info['stats']['achieved/max_harvest_skill_consum'] = max( + info["stats"]["achieved/max_harvest_skill_consum"] = max( agent.fishing_level.val, agent.herbalism_level.val, ) @@ -168,7 +166,7 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, # Event-based stats achieved, performed, _ = process_event_log(realm, [agent_id]) for key, val in list(achieved.items()) + list(performed.items()): - info['stats'][key] = float(val) + info["stats"][key] = float(val) if self._stat_prefix: info = {self._stat_prefix: info} @@ -180,33 +178,32 @@ def _process_stats_and_early_stop(self, agent_id, reward, terminated, truncated, # Event processing utilities for Neural MMO. 
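# --- Editor's aside (illustrative sketch, not part of the patch) -----------------------
# The info["curriculum"] entry recorded above, {spec_name: (max_progress, reward_signal_count)},
# is the kind of signal that LearnableTaskSampler.update() in
# curriculum_generation/task_sampler.py consumes (keyed as "curriculum/<spec_name>") to
# estimate per-task completion rates. A toy version of that bookkeeping, using a made-up
# spec name and a hand-written batch of (progress, reward_count) tuples:
import numpy as np

episode_infos = {"curriculum/Task_HypotheticalSpec": [[(1.0, 3), (0.1, 1), (0.4, 2)]]}
for key, batches in episode_infos.items():
    completed = [float(prog >= 1) for sublist in batches for prog, _ in sublist]
    rewarded_twice = [float(rcnt >= 2) for sublist in batches for _, rcnt in sublist]
    print(key.replace("curriculum/", ""), np.mean(completed), np.mean(rewarded_twice))
# ----------------------------------------------------------------------------------------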
INFO_KEY_TO_EVENT_CODE = { - 'event/' + evt.lower(): val - for evt, val in EventCode.__dict__.items() - if isinstance(val, int) + "event/" + evt.lower(): val for evt, val in EventCode.__dict__.items() if isinstance(val, int) } # convert the numbers into binary (performed or not) for the key events KEY_EVENT = [ - 'eat_food', - 'drink_water', - 'score_hit', - 'player_kill', - 'consume_item', - 'harvest_item', - 'list_item', - 'buy_item', + "eat_food", + "drink_water", + "score_hit", + "player_kill", + "consume_item", + "harvest_item", + "list_item", + "buy_item", ] ITEM_TYPE = { - 'armor': [item.ITEM_TYPE_ID for item in Item.ARMOR], - 'weapon': [item.ITEM_TYPE_ID for item in Item.WEAPON], - 'tool': [item.ITEM_TYPE_ID for item in Item.TOOL], - 'ammo': [item.ITEM_TYPE_ID for item in Item.AMMUNITION], - 'consumable': [item.ITEM_TYPE_ID for item in Item.CONSUMABLE], + "armor": [item.ITEM_TYPE_ID for item in Item.ARMOR], + "weapon": [item.ITEM_TYPE_ID for item in Item.WEAPON], + "tool": [item.ITEM_TYPE_ID for item in Item.TOOL], + "ammo": [item.ITEM_TYPE_ID for item in Item.AMMUNITION], + "consumable": [item.ITEM_TYPE_ID for item in Item.CONSUMABLE], } + def process_event_log(realm, agent_list): - '''Process the event log and extract performed actions and achievements.''' + """Process the event log and extract performed actions and achievements.""" log = realm.event_log.get_data(agents=agent_list) attr_to_col = realm.event_log.attr_to_col @@ -214,60 +211,68 @@ def process_event_log(realm, agent_list): event_cnt = {} for key, code in INFO_KEY_TO_EVENT_CODE.items(): # count the freq of each event - event_cnt[key] = int(sum(log[:, attr_to_col['event']] == code)) + event_cnt[key] = int(sum(log[:, attr_to_col["event"]] == code)) # record true or false for each event performed = {} for evt in KEY_EVENT: - key = 'event/' + evt + key = "event/" + evt performed[key] = event_cnt[key] > 0 # check if tools, weapons, ammos, ammos were equipped for item_type, item_ids in ITEM_TYPE.items(): - if item_type == 'consumable': + if item_type == "consumable": continue - key = 'event/equip_' + item_type - idx = (log[:, attr_to_col['event']] == EventCode.EQUIP_ITEM) & \ - np.in1d(log[:, attr_to_col['item_type']], item_ids) + key = "event/equip_" + item_type + idx = (log[:, attr_to_col["event"]] == EventCode.EQUIP_ITEM) & np.in1d( + log[:, attr_to_col["item_type"]], item_ids + ) performed[key] = sum(idx) > 0 # check if weapon was harvested - key = 'event/harvest_weapon' - idx = (log[:, attr_to_col['event']] == EventCode.HARVEST_ITEM) & \ - np.in1d(log[:, attr_to_col['item_type']], ITEM_TYPE['weapon']) + key = "event/harvest_weapon" + idx = (log[:, attr_to_col["event"]] == EventCode.HARVEST_ITEM) & np.in1d( + log[:, attr_to_col["item_type"]], ITEM_TYPE["weapon"] + ) performed[key] = sum(idx) > 0 # record important achievements achieved = {} # get progress to center - idx = log[:, attr_to_col['event']] == EventCode.GO_FARTHEST - achieved['achieved/max_progress_to_center'] = \ - int(max(log[idx, attr_to_col['distance']])) if sum(idx) > 0 else 0 + idx = log[:, attr_to_col["event"]] == EventCode.GO_FARTHEST + achieved["achieved/max_progress_to_center"] = ( + int(max(log[idx, attr_to_col["distance"]])) if sum(idx) > 0 else 0 + ) # get earned gold - idx = log[:, attr_to_col['event']] == EventCode.EARN_GOLD - achieved['achieved/earned_gold'] = int(sum(log[idx, attr_to_col['gold']])) + idx = log[:, attr_to_col["event"]] == EventCode.EARN_GOLD + achieved["achieved/earned_gold"] = int(sum(log[idx, attr_to_col["gold"]])) # get max 
damage - idx = log[:, attr_to_col['event']] == EventCode.SCORE_HIT - achieved['achieved/max_damage'] = int(max(log[idx, attr_to_col['damage']])) if sum(idx) > 0 else 0 + idx = log[:, attr_to_col["event"]] == EventCode.SCORE_HIT + achieved["achieved/max_damage"] = ( + int(max(log[idx, attr_to_col["damage"]])) if sum(idx) > 0 else 0 + ) # get max possessed item levels: from harvesting, looting, buying - idx = np.in1d(log[:, attr_to_col['event']], - [EventCode.HARVEST_ITEM, EventCode.LOOT_ITEM, EventCode.BUY_ITEM]) + idx = np.in1d( + log[:, attr_to_col["event"]], + [EventCode.HARVEST_ITEM, EventCode.LOOT_ITEM, EventCode.BUY_ITEM], + ) if sum(idx) > 0: for item_type, item_ids in ITEM_TYPE.items(): - idx_item = np.in1d(log[idx, attr_to_col['item_type']], item_ids) + idx_item = np.in1d(log[idx, attr_to_col["item_type"]], item_ids) if sum(idx_item) > 0: - achieved['achieved/max_' + item_type + '_level'] = \ - int(max(log[idx][idx_item, attr_to_col['level']])) + achieved["achieved/max_" + item_type + "_level"] = int( + max(log[idx][idx_item, attr_to_col["level"]]) + ) # other notable achievements - idx = (log[:, attr_to_col['event']] == EventCode.PLAYER_KILL) - achieved['achieved/agent_kill_count'] = int(sum(idx & (log[:, attr_to_col['target_ent']] > 0))) - achieved['achieved/npc_kill_count'] = int(sum(idx & (log[:, attr_to_col['target_ent']] < 0))) - achieved['achieved/unique_events'] = count_unique_events(log, set()) + idx = log[:, attr_to_col["event"]] == EventCode.PLAYER_KILL + achieved["achieved/agent_kill_count"] = int(sum(idx & (log[:, attr_to_col["target_ent"]] > 0))) + achieved["achieved/npc_kill_count"] = int(sum(idx & (log[:, attr_to_col["target_ent"]] < 0))) + achieved["achieved/unique_events"] = count_unique_events(log, set()) return achieved, performed, event_cnt @@ -275,8 +280,8 @@ def process_event_log(realm, agent_list): # These events are important, so count them even though they are not unique EVERY_EVENT_TO_COUNT = set([EventCode.PLAYER_KILL, EventCode.EARN_GOLD]) -def count_unique_events(tick_log, experienced, - every_event_to_count=EVERY_EVENT_TO_COUNT): + +def count_unique_events(tick_log, experienced, every_event_to_count=EVERY_EVENT_TO_COUNT): cnt_unique = 0 if len(tick_log) == 0: return cnt_unique diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index eb79acb6..00000000 --- a/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -accelerate==0.27.2 -gymnasium==0.29.1 -#nmmo # https://github.com/kywch/nmmo-environment -polars -pufferlib==0.7.3 -psutil -torch==2.1.0 -transformers==4.37.2 -wandb diff --git a/tests/test_task_encoder.py b/tests/test_task_encoder.py index 7ef20f5c..97bacd0a 100644 --- a/tests/test_task_encoder.py +++ b/tests/test_task_encoder.py @@ -10,50 +10,48 @@ # NOTE: different LLMs will give different embedding dimensions EMBEDDING_DIM = 4096 + class TestTaskEncoder(unittest.TestCase): - # pylint: disable=protected-access,bad-builtin - @classmethod - def setUpClass(cls): - cls.task_encoder = TaskEncoder( - LLM_CHECKPOINT, curriculum_generation.manual_curriculum, batch_size=4 - ) - - @classmethod - def tearDownClass(cls): - cls.task_encoder.close() - - def test_embed_dim(self): - self.assertEqual(self.task_encoder.embed_dim, EMBEDDING_DIM) - - def test_task_encoder_api(self): - task_spec_with_embedding = self.task_encoder.get_task_embedding( - curriculum_generation.manual_curriculum.curriculum, - save_to_file=CURRICULUM_FILE_PATH - ) - - for single_spec in task_spec_with_embedding: - self.assertFalse(sum(single_spec.embedding) == 0) - - 
def test_get_task_deps_src(self): - custom_fn = curriculum_generation.manual_curriculum.PracticeInventoryManagement - fn_src, deps_src = self.task_encoder._get_task_deps_src(custom_fn) - - self.assertEqual( - fn_src, - "def PracticeInventoryManagement(gs, subject, space, num_tick):\n " - + - "return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)\n", - ) - self.assertTrue("def InventorySpaceGE(" in deps_src) - self.assertTrue("def TickGE(" in deps_src) - - def test_contruct_prompt(self): - single_spec = random.choice(curriculum_generation.manual_curriculum.curriculum) - prompt = self.task_encoder._construct_prompt( - single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs - ) - print(prompt) + @classmethod + def setUpClass(cls): + cls.task_encoder = TaskEncoder( + LLM_CHECKPOINT, curriculum_generation.manual_curriculum, batch_size=4 + ) + + @classmethod + def tearDownClass(cls): + cls.task_encoder.close() + + def test_embed_dim(self): + self.assertEqual(self.task_encoder.embed_dim, EMBEDDING_DIM) + + def test_task_encoder_api(self): + task_spec_with_embedding = self.task_encoder.get_task_embedding( + curriculum_generation.manual_curriculum.curriculum, save_to_file=CURRICULUM_FILE_PATH + ) + + for single_spec in task_spec_with_embedding: + self.assertFalse(sum(single_spec.embedding) == 0) + + def test_get_task_deps_src(self): + custom_fn = curriculum_generation.manual_curriculum.PracticeInventoryManagement + fn_src, deps_src = self.task_encoder._get_task_deps_src(custom_fn) + + self.assertEqual( + fn_src, + "def PracticeInventoryManagement(gs, subject, space, num_tick):\n " + + "return InventorySpaceGE(gs, subject, space) * TickGE(gs, subject, num_tick)\n", + ) + self.assertTrue("def InventorySpaceGE(" in deps_src) + self.assertTrue("def TickGE(" in deps_src) + + def test_contruct_prompt(self): + single_spec = random.choice(curriculum_generation.manual_curriculum.curriculum) + prompt = self.task_encoder._construct_prompt( + single_spec.reward_to, single_spec.eval_fn, single_spec.eval_fn_kwargs + ) + print(prompt) if __name__ == "__main__": - unittest.main() + unittest.main() diff --git a/train.py b/train.py index 7ffe135c..f697d934 100644 --- a/train.py +++ b/train.py @@ -1,4 +1,4 @@ -from pdb import set_trace as T +# from pdb import set_trace as T import importlib import argparse import inspect @@ -19,12 +19,12 @@ def load_from_config(agent, debug=False): - with open('config.yaml') as f: + with open("config.yaml") as f: config = yaml.safe_load(f) - default_keys = 'agent_zoo env train policy recurrent sweep_metadata sweep_metric sweep wandb reward_wrapper'.split() + default_keys = "agent_zoo env train policy recurrent sweep_metadata sweep_metric sweep wandb reward_wrapper".split() defaults = {key: config.get(key, {}) for key in default_keys} - debug_config = config.get('debug', {}) if debug else {} + debug_config = config.get("debug", {}) if debug else {} agent_config = config[agent] combined_config = {} @@ -35,15 +35,16 @@ def load_from_config(agent, debug=False): return pufferlib.namespace(**combined_config) + def get_init_args(fn): if fn is None: return {} sig = inspect.signature(fn) args = {} for name, param in sig.parameters.items(): - if name in ('self', 'env', 'policy'): + if name in ("self", "env", "policy"): continue - if name in ('agent_id', 'is_multiagent'): # Postprocessor args + if name in ("agent_id", "is_multiagent"): # Postprocessor args continue if param.kind == inspect.Parameter.VAR_POSITIONAL: continue @@ -53,12 +54,13 @@ def 
get_init_args(fn): args[name] = param.default if param.default is not inspect.Parameter.empty else None return args + # Return env_creator, agent_creator def setup_agent(module_name): try: - agent_module = importlib.import_module(f'agent_zoo.{module_name}') + agent_module = importlib.import_module(f"agent_zoo.{module_name}") except ModuleNotFoundError: - raise ValueError(f'Agent module {module_name} not found under the agent_zoo directory.') + raise ValueError(f"Agent module {module_name} not found under the agent_zoo directory.") env_creator = environment.make_env_creator(reward_wrapper_cls=agent_module.RewardWrapper) @@ -72,31 +74,32 @@ def agent_creator(env, args): return policy.to(args.train.device) init_args = { - 'policy': get_init_args(agent_module.Policy.__init__), - 'recurrent': get_init_args(agent_module.Recurrent.__init__), - 'reward_wrapper': get_init_args(agent_module.RewardWrapper.__init__), + "policy": get_init_args(agent_module.Policy.__init__), + "recurrent": get_init_args(agent_module.Recurrent.__init__), + "reward_wrapper": get_init_args(agent_module.RewardWrapper.__init__), } return agent_module, env_creator, agent_creator, init_args + def combine_config_args(parser, args, config): clean_parser = argparse.ArgumentParser(parents=[parser]) for name, sub_config in config.items(): args[name] = {} for key, value in sub_config.items(): - data_key = f'{name}.{key}' - cli_key = f'--{data_key}'.replace('_', '-') + data_key = f"{name}.{key}" + cli_key = f"--{data_key}".replace("_", "-") if isinstance(value, bool) and value is False: - parser.add_argument(cli_key, default=value, action='store_true') - clean_parser.add_argument(cli_key, default=value, action='store_true') + parser.add_argument(cli_key, default=value, action="store_true") + clean_parser.add_argument(cli_key, default=value, action="store_true") elif isinstance(value, bool) and value is True: - data_key = f'{name}.no_{key}' - cli_key = f'--{data_key}'.replace('_', '-') - parser.add_argument(cli_key, default=value, action='store_false') - clean_parser.add_argument(cli_key, default=value, action='store_false') + data_key = f"{name}.no_{key}" + cli_key = f"--{data_key}".replace("_", "-") + parser.add_argument(cli_key, default=value, action="store_false") + clean_parser.add_argument(cli_key, default=value, action="store_false") else: parser.add_argument(cli_key, default=value, type=type(value)) - clean_parser.add_argument(cli_key, default=value, metavar='', type=type(value)) + clean_parser.add_argument(cli_key, default=value, metavar="", type=type(value)) args[name][key] = getattr(parser.parse_known_args()[0], data_key) args[name] = pufferlib.namespace(**args[name]) @@ -104,6 +107,7 @@ def combine_config_args(parser, args, config): clean_parser.parse_args(sys.argv[1:]) return args + def update_args(args, mode=None): args = pufferlib.namespace(**args) @@ -111,14 +115,14 @@ def update_args(args, mode=None): args.env.curriculum_file_path = args.curriculum vec = args.vectorization - if vec == 'serial' or args.debug: + if vec == "serial" or args.debug: args.vectorization = pufferlib.vectorization.Serial - elif vec == 'multiprocessing': + elif vec == "multiprocessing": args.vectorization = pufferlib.vectorization.Multiprocessing - elif vec == 'ray': + elif vec == "ray": args.vectorization = pufferlib.vectorization.Ray else: - raise ValueError('Invalid --vectorization (serial/multiprocessing/ray).') + raise ValueError("Invalid --vectorization (serial/multiprocessing/ray).") # TODO: load the trained baseline from wandb # elif 
args.baseline: @@ -135,8 +139,8 @@ def update_args(args, mode=None): # model_file = max(os.listdir(data_dir)) # args.eval_model_path = os.path.join(data_dir, model_file) - if mode in ['evaluate', 'replay']: - assert args.eval_model_path is not None, 'Eval mode requires a path to checkpoints' + if mode in ["evaluate", "replay"]: + assert args.eval_model_path is not None, "Eval mode requires a path to checkpoints" args.track = False # Disable env pool - see the comment about next_lstm_state in clean_pufferl.evaluate() args.train.env_pool = False @@ -144,63 +148,86 @@ def update_args(args, mode=None): args.reward_wrapper.eval_mode = True args.reward_wrapper.early_stop_agent_num = 0 - if mode == 'replay': + if mode == "replay": args.train.num_envs = args.train.envs_per_worker = args.train.envs_per_batch = 1 args.vectorization = pufferlib.vectorization.Serial return args -if __name__ == '__main__': + +if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Parse environment argument', add_help=False) - parser.add_argument('-m', '--mode', type=str, default='train', choices='train sweep replay'.split()) - parser.add_argument('-a', '--agent', type=str, default='neurips23_start_kit', help='Agent module to use') - parser.add_argument('-n', '--exp-name', type=str, default=None, help="Need exp name to resume the experiment") - parser.add_argument('-p', '--eval-model-path', type=str, default=None, help='Path to model to evaluate') - parser.add_argument('-c', '--curriculum', type=str, default=BASELINE_CURRICULUM, help='Path to curriculum file') - parser.add_argument('-t', '--task-to-assign', type=int, default=None, - help='The index of the task to assign in the curriculum file') - #parser.add_argument('--baseline', action='store_true', help='Baseline run') - parser.add_argument('--vectorization', type=str, default='multiprocessing', choices='serial multiprocessing ray'.split()) - parser.add_argument('--no-recurrence', action='store_true', help='Do not use recurrence') + parser = argparse.ArgumentParser(description="Parse environment argument", add_help=False) + parser.add_argument( + "-m", "--mode", type=str, default="train", choices="train sweep replay".split() + ) + parser.add_argument( + "-a", "--agent", type=str, default="neurips23_start_kit", help="Agent module to use" + ) + parser.add_argument( + "-n", "--exp-name", type=str, default=None, help="Need exp name to resume the experiment" + ) + parser.add_argument( + "-p", "--eval-model-path", type=str, default=None, help="Path to model to evaluate" + ) + parser.add_argument( + "-c", "--curriculum", type=str, default=BASELINE_CURRICULUM, help="Path to curriculum file" + ) + parser.add_argument( + "-t", + "--task-to-assign", + type=int, + default=None, + help="The index of the task to assign in the curriculum file", + ) + # parser.add_argument('--baseline', action='store_true', help='Baseline run') + parser.add_argument( + "--vectorization", + type=str, + default="multiprocessing", + choices="serial multiprocessing ray".split(), + ) + parser.add_argument("--no-recurrence", action="store_true", help="Do not use recurrence") if DEBUG: - parser.add_argument('--no-track', default=True, help='Do NOT track on WandB') - parser.add_argument('--debug', default=True, help='Debug mode') + parser.add_argument("--no-track", default=True, help="Do NOT track on WandB") + parser.add_argument("--debug", default=True, help="Debug mode") else: - parser.add_argument('--no-track', action='store_true', help='Do NOT 
track on WandB') - parser.add_argument('--debug', action='store_true', help='Debug mode') + parser.add_argument("--no-track", action="store_true", help="Do NOT track on WandB") + parser.add_argument("--debug", action="store_true", help="Debug mode") args = parser.parse_known_args()[0].__dict__ - config = load_from_config(args['agent'], debug=args.get('debug', False)) - agent_module, env_creator, agent_creator, init_args = setup_agent(args['agent']) + config = load_from_config(args["agent"], debug=args.get("debug", False)) + agent_module, env_creator, agent_creator, init_args = setup_agent(args["agent"]) # Update config with environment defaults - config.policy = {**init_args['policy'], **config.policy} - config.recurrent = {**init_args['recurrent'], **config.recurrent} - config.reward_wrapper = {**init_args['reward_wrapper'], **config.reward_wrapper} + config.policy = {**init_args["policy"], **config.policy} + config.recurrent = {**init_args["recurrent"], **config.recurrent} + config.reward_wrapper = {**init_args["reward_wrapper"], **config.reward_wrapper} # Generate argparse menu from config args = combine_config_args(parser, args, config) # Perform mode-specific updates - args = update_args(args, mode=args['mode']) + args = update_args(args, mode=args["mode"]) if args.train.env_pool is True: - logging.warning('Env_pool is enabled. This may increase training speed but break determinism.') + logging.warning( + "Env_pool is enabled. This may increase training speed but break determinism." + ) if args.track: args.exp_name = init_wandb(args).id else: args.exp_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" - if args.mode == 'train': + if args.mode == "train": train(args, env_creator, agent_creator) exit(0) - elif args.mode == 'sweep': + elif args.mode == "sweep": sweep(args, env_creator, agent_creator) exit(0) - elif args.mode == 'replay': + elif args.mode == "replay": generate_replay(args, env_creator, agent_creator) exit(0) else: - raise ValueError('Mode must be one of train, sweep, or evaluate') + raise ValueError("Mode must be one of train, sweep, or evaluate") diff --git a/train_helper.py b/train_helper.py index 813defc6..20ce4530 100644 --- a/train_helper.py +++ b/train_helper.py @@ -5,14 +5,9 @@ import dill import wandb -# Related to torch.use_deterministic_algorithms(True) -# See also https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility -os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' - import torch import numpy as np -import pufferlib import pufferlib.policy_pool as pp from nmmo.render.replay_helper import FileReplayHelper @@ -20,40 +15,45 @@ from reinforcement_learning import clean_pufferl +# Related to torch.use_deterministic_algorithms(True) +# See also https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + def init_wandb(args, resume=True): if args.no_track: return None - assert args.wandb.project is not None, 'Please set the wandb project in config.yaml' - assert args.wandb.entity is not None, 'Please set the wandb entity in config.yaml' + assert args.wandb.project is not None, "Please set the wandb project in config.yaml" + assert args.wandb.entity is not None, "Please set the wandb entity in config.yaml" wandb_kwargs = { - 'id': args.exp_name or wandb.util.generate_id(), - 'project': args.wandb.project, - 'entity': args.wandb.entity, - 'config': { - 'cleanrl': args.train, - 'env': args.env, - 'agent_zoo': args.agent, - 'policy': args.policy, - 'recurrent': args.recurrent, - 
'reward_wrapper': args.reward_wrapper, + "id": args.exp_name or wandb.util.generate_id(), + "project": args.wandb.project, + "entity": args.wandb.entity, + "config": { + "cleanrl": args.train, + "env": args.env, + "agent_zoo": args.agent, + "policy": args.policy, + "recurrent": args.recurrent, + "reward_wrapper": args.reward_wrapper, }, - 'name': args.exp_name, - 'monitor_gym': True, - 'save_code': True, - 'resume': resume, + "name": args.exp_name, + "monitor_gym": True, + "save_code": True, + "resume": resume, } if args.wandb.group is not None: - wandb_kwargs['group'] = args.wandb.group + wandb_kwargs["group"] = args.wandb.group return wandb.init(**wandb_kwargs) + def train(args, env_creator, agent_creator): data = clean_pufferl.create( config=args.train, agent_creator=agent_creator, - agent_kwargs={'args': args}, + agent_kwargs={"args": args}, env_creator=env_creator, - env_creator_kwargs={'env': args.env, 'reward_wrapper': args.reward_wrapper}, + env_creator_kwargs={"env": args.env, "reward_wrapper": args.reward_wrapper}, vectorization=args.vectorization, exp_name=args.exp_name, track=args.track, @@ -63,9 +63,10 @@ def train(args, env_creator, agent_creator): clean_pufferl.evaluate(data) clean_pufferl.train(data) - print('Done training. Saving data...') + print("Done training. Saving data...") clean_pufferl.close(data) - print('Run complete.') + print("Run complete.") + def sweep(args, env_creator, agent_creator): sweep_id = wandb.sweep(sweep=args.sweep, project=args.wandb.project) @@ -73,32 +74,32 @@ def sweep(args, env_creator, agent_creator): def main(): try: args.exp_name = init_wandb(args).id - if hasattr(wandb.config, 'train'): + if hasattr(wandb.config, "train"): # TODO: Add update method to namespace print(args.train.__dict__) print(wandb.config.train) args.train.__dict__.update(dict(wandb.config.train)) train(args, env_creator, agent_creator) - except Exception as e: + except Exception as e: # noqa: F841 import traceback + traceback.print_exc() wandb.agent(sweep_id, main, count=20) -def generate_replay(args, env_creator, agent_creator, - stop_when_all_complete_task=True, - seed=None): - assert args.eval_model_path is not None, 'eval_model_path must be set for replay generation' + +def generate_replay(args, env_creator, agent_creator, stop_when_all_complete_task=True, seed=None): + assert args.eval_model_path is not None, "eval_model_path must be set for replay generation" policies = pp.get_policy_names(args.eval_model_path) - assert len(policies) > 0, 'No policies found in eval_model_path' - logging.info(f'Policies to generate replay: {policies}') + assert len(policies) > 0, "No policies found in eval_model_path" + logging.info(f"Policies to generate replay: {policies}") save_dir = args.eval_model_path - logging.info('Replays will be saved to %s', save_dir) + logging.info("Replays will be saved to %s", save_dir) if seed is not None: args.train.seed = seed - logging.info('Seed: %d', args.train.seed) + logging.info("Seed: %d", args.train.seed) # Set the train config for replay args.train.num_envs = 1 @@ -110,15 +111,16 @@ def generate_replay(args, env_creator, agent_creator, args.reward_wrapper.early_stop_agent_num = 0 # Use the policy pool helper functions to create kernel (policy-agent mapping) - args.train.pool_kernel = pp.create_kernel(args.env.num_agents, len(policies), - shuffle_with_seed=args.train.seed) + args.train.pool_kernel = pp.create_kernel( + args.env.num_agents, len(policies), shuffle_with_seed=args.train.seed + ) data = clean_pufferl.create( config=args.train, 
agent_creator=agent_creator, - agent_kwargs={'args': args}, + agent_kwargs={"args": args}, env_creator=env_creator, - env_creator_kwargs={'env': args.env, 'reward_wrapper': args.reward_wrapper}, + env_creator_kwargs={"env": args.env, "reward_wrapper": args.reward_wrapper}, eval_mode=True, eval_model_path=args.eval_model_path, policy_selector=pp.AllPolicySelector(args.train.seed), @@ -131,25 +133,26 @@ def generate_replay(args, env_creator, agent_creator, nmmo_env.realm.record_replay(replay_helper) # Sanity checks for replay generation - assert len(policies) == len(data.policy_pool.current_policies), 'Policy count mismatch' - assert len(data.policy_pool.kernel) == nmmo_env.max_num_agents, 'Agent count mismatch' + assert len(policies) == len(data.policy_pool.current_policies), "Policy count mismatch" + assert len(data.policy_pool.kernel) == nmmo_env.max_num_agents, "Agent count mismatch" # Add the policy names to agent names if len(policies) > 1: for policy_id, samp in data.policy_pool.sample_idxs.items(): - policy_name = data.policy_pool.current_policies[policy_id]['name'] + policy_name = data.policy_pool.current_policies[policy_id]["name"] for idx in samp: agent_id = idx + 1 # agents are 0-indexed in policy_pool, but 1-indexed in nmmo - nmmo_env.realm.players[agent_id].name = f'{policy_name}_{agent_id}' + nmmo_env.realm.players[agent_id].name = f"{policy_name}_{agent_id}" # Assign the specified task to the agents, if provided if args.task_to_assign is not None: - with open(args.curriculum, 'rb') as f: - task_with_embedding = dill.load(f) # a list of TaskSpec - assert 0 <= args.task_to_assign < len(task_with_embedding), 'Task index out of range' + with open(args.curriculum, "rb") as f: + task_with_embedding = dill.load(f) # a list of TaskSpec + assert 0 <= args.task_to_assign < len(task_with_embedding), "Task index out of range" select_task = task_with_embedding[args.task_to_assign] - tasks = make_task_from_spec(nmmo_env.possible_agents, - [select_task] * len(nmmo_env.possible_agents)) + tasks = make_task_from_spec( + nmmo_env.possible_agents, [select_task] * len(nmmo_env.possible_agents) + ) # Reassign the task to the agents nmmo_env.tasks = tasks @@ -159,7 +162,7 @@ def generate_replay(args, env_creator, agent_creator, task_embedding = nmmo_env.agent_task_map[agent_id][0].embedding nmmo_env.obs[agent_id].gym_obs.reset(task_embedding) - print(f'All agents are assigned: {nmmo_env.tasks[0].spec_name}\n') + print(f"All agents are assigned: {nmmo_env.tasks[0].spec_name}\n") # Generate the replay replay_helper.reset() @@ -178,7 +181,8 @@ def generate_replay(args, env_creator, agent_creator, ) actions, logprob, value, next_lstm_state = data.policy_pool.forwards( - o.to(data.device), next_lstm_state) + o.to(data.device), next_lstm_state + ) if next_lstm_state is not None: h, c = next_lstm_state @@ -192,35 +196,39 @@ def generate_replay(args, env_creator, agent_creator, num_alive = len(nmmo_env.agents) task_done = sum(1 for task in nmmo_env.tasks if task.completed) - alive_done = sum(1 for task in nmmo_env.tasks - if task.completed and task.assignee[0] in nmmo_env.realm.players) - print('Tick:', nmmo_env.realm.tick, ', alive agents:', num_alive, ', task done:', task_done) + alive_done = sum( + 1 + for task in nmmo_env.tasks + if task.completed and task.assignee[0] in nmmo_env.realm.players + ) + print("Tick:", nmmo_env.realm.tick, ", alive agents:", num_alive, ", task done:", task_done) if num_alive == alive_done: - print('All alive agents completed the task.') + print("All alive agents completed 
the task.") break if num_alive == 0 or nmmo_env.realm.tick == args.env.max_episode_length: - print('All agents died or reached the max episode length.') + print("All agents died or reached the max episode length.") break # Count how many agents completed the task - print('--------------------------------------------------') - print('Task:', nmmo_env.tasks[0].spec_name) + print("--------------------------------------------------") + print("Task:", nmmo_env.tasks[0].spec_name) num_completed = sum(1 for task in nmmo_env.tasks if task.completed) - print('Number of agents completed the task:', num_completed) - avg_progress = np.mean([task.progress_info['max_progress'] for task in nmmo_env.tasks]) - print(f'Average maximum progress (max=1): {avg_progress:.3f}') - avg_completed_tick = 0 + print("Number of agents completed the task:", num_completed) + avg_progress = np.mean([task.progress_info["max_progress"] for task in nmmo_env.tasks]) + print(f"Average maximum progress (max=1): {avg_progress:.3f}") + avg_completed_tick = 0 if num_completed > 0: - avg_completed_tick = np.mean([task.progress_info['completed_tick'] - for task in nmmo_env.tasks if task.completed]) - print(f'Average completed tick: {avg_completed_tick:.1f}') + avg_completed_tick = np.mean( + [task.progress_info["completed_tick"] for task in nmmo_env.tasks if task.completed] + ) + print(f"Average completed tick: {avg_completed_tick:.1f}") # Save the replay file - replay_file = f'replay_seed_{args.train.seed}_' + replay_file = f"replay_seed_{args.train.seed}_" if args.task_to_assign is not None: - replay_file += f'task_{args.task_to_assign}_' - replay_file = os.path.join(save_dir, replay_file + time.strftime('%Y%m%d_%H%M%S')) - print(f'Saving replay to {replay_file}') + replay_file += f"task_{args.task_to_assign}_" + replay_file = os.path.join(save_dir, replay_file + time.strftime("%Y%m%d_%H%M%S")) + print(f"Saving replay to {replay_file}") replay_helper.save(replay_file, compress=True) clean_pufferl.close(data)