# driver.py
from __future__ import annotations
import os
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import torch.nn as nn
from torch import optim
from tqdm import tqdm
import wandb
import pygame  # used directly below for event handling, screenshots and delays
import gymnasium as gym
from a2c import *
from finder_gym import *
from alg_parameters import *
# environment hyperparams
n_env = 1
n_updates = 1000
n_steps_per_update = 2080
randomize_domain = False
# agent hyperparams
gamma = 0.999
lam = 0.95 # hyperparameter for GAE
ent_coef = 0.01 # coefficient for the entropy bonus (to encourage exploration)
actor_lr = 0.001
critic_lr = 0.005
# Note: the actor has a slower learning rate so that the value targets become
# more stationary and are therefore easier for the critic to estimate
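# Illustrative sketch (defined but never called by this driver) of how gamma, lam
# and the per-step masks are typically combined in Generalized Advantage Estimation.
# The loss computation actually used below lives in A2C.get_losses (a2c.py); this
# standard GAE recursion is only an assumption about its general shape.
def gae_advantages_sketch(rewards, value_preds, masks, gamma, lam):
    # rewards, value_preds, masks: tensors of shape [T, n_env]
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(rewards[0])
    for t in reversed(range(T - 1)):
        # TD error; masks[t] == 0 at termination, which cuts off the bootstrap
        delta = rewards[t] + gamma * masks[t] * value_preds[t + 1] - value_preds[t]
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
    return advantages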
# #TODO: GENERATE WORLD
# # prob = np.random.triangular(self.PROB[0], .33 * self.PROB[0] + .66 * self.PROB[1],
# # self.PROB[1]) # sample a value from triangular distribution
# # size = np.random.choice([self.SIZE[0], self.SIZE[0] * .5 + self.SIZE[1] * .5, self.SIZE[1]],
# # p=[.5, .25, .25]) # sample a value according to the given probability
# prob = 0.4
# size = 20
# # prob = self.PROB
# # size = self.SIZE # fixed world0 size and obstacle density for evaluation
# # here is the map without any agents or goals
# world = -(np.random.rand(int(size), int(size)) < prob).astype(int) # -1 obstacle,0 nothing, >0 agent id
# #for PRIMAL1 map
# # world = random_generator(SIZE_O=self.SIZE, PROB_O=self.PROB)
# world = padding_world(world)
# #add starting node
# for index, x in np.ndenumerate(world):
# if x == 0:
# start_pos = index
# world[index[0]][index[1]] = 1
# break
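# Illustrative, self-contained version of the commented-out world generation above,
# with a fixed obstacle density and size. padding_world (from finder_gym) is omitted,
# so this is only a sketch of the sampling logic, not what the environment itself does.
def generate_world_sketch(size=20, prob=0.4, rng=None):
    rng = rng if rng is not None else np.random.default_rng()
    # -1 obstacle, 0 free cell, 1 starting node
    world = -(rng.random((size, size)) < prob).astype(int)
    for index, x in np.ndenumerate(world):
        if x == 0:  # mark the first free cell as the starting node
            world[index] = 1
            break
    return world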
# environment setup
def create_mask(obs):
    # boolean mask over the observation: True where a cell equals 2
    # (passed as `mask` to agent.select_action)
    return np.equal(obs, 2)
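# Illustrative sketch (defined but never called by this driver): one common way a
# boolean action mask like the one above is applied, by pushing the logits of
# masked-out actions to -inf before sampling. How A2C.select_action actually
# consumes `mask` is defined in a2c.py; the shape assumptions here (mask
# broadcastable to the logits, True = selectable) are assumptions only.
def masked_logits_sketch(logits, mask):
    mask_t = torch.as_tensor(mask, dtype=torch.bool, device=logits.device)
    # invalid actions get probability ~0 after softmax / categorical sampling
    return logits.masked_fill(~mask_t, float("-inf"))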
env = nodeFindEnv()
obs_shape = env.observation_space.shape
action_shape = env.action_space.n
# set the device
use_cuda = True
device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
# init the agent
agent = A2C(obs_shape, action_shape, device, critic_lr, actor_lr, n_env)
# reload previously saved actor and critic weights
agent.actor.load_state_dict(torch.load("./models/actor_prototype2.5-20240223-030259"))
agent.critic.load_state_dict(torch.load("./models/critic_prototype2.5-20240223-030259"))
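# If these checkpoints were saved on a different device than the current one
# (e.g. trained on GPU, reloaded on a CPU-only machine), torch.load accepts
# map_location to remap the tensors, for example:
# agent.actor.load_state_dict(torch.load("./models/actor_prototype2.5-20240223-030259", map_location=device))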
# create a wrapper environment to save episode returns and episode lengths
env_wrapper = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_env * n_updates)
wandb_id = wandb.util.generate_id()
wandb.init(project=RecordingParameters.EXPERIMENT_PROJECT,
name=RecordingParameters.EXPERIMENT_NAME,
entity=RecordingParameters.ENTITY,
notes=RecordingParameters.EXPERIMENT_NOTE,
config=all_args,
id=wandb_id,
resume='allow')
print('id is:{}'.format(wandb_id))
print('Launching wandb...\n')
while True:
# use tqdm to get a progress bar for training
image_Saved = False
for sample_phase in tqdm(range(n_updates)):
# we don't reset the env at the start of each update; it keeps playing
# until an episode terminates and is then reset inside the step loop below
critic_losses = []
actor_losses = []
entropies = []
# reset lists that collect experiences of an episode (sample phase)
num_invalid_move = 0
ep_length = []
ep_value_preds = torch.zeros(n_steps_per_update, n_env, device=device)
ep_rewards = torch.zeros(n_steps_per_update, n_env, device=device)
ep_action_log_probs = torch.zeros(n_steps_per_update, n_env, device=device)
masks = torch.zeros(n_steps_per_update, n_env, device=device)
# at the start of training, reset the env to get an initial state
if sample_phase == 0:
states, info = env_wrapper.reset()
image_Saved = False
# play n_steps_per_update steps in the environment to collect data
for step in range(n_steps_per_update):
# select an action A_{t} using S_{t} as input for the agent
actions, action_log_probs, state_value_preds, entropy = agent.select_action(
states, mask=create_mask(states)
)
# perform the action A_{t} in the environment to get S_{t+1} and R_{t+1}
states, rewards, terminated, truncated, infos = env_wrapper.step(
actions.cpu().numpy()
)
#count number of invalid moves
if(infos['isValid'] != 1):
num_invalid_move += 1
if("episode" in infos):
ep_length.append(infos["episode"]["l"])
pygame.event.get()
env.render()
if(terminated):
#save image once every training cycle (2080 steps)
if((not image_Saved)and step > n_steps_per_update/2):
image_path = "/home/marmot/patrick/ALPHA/node_placer/complete_map/"
image_name = RecordingParameters.EXPERIMENT_NAME + "/" + time.strftime("%Y%m%d-%H%M%S") + ".jpeg"
pygame.image.save(env.screen, image_path + image_name)
#comment out the line below to instead save an image at the end of every episode
image_Saved = True
env_wrapper.reset()
# #save image every 200 steps
# if(step % 200 == 0 and step > 0):
# image_Saved = False
ep_value_preds[step] = torch.squeeze(state_value_preds)
ep_rewards[step] = torch.tensor(rewards, device=device)
ep_action_log_probs[step] = action_log_probs
# add a mask (for the return calculation later);
# for each env the mask is 1 if the episode is ongoing and 0 if it is terminated (not by truncation!)
masks[step] = torch.tensor([not terminated])
#pygame.time.wait(2)
# calculate the losses for actor and critic
critic_loss, actor_loss = agent.get_losses(
ep_rewards,
ep_action_log_probs,
ep_value_preds,
entropy,
masks,
gamma,
lam,
ent_coef,
device,
)
# clip the actor's gradient norm and record it for logging
gradient_norm = nn.utils.clip_grad_norm_(agent.actor.parameters(), TrainingParameters.MAX_GRAD_NORM)
# update the actor and critic networks
agent.update_parameters(critic_loss, actor_loss)
# log the losses and entropy
critic_losses.append(critic_loss.detach().cpu().numpy())
actor_losses.append(actor_loss.detach().cpu().numpy())
entropies.append(entropy.detach().mean().cpu().numpy())
wandb.log({'Perf_random_eval/Num_Invalid_Moves': num_invalid_move})
wandb.log({'Perf_random_eval/Episode_length': np.mean(np.array(ep_length))})
wandb.log({'Perf_random_eval/Rewards': torch.mean(ep_rewards)})
# print(actor_losses)
# print(critic_losses)
wandb.log({'Perf_random_eval/Actor Loss': actor_losses[0]})
wandb.log({'Perf_random_eval/Critic Loss': critic_losses[0]})
wandb.log({'Perf_random_eval/Actor Entropy': entropies[0]})
wandb.log({'Perf_random_eval/Actor_Grad': gradient_norm})
# wandb.log({'Perf_random_eval/Num_block': performance_dict['per_block']}, step=step)
timestr = time.strftime("%Y%m%d-%H%M%S")
critic_file_name = "./models/critic_" + RecordingParameters.EXPERIMENT_NAME + "-" + timestr
actor_file_name = "./models/actor_" + RecordingParameters.EXPERIMENT_NAME + "-" + timestr
torch.save(agent.critic.state_dict(), critic_file_name)
torch.save(agent.actor.state_dict(), actor_file_name)
# pygame.event.get()
# env.render()
# pygame.time.wait(200)
####recording line
pygame.event.get()
# action = env.action_space.sample() # Random action selection
# print('Reward:', rewards)
# print('critic loss:', critic_loss)
pygame.time.wait(200)
""" plot the results """
# %matplotlib inline
# rolling_length = 20
# fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 5))
# fig.suptitle(
# f"Training plots for {agent.__class__.__name__} in the LunarLander-v2 environment \n \
# (n_env={n_env}, n_steps_per_update={n_steps_per_update}, randomize_domain={randomize_domain})"
# )
# # episode return
# axs[0][0].set_title("Episode Returns")
# episode_returns_moving_average = (
# np.convolve(
# np.array(env_wrapper.return_queue).flatten(),
# np.ones(rolling_length),
# mode="valid",
# )
# / rolling_length
# )
# axs[0][0].plot(
# np.arange(len(episode_returns_moving_average)) / n_env,
# episode_returns_moving_average,
# )
# axs[0][0].set_xlabel("Number of episodes")
# # entropy
# axs[1][0].set_title("Entropy")
# entropy_moving_average = (
# np.convolve(np.array(entropies), np.ones(rolling_length), mode="valid")
# / rolling_length
# )
# axs[1][0].plot(entropy_moving_average)
# axs[1][0].set_xlabel("Number of updates")
# # critic loss
# axs[0][1].set_title("Critic Loss")
# critic_losses_moving_average = (
# np.convolve(
# np.array(critic_losses).flatten(), np.ones(rolling_length), mode="valid"
# )
# / rolling_length
# )
# axs[0][1].plot(critic_losses_moving_average)
# axs[0][1].set_xlabel("Number of updates")
# # actor loss
# axs[1][1].set_title("Actor Loss")
# actor_losses_moving_average = (
# np.convolve(np.array(actor_losses).flatten(), np.ones(rolling_length), mode="valid")
# / rolling_length
# )
# axs[1][1].plot(actor_losses_moving_average)
# axs[1][1].set_xlabel("Number of updates")
# plt.tight_layout()
# plt.show()