main.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This is the code for the paper:
[1] Zhi Wang, Han-Xiong Li, and Chunlin Chen, "Incremental Reinforcement Learning
in Continuous Spaces via Policy Relaxation and Importance Weighting for
Dynamic Environments", IEEE Transactions on Neural Networks and Learning
Systems, 2019.
The implementation consists of two steps:
1. Train the policy network in an original environment and save the model parameters.
2. In a new environment, initialize the policy parameters from the original ones
and continue training the policy with all tested methods, including:
- Random, a baseline that always trains from scratch with a random initialization
- Pretrained, a baseline that directly fine-tunes the pretrained policy in the new environment
- PRPG, a policy-reuse policy-gradient baseline
- PR, a variant of the proposed method that only applies Policy Relaxation
- IW, a variant of the proposed method that only applies Importance Weighting
- PR+IW, the proposed method
https://github.com/HeyuanMingong/irl_cs
"""
### common lib
import sys
import gym
import numpy as np
import argparse
import torch
from tqdm import tqdm
import os
import time
from torch.optim import Adam, SGD
import scipy.io as sio
import copy
### personal lib
from myrllib.episodes.episode import BatchEpisodes
from myrllib.samplers.sampler import BatchSampler
from myrllib.policies import NormalMLPPolicy
from myrllib.algorithms.reinforce import REINFORCE
from myrllib.algorithms.trpo import TRPO
######################## Arguments ############################################
parser = argparse.ArgumentParser()
parser.add_argument('--num_workers', type=int, default=4,
    help='number of CPU processes for parallel sampling of the gym environment')
parser.add_argument('--batch_size', type=int, default=20,
help='number of rollouts/learning episodes in one policy iteration')
parser.add_argument('--hidden_size', type=int, default=100,
help='hidden size of the policy network')
parser.add_argument('--num_layers', type=int, default=2,
help='number of hidden layers of the policy network')
parser.add_argument('--num_iter', type=int, default=100,
help='number of policy iterations')
parser.add_argument('--lr', type=float, default=1e-2,
help='learning rate, if REINFORCE algorithm is used')
parser.add_argument('--output', type=str, default='output/navi_v1',
help='output folder for saving the experimental results')
parser.add_argument('--model_path', type=str, default='saves/navi_v1',
help='the folder for saving and loading the pretrained model')
parser.add_argument('--stage', type=str, default='finetune',
help='pretrain or finetune, in the original or new environment')
parser.add_argument('--algorithm', type=str, default='reinforce',
help='reinforce or trpo, the base algorithm for policy gradient')
parser.add_argument('--iw', action='store_true', default=False,
help='using the Importance Weighting method or not')
parser.add_argument('--pr', action='store_true', default=False,
help='using the Policy Relaxation method or not')
parser.add_argument('--priw', action='store_true', default=False,
help='using the proposed method (PR+IW) or not')
parser.add_argument('--prpg', action='store_true', default=False,
help='using the PRPG baseline or not')
parser.add_argument('--pretrained', action='store_true', default=False,
help='using the Pretrained baseline or not')
parser.add_argument('--random', action='store_true', default=False,
help='using the Random baseline or not')
parser.add_argument('--env', type=str, default='Navigation2D-v1')
parser.add_argument('--task', nargs='+', type=float, default=None,
help='the randomly chosen task in the original or new environment')
parser.add_argument('--device', type=str, default='cpu')
args = parser.parse_args()
print(args)
device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
start_time = time.time()
####################### Hyperparameters ######################################
"""
If the rewards are positive, the Importance Weighting assigns a weight to
a learning episode that is proportional to its received reward; IW_INV = False.
Else, the rewards are negative, the weight is inversely proportional to the
received reward; IW_INV = True.
"""
IW_INV = True
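### A minimal, illustrative sketch of the weighting rule described above; the
### actual importance weighting happens inside myrllib's learner.step(iw=True).
### The function below is hypothetical and is not called anywhere in this script.
def _illustrative_importance_weights(episode_returns, iw_inv):
    returns = np.asarray(episode_returns, dtype=np.float64)
    # weight inversely proportional to |return| for negative-reward tasks,
    # proportional to the return for positive-reward tasks
    raw = 1.0 / np.abs(returns) if iw_inv else returns
    return raw / raw.sum()  # one possible normalization: weights sum to one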
### for the PRPG method
NU = 0.8; RMAX = 200; PSI = 1.0; UPSILON = 0.01
### for the Policy Relaxation implementations
PR_SMOOTH = 0.1; RELAX_ITERS = 1
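### RELAX_ITERS: number of initial iterations in the new environment whose gradient
### samples are drawn from the relaxed (randomly initialized) policy instead of the
### fine-tuned policy (see the PR and PR+IW loops below).
### PR_SMOOTH: smoothing constant handed to the learner for relaxation steps; its
### exact use is inside myrllib's step(), so this description is indicative only.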
### task information
TASK = args.task
######################## Small functions ######################################
### build a learner given a policy network
def generate_learner(policy, pr_smooth=1e-20):
    ### honor the pr_smooth argument: PR and PR+IW pass PR_SMOOTH explicitly,
    ### while the other methods fall back to the near-zero default
    if args.algorithm == 'trpo':
        learner = TRPO(policy, pr_smooth=pr_smooth, iw_inv=IW_INV, device=device)
    else:
        learner = REINFORCE(policy, lr=args.lr, pr_smooth=pr_smooth, device=device)
    return learner
######################## Main Functions #######################################
### build a sampler given an environment
sampler = BatchSampler(args.env, args.batch_size, num_workers=args.num_workers)
state_dim = int(np.prod(sampler.envs.observation_space.shape))
action_dim = int(np.prod(sampler.envs.action_space.shape))
print('state dim: %d; action dim: %d'%(state_dim,action_dim))
"""
'Navigation2D-v1': the task is to reach a dynamic goal with a point agent
'Navigation2D-v2': reaching a stationary goal with three dynamic puddles
"""
if args.env == 'Navigation2D-v3':
### reaching a dynamic goal with three dynamic puddles
RELAX_ITERS = 2
elif args.env in ['SwimmerVel-v1', 'HopperVel-v1', 'HalfCheetahVel-v1']:
### locomotion tasks, let the agent run at a dynamic velocity
RELAX_ITERS = 0; NU = 0.5
if args.env == 'HopperVel-v1':
IW_INV = False
TASK = args.task[0]
elif args.env in ['ReacherDyna-v1', 'ReacherDyna-v2', 'ReacherDyna-v3']:
### v1: reaching a dynamic goal by a two-linked robotic arm
### v2: reaching a stationary goal with different physical parameters
### v3: reaching a dynamic goal with different physical parameters
RELAX_ITERS = 0
if args.env == 'ReacherDyna-v2':
TASK = int(args.task[0])
### set the task, i.e., given an environment
print('Task information:', TASK)
sampler.reset_task(TASK)
### in an original environment
if args.stage == 'pretrain':
if not os.path.exists(args.model_path):
os.makedirs(args.model_path)
policy = NormalMLPPolicy(state_dim, action_dim,
hidden_sizes=(args.hidden_size,) * args.num_layers)
learner = generate_learner(policy)
rewards = np.zeros(args.num_iter)
### training
for idx in tqdm(range(args.num_iter)):
episodes = sampler.sample(policy, device=device)
rewards[idx] = episodes.evaluate()
learner.step(episodes)
### save the model for initialization in the new environment
name = os.path.join(args.model_path, 'model_pretrained.pkl')
print('Save the model to %s'%name); torch.save(policy, name)
np.save(os.path.join(args.model_path, 'rewards_pretrained.npy'), rewards)
### in a new environment
elif args.stage == 'finetune':
### generate a random policy for policy relaxation
policy_relax = NormalMLPPolicy(state_dim, action_dim,
hidden_sizes=(args.hidden_size,) * args.num_layers).to(device)
### the policy model name from the original environment
model_name = os.path.join(args.model_path, 'model_pretrained.pkl')
### create an output folder
if not os.path.exists(args.output):
os.makedirs(args.output)
### the Random baseline
if args.random:
print('\n========== The Random baseline ==========')
print('Always randomly initialize policy parameters...')
policy_ran = NormalMLPPolicy(state_dim, action_dim,
hidden_sizes=(args.hidden_size,) * args.num_layers)
learner_ran = generate_learner(policy_ran)
rewards_ran = np.zeros(args.num_iter)
### training in the new environment
for idx in tqdm(range(args.num_iter)):
### Random baseline
episodes_ran = sampler.sample(policy_ran, device=device)
rewards_ran[idx] = episodes_ran.evaluate()
learner_ran.step(episodes_ran)
### save the data
name = os.path.join(args.output, 'random.npy')
print('Save the Random baseline to file: %s'%name)
np.save(name, rewards_ran)
### Pretrained baseline
if args.pretrained:
print('\n========== The Pretrained baseline ==========')
print('Load the policy from %s'%model_name)
        policy_pre = torch.load(model_name).to(device)
learner_pre = generate_learner(policy_pre)
rewards_pre = np.zeros(args.num_iter)
for idx in tqdm(range(args.num_iter)):
episodes_pre = sampler.sample(policy_pre, device=device)
rewards_pre[idx] = episodes_pre.evaluate()
learner_pre.step(episodes_pre)
### save the data
name = os.path.join(args.output, 'pretrained.npy')
print('Save the Pretrained baseline to file: %s'%name)
np.save(name, rewards_pre)
### a variant of the proposed method: Importance Weighting
if args.iw:
print('\n========== Importance Weighting method ==========')
print('Load policy from %s'%model_name)
        policy_iw = torch.load(model_name).to(device)
learner_iw = generate_learner(policy_iw)
rewards_iw = np.zeros(args.num_iter)
for idx in tqdm(range(args.num_iter)):
episodes_iw = sampler.sample(policy_iw, device=device)
rewards_iw[idx] = episodes_iw.evaluate()
### importance weighting step
learner_iw.step(episodes_iw, iw=True)
### save the data
name = os.path.join(args.output, 'iw.npy')
print('Save the IW method to file: %s'%name)
np.save(name, rewards_iw)
### a variant of the proposed method: Policy Relaxation
if args.pr:
print('\n========== Policy Relaxation method ==========')
print('Load policy from %s'%model_name)
        policy_pr = torch.load(model_name).to(device)
learner_pr = generate_learner(policy_pr, pr_smooth=PR_SMOOTH)
rewards_pr = np.zeros(args.num_iter)
for idx in tqdm(range(args.num_iter)):
episodes_pr = sampler.sample(policy_pr, device=device)
pr = False
rewards_pr[idx] = episodes_pr.evaluate()
if idx < RELAX_ITERS:
### generate samples from the relaxed policy
episodes_pr = sampler.sample(policy_relax, device=device)
pr = True
learner_pr.step(episodes_pr, pr=pr)
name = os.path.join(args.output, 'pr.npy')
print('Save the PR method to file: %s'%name)
np.save(name, rewards_pr)
### the proposed method: Policy Relaxation + Importance Weighting
if args.priw:
print('\n========== Proposed method, PR + IW ==========')
print('Load policy from %s'%model_name)
        policy_priw = torch.load(model_name).to(device)
learner_priw = generate_learner(policy_priw, pr_smooth=PR_SMOOTH)
rewards_priw = np.zeros(args.num_iter)
for idx in tqdm(range(args.num_iter)):
episodes_priw = sampler.sample(policy_priw, device=device)
pr = False
rewards_priw[idx] = episodes_priw.evaluate()
if idx < RELAX_ITERS:
### policy relaxation step
episodes_priw = sampler.sample(policy_relax, device=device)
pr = True
### importance weighting step
learner_priw.step(episodes_priw, iw=True, pr=pr)
### save the data
name = os.path.join(args.output, 'priw.npy')
print('Save the proposed method to file: %s'%name)
np.save(name, rewards_priw)
### PRPG baseline
if args.prpg:
'''
Hyperparameters of PRQ-learning, more details can be found in:
[2] Fernando Fernandez, Javier Garcia, and Manuela Veloso,
"Probabilistic Policy Reuse for inter-task transfer learning",
Robotics and Autonomous Systems, 2010.
'''
print('\n========== PRPG method ==========')
print('Randomly initialize the policy parameters...')
policy_prpg = NormalMLPPolicy(state_dim, action_dim,
hidden_sizes=(args.hidden_size,) * args.num_layers)
learner_prpg = generate_learner(policy_prpg)
rewards_prpg = np.zeros(args.num_iter)
### reuse the policy from the original environment
print('Load the old policy from %s'%model_name)
        policy_old = torch.load(model_name).to(device)
        ### UPSILON: temperature of the softmax over the old and new policy scores
        ### PSI: probability of reusing the old policy when it is selected
        ### NU: decay rate applied to PSI after each iteration
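        ### Illustrative numbers for the softmax computed below: with UPSILON = 0.01,
        ### score_old = 150 and score_new = 100 give
        ###     p_old = e^1.5 / (e^1.5 + e^1.0) ≈ 0.62,
        ### i.e., the old policy would be proposed for reuse about 62% of the time.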
score_old, score_new = 0.0, 0.0
used_old, used_new = 0, 0
### record the probability for selecting the old policy
        ### for debugging the PRQ-learning algorithm
select_old_p, use_old_epis = [], []
for idx in tqdm(range(args.num_iter)):
p_old = np.exp(UPSILON * score_old) / (
np.exp(UPSILON * score_new) + np.exp(UPSILON * score_old))
select_old = np.random.binomial(n=1, p=p_old, size=1)
episodes_prpg = sampler.sample(policy_prpg, device=device)
pr = False
rewards_prpg[idx] = episodes_prpg.evaluate()
r_tau_new = RMAX + episodes_prpg.evaluate()
to_use_new = True
if select_old==1:
reuse = np.random.binomial(n=1, p=PSI, size=1)
if reuse==1:
use_old_epis.append(idx)
episodes_prpg = sampler.sample(policy_old, device=device)
pr = True
r_tau_old = RMAX + episodes_prpg.evaluate()
score_old = (score_old * used_old + r_tau_old) / (used_old+1)
used_old += 1
to_use_new = False
if to_use_new:
score_new = (score_new*used_new+r_tau_new)/(used_new+1)
used_new += 1
learner_prpg.step(episodes_prpg, pr=pr)
PSI *= NU
select_old_p.append(p_old*PSI)
### save the data
name = os.path.join(args.output, 'prpg.npy')
print('Save PRPG method to file: %s'%name)
np.save(name, rewards_prpg)
print('Running time: %.2f s' % (time.time() - start_time))