# DQN_train.py
import os, math
import os.path as osp
import gym
import numpy as np
from datetime import timedelta
from timeit import default_timer as timer
import matplotlib
# %matplotlib inline
from IPython.display import clear_output
import torch

# bench.Monitor comes from OpenAI Baselines; utils.wrappers may already
# re-export it via the star import below, but the explicit import is safe.
from baselines import bench

from utils.wrappers import *
from utils.hyperparameters import Config
from utils.plot import plot_reward
from agents.BaseAgent import BaseAgent
from model import Model
config = Config()
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# epsilon-greedy exploration schedule
config.epsilon_start = 1.0
config.epsilon_final = 0.01
config.epsilon_decay = 30000
config.epsilon_by_frame = lambda frame_idx: (
    config.epsilon_final + (config.epsilon_start - config.epsilon_final)
    * math.exp(-1. * frame_idx / config.epsilon_decay))
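# Worked values of the schedule (rounded), to show the annealing rate:
#   frame 0       -> 1.000
#   frame 30,000  -> 0.374   (= 0.01 + 0.99 * e^-1)
#   frame 90,000  -> 0.059   (= 0.01 + 0.99 * e^-3)
#   frame 300,000 -> 0.010   (= 0.01 + 0.99 * e^-10, effectively epsilon_final)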
# misc agent variables
config.GAMMA = 0.99
config.LR = 1e-4

# target network and experience replay
config.TARGET_NET_UPDATE_FREQ = 1000
config.EXP_REPLAY_SIZE = 100000
config.BATCH_SIZE = 32

# learning control variables
config.LEARN_START = 10000   # frames of pure experience collection before learning
config.MAX_FRAMES = 1000000  # total environment frames to train for
config.UPDATE_FREQ = 1       # environment steps per gradient update
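# For reference: these hyperparameters drive the standard one-step DQN update
# (the actual loss lives in model.py, not shown here, so this is a sketch):
#   y = r + GAMMA * max_a' Q_target(s', a')    (y = r when s' is terminal)
# where Q_target is a copy of the online network, refreshed every
# TARGET_NET_UPDATE_FREQ frames, and (s, a, r, s') minibatches of size
# BATCH_SIZE are sampled uniformly from a replay buffer of EXP_REPLAY_SIZE.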
if __name__ == '__main__':
    start = timer()

    # env_id = "PongNoFrameskip-v4"
    # env_id = "AirRaidNoFrameskip-v4"
    # env_id = "AssaultNoFrameskip-v4"
    # env_id = "CarnivalNoFrameskip-v4"
    env_id = "DemonAttackNoFrameskip-v4"
    print('Init env: ' + env_id)

    # log_dir = "/tmp/gym/"
    log_dir = osp.join('log', env_id)
    if not osp.exists(log_dir):
        os.makedirs(log_dir)

    env = make_atari(env_id)
    env = bench.Monitor(env, osp.join(log_dir, env_id))
    env = wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=True)
    env = WrapPyTorch(env)
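    # What the wrapper stack above typically does (these are the standard
    # Baselines wrappers; exact behavior depends on utils/wrappers.py):
    #   make_atari     - no-op resets and frame skipping (max over skipped frames)
    #   bench.Monitor  - logs per-episode reward/length to a monitor file in log_dir
    #   wrap_deepmind  - episodic life, reward clipping to {-1, 0, +1},
    #                    84x84 grayscale frames scaled to [0, 1]
    #   WrapPyTorch    - transposes observations from HWC to CHW for PyTorch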
    gpu_id = 0
    with torch.cuda.device(gpu_id):
        model = Model(env=env, config=config, log_dir=log_dir)

        episode_reward = 0
        observation = env.reset()
        for frame_idx in range(1, config.MAX_FRAMES + 1):
            epsilon = config.epsilon_by_frame(frame_idx)

            action = model.get_action(observation, epsilon)
            prev_observation = observation
            observation, reward, done, _ = env.step(action)
            observation = None if done else observation  # None marks terminal next-states

            model.update(prev_observation, action, reward, observation, frame_idx)
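            # model.update (defined in model.py, not shown) presumably stores the
            # transition in the replay buffer and, once frame_idx exceeds
            # LEARN_START, takes a gradient step every UPDATE_FREQ frames,
            # syncing the target network every TARGET_NET_UPDATE_FREQ frames.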
            episode_reward += reward

            if done:
                model.finish_nstep()
                model.reset_hx()
                observation = env.reset()
                model.save_reward(episode_reward)
                episode_reward = 0
            if frame_idx % 1000 == 0:
                print('Frame: {}'.format(frame_idx))

            # periodically refresh the reward plot and checkpoint the weights
            if frame_idx % 10000 == 0:
                try:
                    save_filename = osp.join(log_dir, env_id + 'results.png')
                    clear_output(True)
                    plot_reward(log_dir, env_id, 'DQN', config.MAX_FRAMES, bin_size=10, smooth=1,
                                time=timedelta(seconds=int(timer() - start)), ipynb=False,
                                save_filename=save_filename)
                except IOError:
                    pass
                model.save_w(env_id)
                # model.save_w()

        env.close()
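# Outputs land under log/<env_id>/: the Monitor's per-episode log, the reward
# plot ('<env_id>results.png'), and whatever model.save_w writes for weights.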