DQN_main.py
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent directory
sys.path.append(parent_path)  # add the parent directory to the module search path
import torch
import datetime
import numpy as np
from utils import save_results, make_dir
from utils import plot_rewards
from DQN_model import DQN
from DQN_env import *

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # current timestamp


class Config:
    '''Hyperparameters
    '''
    def __init__(self):
        ################################## Environment hyperparameters #################################
        self.algo_name = 'DQN'  # algorithm name
        self.env_name = 'micro_grid'  # environment name
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.seed = 10  # random seed; 0 disables seeding
        self.train_eps = 1000  # number of training episodes
        self.test_eps = 10  # number of test episodes
        self.train_day = 0  # day index passed to env.reset_all during training
        ################################################################################
        ################################## Algorithm hyperparameters ###################################
        self.gamma = 1  # discount factor
        self.epsilon_start = 0.90  # initial epsilon of the epsilon-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the epsilon-greedy policy
        self.epsilon_decay = 500  # decay rate of epsilon
        self.lr = 0.00025  # learning rate
        self.memory_capacity = 4000  # capacity of the replay buffer
        self.batch_size = 200  # mini-batch size for SGD
        self.target_update = 10  # target-network update frequency (in episodes)
        self.hidden_dim = 256  # size of the hidden layers
        ################################################################################
        ################################# Result-saving parameters ####################################
        self.result_path = './result/'  # path for saving results
        # self.model_path = curr_path + "/outputs/" + self.env_name + \
        #     '/' + curr_time + '/models/'  # path for saving models
        # "20221221-154606good"
        self.save = True  # whether to save plots
        ################################################################################
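
# NOTE (assumption): epsilon_start, epsilon_end and epsilon_decay are expected to be
# consumed inside DQN_model as an exponential exploration schedule of the common form
#     epsilon(t) = epsilon_end + (epsilon_start - epsilon_end) * exp(-t / epsilon_decay)
# where t counts the actions selected so far. This is only a sketch of the usual
# convention for these three parameters, not a statement about DQN_model's actual code.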


def env_agent_config(cfg):
    '''Create the environment and the agent
    '''
    env = MicroGridEnv()  # create the environment
    state_dim = env.observation_space.shape[0]  # state dimension
    action_dim = env.action_space.n  # number of discrete actions
    print(state_dim, action_dim, "\n")
    agent = DQN(state_dim, action_dim, cfg)  # create the agent
    if cfg.seed != 0:  # set the random seed
        torch.manual_seed(cfg.seed)
        # env.seed(cfg.seed)
        np.random.seed(cfg.seed)
    return env, agent
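
# NOTE: MicroGridEnv (imported from DQN_env) is assumed to expose the gym-like
# interface used in this script -- a minimal sketch inferred from the calls below,
# not its actual implementation:
#   env.reset_all(day=...)         -> initial state for the given day
#   env.step(action, test=False)   -> (next_state, reward, done, info)
#   env.render(display=[...])      -> visualisation during testing
#   env.close()
#   env.observation_space.shape[0], env.action_space.n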


def train(cfg, env, agent):
    '''Training loop
    '''
    print('Start training!')
    print(f'Environment: {cfg.env_name}, algorithm: {cfg.algo_name}, device: {cfg.device}')
    ma_rewards = []  # moving-average reward at every time step
    ep_reward = []  # reward at every time step
    for i_ep in range(cfg.train_eps):
        state = env.reset_all(day=cfg.train_day)  # reset the environment and get the initial state
        while True:
            action = agent.choose_action(state)  # select an action
            next_state, reward, done, _ = env.step(action)  # step the environment and get the transition
            agent.memory.push(state, action, reward,
                              next_state, done)  # store the transition
            state = next_state  # move to the next state
            agent.update()  # update the agent
            ep_reward.append(reward)  # record the reward
            if ma_rewards:
                ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * reward)
            else:
                ma_rewards.append(reward)
            if done:
                break
        if (i_ep + 1) % cfg.target_update == 0:  # update the target network
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        if (i_ep + 1) % 10 == 0:
            print('Episode: {}/{}'.format(i_ep + 1, cfg.train_eps))
    print('Training finished!')
    env.close()
    return ep_reward, ma_rewards
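
# The moving-average reward above (and in test below) is a simple exponential
# moving average, ma[t] = 0.9 * ma[t-1] + 0.1 * r[t]. It is only used to smooth
# the curves saved and plotted in the main block; it does not affect learning.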


def test(cfg, env, agent):
    '''Evaluation loop
    '''
    print('Start testing!')
    print(f'Environment: {cfg.env_name}, algorithm: {cfg.algo_name}, device: {cfg.device}')
    ########### Testing does not use the epsilon-greedy policy, so epsilon is set to 0 ##############
    cfg.epsilon_start = 0.0  # initial epsilon of the epsilon-greedy policy
    cfg.epsilon_end = 0.0  # final epsilon of the epsilon-greedy policy
    ################################################################################
    rewards = []  # total reward of each episode
    ma_rewards = []  # moving-average reward per episode
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # total reward within one episode
        state = env.reset_all(day=i_ep)  # reset the environment and get the initial state
        while True:
            action = agent.choose_action(state)  # select an action
            next_state, reward, done, _ = env.step(action, test=True)  # step the environment and get the transition
            state = next_state  # move to the next state
            ep_reward += reward  # accumulate the reward
            env.render(display=[0, 0, 0, 0])
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        print(f"Episode: {i_ep + 1}/{cfg.test_eps}, reward: {ep_reward:.1f}")
    print(rewards)
    print('Testing finished!')
    env.close()
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = Config()
    # Training
    env, agent = env_agent_config(cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='DQN_train',
                 path=cfg.result_path)  # save the results
    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the results
    # Testing (reuse the agent trained above; a freshly created agent would be untrained)
    env, _ = env_agent_config(cfg)
    rewards, ma_rewards = test(cfg, env, agent)
    print("Average reward:", np.mean(rewards))
    save_results(rewards, ma_rewards, tag='DQN_test',
                 path=cfg.result_path)  # save the results
    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the results
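
# NOTE: the DQN class imported from DQN_model is assumed to provide roughly the
# interface used above -- a minimal sketch inferred from this script, not the
# actual implementation:
#   DQN(state_dim, action_dim, cfg)                  # builds policy_net/target_net and a replay buffer
#   agent.choose_action(state) -> action index       # epsilon-greedy with respect to policy_net
#   agent.memory.push(state, action, reward, next_state, done)
#   agent.update()                                   # one mini-batch gradient step
#   agent.policy_net, agent.target_net               # torch.nn.Module instances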