RL AutoML based on MetaFEDOT #33

Open
wants to merge 64 commits into base: main

64 commits
01da308
Adding environments, dataloader and test
aPovidlo Jul 9, 2023
a713f55
env fixes and updates
aPovidlo Aug 1, 2023
7b6b76f
Adding generator, agent and fixes in env
aPovidlo Aug 1, 2023
0b24046
Adding generator, agent and fixes in env (repeated)
aPovidlo Aug 2, 2023
43f0977
Adding Save/Load agent, Tensorboard vis, fix fitting process
aPovidlo Aug 7, 2023
7ab4717
ensemble env fixes, load agent, sep main to experiments
aPovidlo Aug 9, 2023
d34a1bd
Fixes memory errors
aPovidlo Sep 18, 2023
1407e08
Adding environments, dataloader and test
aPovidlo Jul 9, 2023
f3fc0aa
env fixes and updates
aPovidlo Aug 1, 2023
0a8201f
Adding generator, agent and fixes in env
aPovidlo Aug 1, 2023
3b3c35c
Adding generator, agent and fixes in env (repeated)
aPovidlo Aug 2, 2023
aae94c1
Adding Save/Load agent, Tensorboard vis, fix fitting process
aPovidlo Aug 7, 2023
e8f915c
ensemble env fixes, load agent, sep main to experiments
aPovidlo Aug 9, 2023
77be6b2
Fixes memory errors
aPovidlo Sep 18, 2023
093d2ae
Adding PPO agent
aPovidlo Feb 5, 2024
8078b5c
Adding TS DataLoader
aPovidlo Feb 5, 2024
ca6bbb8
Adding TS env and experiment file
aPovidlo Feb 5, 2024
49e9857
After rebase
aPovidlo Feb 5, 2024
3109652
Fixes in TS env and script for experiment
aPovidlo Feb 7, 2024
1381df4
Normilize meta data
aPovidlo Feb 7, 2024
a956d19
Fix in PPO
aPovidlo Feb 7, 2024
3c38ee1
Adding penalty for spaming connections actions
aPovidlo Feb 7, 2024
1ca1f7a
Fix in dataloader, adding entropy regularisation and kl div
aPovidlo Feb 12, 2024
fda73da
Adding masking available actions for agent
aPovidlo Feb 13, 2024
2f51ad9
Fix bug with empty pipeline show
aPovidlo Feb 13, 2024
1f389a2
Fix masking actions,update params, refactoring env for new rules and …
aPovidlo Feb 19, 2024
39d1577
Refactoring masking for categorical actions and fix with device
aPovidlo Feb 22, 2024
81fabae
Some fixes
aPovidlo Feb 22, 2024
1a4f39a
Fix in PPo and environment
aPovidlo Feb 26, 2024
53ee333
Adding test for ts env
aPovidlo Feb 26, 2024
e832f66
Fix in render and _get_obs
aPovidlo Feb 26, 2024
c23079d
Environment refactoring
aPovidlo Feb 27, 2024
8c39709
Update ts_experiment
aPovidlo Feb 27, 2024
82654bc
Added new test
aPovidlo Feb 27, 2024
2d67343
Delete old test
aPovidlo Feb 27, 2024
2107755
Adding random agent for comparing
aPovidlo Feb 27, 2024
0a6627c
Update ppo agent and adding log params report
aPovidlo Feb 27, 2024
4b02162
Adding DQN agent
aPovidlo Feb 28, 2024
8cda7bc
Update PPO
aPovidlo Feb 28, 2024
44d06b8
Update env and experiment
aPovidlo Feb 28, 2024
51938ee
Probs returns fix and plot
aPovidlo Mar 1, 2024
b2efb53
Adding test for correct pairs
aPovidlo Mar 1, 2024
1745745
Started PPO with LSTM
aPovidlo Mar 1, 2024
5d98443
Continue RPPO and some fixes
aPovidlo Mar 1, 2024
e1d5d15
Fix bug with dataloader
aPovidlo Mar 22, 2024
e0940d3
Fix in DQN
aPovidlo Mar 22, 2024
d66eaef
Script for pipeline's validation table
aPovidlo Mar 22, 2024
cf76150
Update Random Strategy
aPovidlo Mar 22, 2024
660c73c
Update env for sb3 and added agents from sb3
aPovidlo Mar 22, 2024
4e66a94
Update experiment data and experience injection
aPovidlo Mar 22, 2024
15da93e
Rename TS_DataLoader
aPovidlo Apr 3, 2024
23298c8
Fix RPPO
aPovidlo Apr 3, 2024
5642454
Update Env for curriculum learning
aPovidlo Apr 3, 2024
6582876
Updates for experiments
aPovidlo Apr 3, 2024
9424002
Update sb3 for agent's curriculum learning
aPovidlo Apr 3, 2024
a862b4b
Adding Decision Transformer for offline
aPovidlo Apr 15, 2024
bf6ed01
Updates env, tests and utils
aPovidlo Apr 15, 2024
116e36d
Refactoring env with reward scaling
aPovidlo May 7, 2024
09570e6
Change experiments settings and rewards
aPovidlo May 13, 2024
78697b0
Final refactoring
aPovidlo May 21, 2024
a649ea9
PEP8
aPovidlo May 21, 2024
6f97988
PEP8 (1)
aPovidlo May 21, 2024
26cb6d3
Change dir for rl test
aPovidlo May 21, 2024
b4a2336
Comment rl tests
aPovidlo May 21, 2024
6 changes: 6 additions & 0 deletions gamlet/utils.py
@@ -0,0 +1,6 @@
from pathlib import Path


def project_root() -> Path:
"""Returns MetaFEDOT project root folder."""
return Path(__file__).parent.parent.parent
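A minimal usage sketch (not part of the diff) showing how paths can be resolved relative to the returned root; the `configs/default.yaml` file named below is hypothetical and only illustrates the idea.

from gamlet.utils import project_root

config_path = project_root() / 'configs' / 'default.yaml'  # hypothetical file under the project root
print(config_path)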
Empty file added rl_core/agent/__init__.py
Empty file.
137 changes: 137 additions & 0 deletions rl_core/agent/decision_transformer.py
@@ -0,0 +1,137 @@
import io

import torch
from torch import nn
from torchinfo import summary

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class EmbeddingLayer(nn.Module):
def __init__(self, input_dim, embed_dim):
super().__init__()
self.embedding = nn.Linear(input_dim, embed_dim)

def forward(self, x, pos_embedding):
return self.embedding(x) + pos_embedding


class DecisionTransformer(nn.Module):
""" https://arxiv.org/abs/2106.01345 """
metadata = {'name': 'DecisionTransformer'}

def __init__(
self, state_dim, action_dim, max_length, embed_dim, num_heads, num_layers, dim_feedforward=2048,
device=DEVICE
):
super().__init__()
self.state_dim = state_dim
self.action_dim = action_dim
self.max_length = max_length
self.embed_dim = embed_dim
self.num_heads = num_heads
self.num_layers = num_layers
self.dim_feedforward = dim_feedforward
self.device = device

self.embed_s = EmbeddingLayer(state_dim, embed_dim)
self.embed_a = EmbeddingLayer(action_dim, embed_dim)
self.embed_R = EmbeddingLayer(1, embed_dim)
self.embed_t = nn.Embedding(max_length, embed_dim)
self.embed_ln = nn.LayerNorm(embed_dim)

self.transformer = nn.Transformer(
d_model=embed_dim, nhead=num_heads,
num_encoder_layers=num_layers, num_decoder_layers=num_layers,
dim_feedforward=dim_feedforward, batch_first=True
)

self.pred_a = nn.Linear(embed_dim, action_dim)
self.max_length = max_length

def forward(self, R, s, a, t, mask=None, a_mask=None):
pos_embedding = self.embed_t(t)
s_embedding = self.embed_s(s, pos_embedding)
a_embedding = self.embed_a(a, pos_embedding)
R_embedding = self.embed_R(R, pos_embedding)

input_embeds = torch.stack((R_embedding, s_embedding, a_embedding), dim=1).permute(0, 2, 1, 3)
input_embeds = input_embeds.reshape(s.size(0), 3*s.size(1), self.embed_dim)
input_embeds = self.embed_ln(input_embeds)

mask_size = s.size(1) * 3

if mask is not None:
mask = torch.stack((mask, mask, mask), dim=1).permute(0, 2, 1).reshape(s.size(0), mask_size)
mask = mask.bool()
else:
mask = torch.zeros((s.size(0), mask_size)).bool().to(DEVICE)

attn_mask = self.transformer.generate_square_subsequent_mask(sz=mask_size).to(DEVICE)
attn_mask = torch.isfinite(attn_mask)
attn_mask = ~attn_mask

hidden_states = self.transformer(
input_embeds,
input_embeds,
src_key_padding_mask=mask,
tgt_key_padding_mask=mask,
memory_key_padding_mask=mask,
memory_is_causal=True,
src_is_causal=True,
tgt_is_causal=True,
src_mask=attn_mask,
tgt_mask=attn_mask,
memory_mask=attn_mask
)

hidden_states = hidden_states.reshape(s.size(0), s.size(1), 3, self.embed_dim).permute(0, 2, 1, 3)
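# Index 1 of the (R, s, a) token split selects the state-token representations;
# the next action is predicted from these state tokens, as in the Decision Transformer paper.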

a_hidden = hidden_states[:, 1, :]

# q_values = self.pred_a(a_hidden).squeeze(0)[-1]
# mask_tensor = torch.tensor(a_mask, dtype=torch.bool).to(self.device)
# m_q_values = torch.where(mask_tensor, q_values, -torch.inf)
# argmax_action = torch.argmax(m_q_values)
#
# # 0.1 - eps regularization
# probs = 0.1 * np.ones(self.action_dim) / sum(a_mask)
# m_probs = np.where(a_mask, probs, 0)
# m_probs[argmax_action] += 1 - 0.1
# action = np.random.choice(np.arange(self.action_dim), p=m_probs)

return self.pred_a(a_hidden)

def save(self, path: str):
torch.save(self.state_dict(), path)

def load(self, path: str):
self.load_state_dict(torch.load(path))

def create_log_report(self, log_dir):
with io.open(f'{log_dir}/params.log', 'w', encoding='utf-8') as file:
file.write('-- PARAMS --\n')
file.write(f'state_dim {self.state_dim}\n')
file.write(f'action_dim {self.action_dim}\n')
file.write(f'max_length {self.max_length}\n')
file.write(f'embed_dim {self.embed_dim}\n')
file.write(f'num_heads {self.num_heads}\n')
file.write(f'num_layers {self.num_layers}\n')
file.write(f'dim_feedforward {self.dim_feedforward}\n')
file.write('--\n')
file.write(f'device {self.device}\n')
file.write('--\n')
file.write('\n-- ARCHITECTURE --\n')
file.write('- PI MODEL -\n')
embed_s = str(summary(self.embed_s, (1, self.embed_dim), verbose=0))
embed_a = str(summary(self.embed_a, (1, self.embed_dim), verbose=0))
embed_R = str(summary(self.embed_R, (1, self.embed_dim), verbose=0))
embed_t = str(summary(self.embed_t, (1, self.embed_dim), verbose=0))
embed_ln = str(summary(self.embed_ln, (1, self.embed_dim), verbose=0))
transformer = str(summary(self.transformer, (1, self.embed_dim), verbose=0))
file.write(f'{embed_s}')
file.write(f'{embed_a}')
file.write(f'{embed_R}')
file.write(f'{embed_t}')
file.write(f'{embed_ln}')
file.write(f'{transformer}')
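A minimal driving example for the forward pass (not part of the diff); the batch size, context length and feature dimensions below are assumptions chosen only for illustration.

import torch
from rl_core.agent.decision_transformer import DecisionTransformer, DEVICE

model = DecisionTransformer(state_dim=32, action_dim=10, max_length=20,
                            embed_dim=64, num_heads=4, num_layers=2).to(DEVICE)
batch, length = 2, 20
returns_to_go = torch.randn(batch, length, 1, device=DEVICE)       # return-to-go per timestep
states = torch.randn(batch, length, 32, device=DEVICE)             # state features
actions = torch.randn(batch, length, 10, device=DEVICE)            # e.g. one-hot encoded actions
timesteps = torch.arange(length, device=DEVICE).repeat(batch, 1)   # positional indices < max_length
action_logits = model(returns_to_go, states, actions, timesteps)   # shape (batch, length, action_dim)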
125 changes: 125 additions & 0 deletions rl_core/agent/dqn.py
@@ -0,0 +1,125 @@
import io
import random

import numpy as np
import torch.optim
from torch import nn
from torchinfo import summary


class Qfunction(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, device):
super().__init__()

self.network = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim)
).to(device)

def forward(self, _input: torch.Tensor) -> torch.Tensor:
return self.network(_input)

def save(self, path: str):
torch.save(self.network.state_dict(), path)

def load(self, path: str):
self.network.load_state_dict(torch.load(path))


class DQN:
""" https://arxiv.org/abs/1312.5602 """
metadata = {'name': 'DQN'}

def __init__(self, state_dim, action_dim, hidden_dim=512, gamma=0.01, lr=1e-4, batch_size=64, eps_decrease=1e-6,
eps_min=1e-3, device='cuda'):
self.state_dim = state_dim
self.action_dim = action_dim
self.hidden_dim = hidden_dim
self.q_function = Qfunction(state_dim, action_dim, hidden_dim, device)
self.gamma = gamma
self.batch_size = batch_size
self.lr = lr
self.eps = 1
self.eps_decrease = eps_decrease
self.eps_min = eps_min
self.device = device
self.memory = []
self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)

self.probs = None

def get_action(self, state, mask):
state = torch.tensor(state, dtype=torch.float32).to(self.device)
mask_tensor = torch.tensor(mask, dtype=torch.bool).to(self.device)
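# Epsilon-greedy over masked Q-values: invalid actions receive zero probability and
# the greedy (argmax) action keeps the remaining 1 - eps probability mass.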

q_values = self.q_function(state)
m_q_values = torch.where(mask_tensor, q_values, -torch.inf)
argmax_action = torch.argmax(m_q_values)
probs = self.eps * np.ones(self.action_dim) / sum(mask)
m_probs = np.where(mask, probs, 0)
m_probs[argmax_action] += 1 - self.eps
action = np.random.choice(np.arange(self.action_dim), p=m_probs)

return action

def fit(self, state, action, reward, done, masks, next_state):
loss_ = np.nan
self.memory.append([state, action, reward, int(done), masks, next_state])

if len(self.memory) > self.batch_size:
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, dones, masks, next_states = map(torch.tensor, list(zip(*batch)))

states = states.to(torch.float32).to(self.device)
actions = actions.to(self.device)
rewards = rewards.to(self.device)
dones = dones.to(self.device)
masks = masks.to(torch.bool).to(self.device)
next_states = next_states.to(torch.float32).to(self.device)

targets = rewards + self.gamma * (1 - dones) * torch.max(self.q_function(next_states), dim=1).values
q_values = self.q_function(states)[torch.arange(self.batch_size), actions]

loss = torch.mean((q_values - targets.detach()) ** 2)
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()

if self.eps > self.eps_min:
self.eps -= self.eps_decrease

if self.eps < 0:
self.eps = self.eps_min

loss_ = loss.detach().cpu().item()

return loss_

def save(self, path: str):
self.q_function.save(path)

def load(self, path: str):
self.q_function.load(path)

def create_log_report(self, log_dir):
with io.open(f'{log_dir}/params.log', 'w', encoding='utf-8') as file:
file.write('-- PARAMS --\n')
file.write(f'state_dim {self.state_dim}\n')
file.write(f'action_dim {self.action_dim}\n')
file.write(f'hidden_dim {self.hidden_dim}\n')
file.write('--\n')
file.write(f'gamma {self.gamma}\n')
file.write(f'epsilon {self.eps}\n')
file.write(f'eps_min {self.eps_min}\n')
file.write(f'eps_decrease {self.eps_decrease}\n')
file.write(f'batch_size {self.batch_size}\n')
file.write(f'device {self.device}\n')
file.write('--\n')
file.write(f'pi_lr {self.lr}\n')
file.write('\n-- ARCHITECTURE --\n')
file.write('- PI MODEL -\n')
q_function = str(summary(self.q_function, (1, self.state_dim), verbose=0))
file.write(f'{q_function}')
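A minimal interaction-loop sketch (not part of the diff); the environment call, the placeholder transition values, and all dimensions are assumptions used only to show how get_action and fit are meant to be combined.

import numpy as np
from rl_core.agent.dqn import DQN

state_dim, action_dim = 16, 8
agent = DQN(state_dim=state_dim, action_dim=action_dim, device='cpu')

state = np.zeros(state_dim, dtype=np.float32)
mask = np.ones(action_dim, dtype=bool)                 # all actions assumed valid here
for step in range(200):
    action = agent.get_action(state, mask)             # epsilon-greedy over masked Q-values
    # next_state, reward, done, mask = env.step(action)  # hypothetical environment call
    next_state, reward, done = np.random.randn(state_dim).astype(np.float32), 0.0, False
    loss = agent.fit(state, action, reward, done, mask, next_state)
    state = next_state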