Feat/proposal market #416

Draft · wants to merge 16 commits into base: develop

2 changes: 1 addition & 1 deletion vega_sim/api/trading.py
@@ -240,7 +240,7 @@ def cancel_order(
wallet_name: str,
wallet: Wallet,
market_id: str,
order_id: str,
order_id: Optional[str],
key_name: Optional[str] = None,
):
"""
2 changes: 1 addition & 1 deletion vega_sim/environment/environment.py
@@ -70,7 +70,7 @@ def __init__(
step_length_seconds: Optional[int] = None,
vega_service: Optional[VegaServiceNull] = None,
pause_every_n_steps: Optional[int] = None,
random_state: np.random.RandomState = None,
random_state: Optional[np.random.RandomState] = None,
):
"""Set up a Vega protocol environment with some specified agents.
Handles the entire Vega setup and environment lifetime process, allowing the
60 changes: 58 additions & 2 deletions vega_sim/reinforcement/v2/agents/puppets.py
@@ -4,7 +4,9 @@
from typing import Optional
from logging import getLogger

import vega_sim.proto.vega as vega_protos
from vega_sim.scenario.common.agents import StateAgentWithWallet, VegaService, VegaState
from vega_sim.service import PeggedOrder

logger = getLogger(__name__)

@@ -15,6 +17,11 @@ class Side(Enum):
BUY = 2


class ForcedSide(Enum):
SELL = 0
BUY = 1


@dataclass
class Action:
pass
@@ -26,13 +33,20 @@ class MarketOrderAction(Action):
volume: float


@dataclass
class AtTouchOrderAction(Action):
side: ForcedSide
volume: float


@dataclass
class NoAction(Action):
pass


class AgentType(Enum):
MARKET_ORDER = auto()
AT_TOUCH = auto()


class Puppet(StateAgentWithWallet):
@@ -90,5 +104,47 @@ def step(self, vega_state: VegaState):
logger.exception(traceback.format_exc())


AGENT_TYPE_TO_AGENT = {AgentType.MARKET_ORDER: MarketOrderPuppet}
AGENT_TYPE_TO_ACTION = {AgentType.MARKET_ORDER: MarketOrderAction}
class AtTouchPuppet(Puppet):
def initialise(self, vega: VegaService, create_wallet: bool = True):
super().initialise(vega, create_wallet)
self.market_id = self.vega.find_market_id(name=self.market_name)

def step(self, vega_state: VegaState):
if self.action is not None and self.action is not NoAction:
try:
self.vega.cancel_order(
trading_key=self.key_name, market_id=self.market_id
)
self.vega.submit_order(
trading_key=self.key_name,
market_id=self.market_id,
side=(
"SIDE_BUY"
if self.action.side == ForcedSide.BUY
else "SIDE_SELL"
),
volume=self.action.volume,
time_in_force=vega_protos.vega.Order.TimeInForce.TIME_IN_FORCE_GTC,
order_type=vega_protos.vega.Order.Type.TYPE_LIMIT,
pegged_order=PeggedOrder(
reference=(
vega_protos.vega.PeggedReference.PEGGED_REFERENCE_BEST_BID
if self.action.side == ForcedSide.BUY
else vega_protos.vega.PeggedReference.PEGGED_REFERENCE_BEST_ASK
),
offset=0,
),
wait=False,
)
except:
logger.exception(traceback.format_exc())


AGENT_TYPE_TO_AGENT = {
AgentType.MARKET_ORDER: MarketOrderPuppet,
AgentType.AT_TOUCH: AtTouchPuppet,
}
AGENT_TYPE_TO_ACTION = {
AgentType.MARKET_ORDER: MarketOrderAction,
AgentType.AT_TOUCH: AtTouchOrderAction,
}
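
For context, a minimal sketch (not part of the PR) of how the new AT_TOUCH mapping added above is meant to be exercised; AGENT_TYPE_TO_ACTION, AgentType, ForcedSide and AtTouchOrderAction come from this file, while the driver code around them is illustrative only:

    # Illustrative sketch, not part of the PR: resolving an AT_TOUCH action.
    from vega_sim.reinforcement.v2.agents.puppets import (
        AGENT_TYPE_TO_ACTION,
        AgentType,
        ForcedSide,
    )

    # Look up the action dataclass for the new agent type and build an action.
    action_cls = AGENT_TYPE_TO_ACTION[AgentType.AT_TOUCH]  # AtTouchOrderAction
    action = action_cls(side=ForcedSide.BUY, volume=1)

    # On its next step, an AtTouchPuppet holding this action cancels its resting
    # orders and submits a GTC limit order of the given volume pegged to the
    # best bid (BUY) or best ask (SELL) with zero offset.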
18 changes: 11 additions & 7 deletions vega_sim/reinforcement/v2/learning_environment.py
@@ -1,4 +1,4 @@
from typing import Dict
from typing import Dict, Type, Optional
from dataclasses import dataclass
from vega_sim.reinforcement.v2.agents.puppets import (
AGENT_TYPE_TO_AGENT,
@@ -24,9 +24,9 @@ class StepResult:
class Environment:
def __init__(
self,
agents: Dict[str, AgentType],
agent_to_reward: Dict[str, BaseRewarder],
agent_to_state: Dict[str, State],
agents: Dict[str, Type[AgentType]],
agent_to_reward: Dict[str, Type[BaseRewarder]],
agent_to_state: Dict[str, Type[State]],
scenario: Scenario,
reset_vega_every_n_runs: int = 100,
funds_per_run: float = 10_000,
@@ -63,20 +63,24 @@ def _extract_observation(self, agent_name: str) -> State:
asset_id=self._asset_id,
)

def step(self, actions: Dict[str, Action]) -> Dict[str, StepResult]:
def step(self, actions: Dict[str, Optional[Action]]) -> Dict[str, StepResult]:
for agent_name, action in actions.items():
self._puppets[agent_name].set_next_action(action=action)
if action is not None:
self._puppets[agent_name].set_next_action(action=action)

self._scenario.env.step(self._vega)
self._vega.wait_fn(1)
step_res = {}
for agent_name, reward_gen in self._agent_to_reward.items():
step_res[agent_name] = StepResult(
observation=self._extract_observation(agent_name),
reward=reward_gen.get_reward(self._vega),
reward=self.calculate_reward(reward_gen),
)
return step_res

def calculate_reward(self, rewarder: Type[BaseRewarder]) -> float:
return rewarder.get_reward(vega=self._vega)

def _reset_vega(self) -> None:
self._vega.stop()
self._vega = VegaServiceNull(
17 changes: 16 additions & 1 deletion vega_sim/reinforcement/v2/rewards.py
@@ -7,6 +7,7 @@

class Reward(Enum):
PNL = auto()
SQ_INVENTORY_PENALTY = auto()


class BaseRewarder(ABC):
@@ -45,4 +46,18 @@ def get_reward(self, vega: VegaService) -> float:
return reward


REWARD_ENUM_TO_CLASS = {Reward.PNL: PnlRewarder}
class SquareInventoryPenalty(BaseRewarder):
def get_reward(self, vega: VegaService) -> float:
posn = vega.positions_by_market(
key_name=self.agent_key,
wallet_name=self.agent_wallet,
market_id=self.market_id,
)

return (-1 * posn.open_volume**2) if posn is not None else 0


REWARD_ENUM_TO_CLASS = {
Reward.PNL: PnlRewarder,
Reward.SQ_INVENTORY_PENALTY: SquareInventoryPenalty,
}
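
As a quick sanity check on the new rewarder: the penalty is the negative square of the open volume, so it is zero when the agent is flat and grows quadratically with inventory in either direction. A self-contained sketch, with a stubbed Position standing in for the object returned by vega.positions_by_market (an assumption for illustration):

    # Illustrative sketch only; Position is a stub for the real position record.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class Position:
        open_volume: float


    def sq_inventory_penalty(posn: Optional[Position]) -> float:
        # Mirrors SquareInventoryPenalty.get_reward: zero when no position,
        # otherwise minus the squared open volume.
        return (-1 * posn.open_volume**2) if posn is not None else 0


    assert sq_inventory_penalty(None) == 0
    assert sq_inventory_penalty(Position(open_volume=3)) == -9
    assert sq_inventory_penalty(Position(open_volume=-3)) == -9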
89 changes: 68 additions & 21 deletions vega_sim/reinforcement/v2/stable_baselines/environment.py
@@ -2,35 +2,58 @@

import gymnasium as gym
from gymnasium import spaces
from typing import Type, Optional
from enum import Enum
from stable_baselines3.common.callbacks import BaseCallback

from vega_sim.reinforcement.v2.agents.puppets import (
MarketOrderAction,
AgentType,
ForcedSide,
Side,
AGENT_TYPE_TO_ACTION,
)
from vega_sim.reinforcement.v2.learning_environment import Environment
from vega_sim.reinforcement.v2.rewards import Reward
from vega_sim.reinforcement.v2.rewards import Reward, REWARD_ENUM_TO_CLASS
from vega_sim.reinforcement.v2.stable_baselines.states import (
price_state_with_fees_obs_space,
position_state_with_fees_obs_space,
)
from vega_sim.reinforcement.v2.states import PriceStateWithFees, State
from vega_sim.reinforcement.v2.states import PriceStateWithFees, State, PositionOnly
from vega_sim.scenario.registry import CurveMarketMaker


logger = logging.getLogger(__name__)


class ActionType(Enum):
MARKET = "market"
AT_TOUCH_ONE_SIDE = "at_touch_one_side"


ACTION_TO_AGENT = {
ActionType.MARKET: AgentType.MARKET_ORDER,
ActionType.AT_TOUCH_ONE_SIDE: AgentType.AT_TOUCH,
}


class ActionLoggerCallback(BaseCallback):
pass


class SingleAgentVegaEnv(gym.Env):
"""Custom Environment that follows gym interface."""

metadata = {"render.modes": ["human"]}

def __init__(
self,
action_type: str = "market",
state_type: State = PriceStateWithFees,
action_type: ActionType = ActionType.MARKET,
state_type: Type[State] = PriceStateWithFees,
reward_type: Reward = Reward.PNL,
num_levels_state: int = 5,
trade_volume: float = 1,
steps_per_trading_session: int = 1000,
terminal_reward_type: Optional[Reward] = None,
):
super().__init__()
self.num_levels_state = num_levels_state
@@ -40,6 +63,7 @@ def __init__(
self.steps_per_trading_session = steps_per_trading_session
self.current_step = 0
self.learner_name = "learner_1"
self.terminal_reward_type = terminal_reward_type

# Define action and observation space
# They must be gym.spaces objects
@@ -57,6 +81,7 @@ def __init__(
initial_asset_mint=1e8,
step_length_seconds=1,
block_length_seconds=1,
# market_maker_assumed_market_kappa=0.8,
buy_intensity=5,
sell_intensity=5,
market_name="ETH",
@@ -67,45 +92,67 @@ def __init__(
)

self.env = Environment(
agents={self.learner_name: AgentType.MARKET_ORDER},
agent_to_reward={self.learner_name: Reward.PNL},
agent_to_state={self.learner_name: PriceStateWithFees},
agents={self.learner_name: ACTION_TO_AGENT[self.action_type]},
agent_to_reward={self.learner_name: reward_type},
agent_to_state={self.learner_name: state_type},
scenario=scenario,
)

def _get_action_space(self, action_type: str) -> spaces.Space:
if action_type == "market":
def _get_action_space(self, action_type: ActionType) -> spaces.Space:
if action_type == ActionType.MARKET:
return spaces.Discrete(3)
elif action_type == ActionType.AT_TOUCH_ONE_SIDE:
return spaces.Discrete(2)
else:
raise Exception(f"Action type {action_type} is not implemented")

def _get_observation_space(self, state_type: State) -> spaces.Space:
def _get_observation_space(self, state_type: Type[State]) -> spaces.Space:
if state_type == PriceStateWithFees:
return price_state_with_fees_obs_space(num_levels=self.num_levels_state)
if state_type == PositionOnly:
return position_state_with_fees_obs_space()

def _action_conversion(self, action):
if self.action_type == ActionType.MARKET:
return AGENT_TYPE_TO_ACTION[AgentType.MARKET_ORDER](
side=Side(action), volume=self.trade_volume
)
elif self.action_type == ActionType.AT_TOUCH_ONE_SIDE:
return AGENT_TYPE_TO_ACTION[AgentType.AT_TOUCH](
side=ForcedSide(action), volume=self.trade_volume
)
else:
raise Exception(f"Action type {self.action_type} is not implemented")

def step(self, action):
step_res = self.env.step(
{
self.learner_name: MarketOrderAction(
side=Side(action), volume=self.trade_volume
)
}
{self.learner_name: self._action_conversion(action=action)}
)[self.learner_name]
self.current_step += 1

is_terminal = self.current_step >= self.steps_per_trading_session

if is_terminal and self.terminal_reward_type is not None:
terminal_reward = REWARD_ENUM_TO_CLASS[self.terminal_reward_type](
agent_key=self.learner_name,
asset_id=self.env._asset_id,
market_id=self.env._market_id,
)
reward = step_res.reward + self.env.calculate_reward(terminal_reward)
else:
reward = step_res.reward

return (
step_res.observation.to_array(),
step_res.reward,
self.current_step >= self.steps_per_trading_session,
reward,
is_terminal,
False,
{},
)

def reset(self):
self.env.reset()
step_res = self.env.step(
{self.learner_name: MarketOrderAction(side=Side.NONE, volume=0)}
)[self.learner_name]
step_res = self.env.step({self.learner_name: None})[self.learner_name]

self.current_step = 0
return step_res.observation.to_array(), {}
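
The net effect of the new terminal_reward_type is reward shaping at the end of the session only: every step pays the configured per-step reward, and the final step additionally pays the terminal rewarder. A rough sketch of the episode return under the configuration used in run.py below (per-step PnL plus a terminal squared-inventory penalty); the function and its inputs are hypothetical, not part of the PR:

    # Hypothetical illustration of the episode return with reward_type=Reward.PNL
    # and terminal_reward_type=Reward.SQ_INVENTORY_PENALTY.
    from typing import List


    def episode_return(pnl_per_step: List[float], final_open_volume: float) -> float:
        per_step = sum(pnl_per_step)        # Reward.PNL, paid every step
        terminal = -(final_open_volume**2)  # added only at the terminal step
        return per_step + terminal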
15 changes: 11 additions & 4 deletions vega_sim/reinforcement/v2/stable_baselines/run.py
@@ -1,14 +1,21 @@
from stable_baselines3 import PPO
from stable_baselines3 import PPO, DQN

import vega_sim.reinforcement.v2.stable_baselines.environment as env
from vega_sim.reinforcement.v2.states import PriceStateWithFees, PositionOnly

if __name__ == "__main__":
e = env.SingleAgentVegaEnv(steps_per_trading_session=200)
e = env.SingleAgentVegaEnv(
action_type=env.ActionType.AT_TOUCH_ONE_SIDE,
steps_per_trading_session=200,
reward_type=env.Reward.PNL,
terminal_reward_type=env.Reward.SQ_INVENTORY_PENALTY,
state_type=PriceStateWithFees,
)
model = PPO(
"MlpPolicy",
e,
verbose=1,
tensorboard_log="./ppo_tensorboard/",
n_steps=200,
batch_size=100,
n_steps=600,
batch_size=50,
).learn(total_timesteps=1_000_000)
11 changes: 11 additions & 0 deletions vega_sim/reinforcement/v2/stable_baselines/states.py
@@ -20,3 +20,14 @@ def price_state_with_fees_obs_space(
),
dtype=np.float64,
)


def position_state_with_fees_obs_space(
min_position: float = -1000,
max_position: float = 1000,
) -> gym.spaces.Box:
return gym.spaces.Box(
low=np.array([min_position]),
high=np.array([max_position]),
dtype=np.float64,
)