From 56436b636a425d033122981e4399bf8778c3b7d3 Mon Sep 17 00:00:00 2001
From: Shahar Bar <shaharb@playtika.com>
Date: Wed, 11 Dec 2024 14:06:54 +0200
Subject: [PATCH] Add epsilon greedy support in from_state

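 ### Changes
 * Pass epsilon and default_action through the from_state methods in smab.py and cmab.py.
 * Update the state unit tests (UTs) to exercise both epsilon=None and epsilon=0.1.
 * Trigger the release-draft workflow on pushes to main as well as develop.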
---
 .github/workflows/release_draft.yml |  1 +
 pybandits/cmab.py                   | 20 +++++++++--
 pybandits/smab.py                   | 32 +++++++++++++++---
 tests/test_cmab.py                  | 38 ++++++++++++++-------
 tests/test_smab.py                  | 51 +++++++++++++++++++----------
 5 files changed, 105 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/release_draft.yml b/.github/workflows/release_draft.yml
index cadfae7..a57c086 100644
--- a/.github/workflows/release_draft.yml
+++ b/.github/workflows/release_draft.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - develop
+      - main
 
 jobs:
   draft_release:
diff --git a/pybandits/cmab.py b/pybandits/cmab.py
index efc5587..acfbb2a 100644
--- a/pybandits/cmab.py
+++ b/pybandits/cmab.py
@@ -227,7 +227,11 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulli":
-        return cls(actions=state["actions"])
+        return cls(
+            actions=state["actions"],
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call(config=dict(arbitrary_types_allowed=True))
     def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[BinaryReward]):
@@ -271,7 +275,12 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulliBAI":
-        return cls(actions=state["actions"], exploit_p=state["strategy"].get("exploit_p", None))
+        return cls(
+            actions=state["actions"],
+            exploit_p=state["strategy"].get("exploit_p", None),
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call(config=dict(arbitrary_types_allowed=True))
     def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[BinaryReward]):
@@ -324,7 +333,12 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulliCC":
-        return cls(actions=state["actions"], subsidy_factor=state["strategy"].get("subsidy_factor", None))
+        return cls(
+            actions=state["actions"],
+            subsidy_factor=state["strategy"].get("subsidy_factor", None),
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call(config=dict(arbitrary_types_allowed=True))
     def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[BinaryReward]):
diff --git a/pybandits/smab.py b/pybandits/smab.py
index 65e4bb1..a3ee363 100644
--- a/pybandits/smab.py
+++ b/pybandits/smab.py
@@ -150,7 +150,11 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "SmabBernoulli":
-        return cls(actions=state["actions"])
+        return cls(
+            actions=state["actions"],
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call
     def update(self, actions: List[ActionId], rewards: List[BinaryReward]):
@@ -187,7 +191,12 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "SmabBernoulliBAI":
-        return cls(actions=state["actions"], exploit_p=state["strategy"].get("exploit_p", None))
+        return cls(
+            actions=state["actions"],
+            exploit_p=state["strategy"].get("exploit_p", None),
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call
     def update(self, actions: List[ActionId], rewards: List[BinaryReward]):
@@ -232,7 +241,12 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "SmabBernoulliCC":
-        return cls(actions=state["actions"], subsidy_factor=state["strategy"].get("subsidy_factor", None))
+        return cls(
+            actions=state["actions"],
+            subsidy_factor=state["strategy"].get("subsidy_factor", None),
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
     @validate_call
     def update(self, actions: List[ActionId], rewards: List[BinaryReward]):
@@ -303,7 +317,11 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "SmabBernoulliMO":
-        return cls(actions=state["actions"])
+        return cls(
+            actions=state["actions"],
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
 
 class SmabBernoulliMOCC(BaseSmabBernoulliMO):
@@ -337,7 +355,11 @@ def __init__(
 
     @classmethod
     def from_state(cls, state: dict) -> "SmabBernoulliMOCC":
-        return cls(actions=state["actions"])
+        return cls(
+            actions=state["actions"],
+            epsilon=state.get("epsilon", None),
+            default_action=state.get("default_action", None),
+        )
 
 
 @validate_call
diff --git a/tests/test_cmab.py b/tests/test_cmab.py
index a992afa..2ff034f 100644
--- a/tests/test_cmab.py
+++ b/tests/test_cmab.py
@@ -19,7 +19,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-from typing import get_args
+from typing import Optional, get_args
 
 import numpy as np
 import pandas as pd
@@ -381,21 +381,26 @@ def run_predict(mab):
 
 
 @settings(deadline=500)
-@given(st.integers(min_value=1), st.integers(min_value=1), st.integers(min_value=2, max_value=100))
-def test_cmab_get_state(mu, sigma, n_features):
+@given(
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+    st.integers(min_value=2, max_value=100),
+    st.sampled_from([None, 0.1]),
+)
+def test_cmab_get_state(mu, sigma, n_features, epsilon):
     actions: dict = {
         "a1": BayesianLogisticRegression(alpha=StudentT(mu=mu, sigma=sigma), betas=n_features * [StudentT()]),
         "a2": create_bayesian_logistic_regression_cold_start(n_betas=n_features),
     }
 
-    cmab = CmabBernoulli(actions=actions)
+    cmab = CmabBernoulli(actions=actions, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {},
             "predict_with_proba": False,
             "predict_actions_randomly": False,
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -438,6 +443,7 @@ def test_cmab_get_state(mu, sigma, n_features):
                 min_size=2,
             ),
             "strategy": st.fixed_dictionaries({}),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     ),
     update_method=st.sampled_from(literal_update_methods),
@@ -613,21 +619,22 @@ def test_cmab_bai_update(n_samples, n_features, update_method):
     st.integers(min_value=1),
     st.integers(min_value=2, max_value=100),
     st.floats(min_value=0, max_value=1),
+    st.sampled_from([None, 0.1]),
 )
-def test_cmab_bai_get_state(mu, sigma, n_features, exploit_p: Float01):
+def test_cmab_bai_get_state(mu, sigma, n_features, exploit_p: Float01, epsilon: Optional[Float01]):
     actions: dict = {
         "a1": BayesianLogisticRegression(alpha=StudentT(mu=mu, sigma=sigma), betas=n_features * [StudentT()]),
         "a2": create_bayesian_logistic_regression_cold_start(n_betas=n_features),
     }
 
-    cmab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p)
+    cmab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {"exploit_p": exploit_p},
             "predict_with_proba": False,
             "predict_actions_randomly": False,
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -674,6 +681,7 @@ def test_cmab_bai_get_state(mu, sigma, n_features, exploit_p: Float01):
                 st.just({"exploit_p": None}),
                 st.builds(lambda x: {"exploit_p": x}, st.floats(min_value=0, max_value=1)),
             ),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     ),
     update_method=st.sampled_from(literal_update_methods),
@@ -864,9 +872,16 @@ def test_cmab_cc_update(n_samples, n_features, update_method):
     st.floats(min_value=0),
     st.floats(min_value=0),
     st.floats(min_value=0, max_value=1),
+    st.sampled_from([None, 0.1]),
 )
 def test_cmab_cc_get_state(
-    mu, sigma, n_features, cost_1: NonNegativeFloat, cost_2: NonNegativeFloat, subsidy_factor: Float01
+    mu,
+    sigma,
+    n_features,
+    cost_1: NonNegativeFloat,
+    cost_2: NonNegativeFloat,
+    subsidy_factor: Float01,
+    epsilon: Optional[Float01],
 ):
     actions: dict = {
         "a1": BayesianLogisticRegressionCC(
@@ -875,14 +890,14 @@ def test_cmab_cc_get_state(
         "a2": create_bayesian_logistic_regression_cc_cold_start(n_betas=n_features, cost=cost_2),
     }
 
-    cmab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor)
+    cmab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {"subsidy_factor": subsidy_factor},
             "predict_with_proba": True,
             "predict_actions_randomly": False,
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -930,6 +945,7 @@ def test_cmab_cc_get_state(
                 st.just({"subsidy_factor": None}),
                 st.builds(lambda x: {"subsidy_factor": x}, st.floats(min_value=0, max_value=1)),
             ),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     ),
     update_method=st.sampled_from(literal_update_methods),
diff --git a/tests/test_smab.py b/tests/test_smab.py
index 369d016..06136d3 100644
--- a/tests/test_smab.py
+++ b/tests/test_smab.py
@@ -21,7 +21,7 @@
 # SOFTWARE.
 import json
 from copy import deepcopy
-from typing import List
+from typing import List, Optional
 
 import pytest
 from hypothesis import given
@@ -203,16 +203,22 @@ def test_smab_accepts_only_valid_actions(s):
         SmabBernoulli(actions={s: Beta(), s + "_": Beta()})
 
 
-@given(st.integers(min_value=1), st.integers(min_value=1), st.integers(min_value=1), st.integers(min_value=1))
-def test_smab_get_state(a, b, c, d):
+@given(
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+    st.integers(min_value=1),
+    st.sampled_from([None, 0.1]),
+)
+def test_smab_get_state(a, b, c, d, epsilon):
     actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)}
-    smab = SmabBernoulli(actions=actions)
+    smab = SmabBernoulli(actions=actions, epsilon=epsilon)
 
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {},
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -236,6 +242,7 @@ def test_smab_get_state(a, b, c, d):
                 min_size=2,
             ),
             "strategy": st.fixed_dictionaries({}),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     )
 )
@@ -324,15 +331,16 @@ def test_smabbai_with_betacc():
     st.integers(min_value=1),
     st.integers(min_value=1),
     st.floats(min_value=0, max_value=1),
+    st.sampled_from([None, 0.1]),
 )
-def test_smab_bai_get_state(a, b, c, d, exploit_p: Float01):
+def test_smab_bai_get_state(a, b, c, d, exploit_p: Float01, epsilon: Optional[Float01]):
     actions = {"action1": Beta(n_successes=a, n_failures=b), "action2": Beta(n_successes=c, n_failures=d)}
-    smab = SmabBernoulliBAI(actions=actions, exploit_p=exploit_p)
+    smab = SmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {"exploit_p": exploit_p},
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -362,6 +370,7 @@ def test_smab_bai_get_state(a, b, c, d, exploit_p: Float01):
                 st.just({"exploit_p": None}),
                 st.builds(lambda x: {"exploit_p": x}, st.floats(min_value=0, max_value=1)),
             ),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     )
 )
@@ -454,20 +463,23 @@ def test_smabcc_update():
     st.floats(min_value=0),
     st.floats(min_value=0),
     st.floats(min_value=0, max_value=1),
+    st.sampled_from([None, 0.1]),
 )
-def test_smab_cc_get_state(a, b, c, d, cost1: NonNegativeFloat, cost2: NonNegativeFloat, subsidy_factor: Float01):
+def test_smab_cc_get_state(
+    a, b, c, d, cost1: NonNegativeFloat, cost2: NonNegativeFloat, subsidy_factor: Float01, epsilon: Optional[Float01]
+):
     actions = {
         "action1": BetaCC(n_successes=a, n_failures=b, cost=cost1),
         "action2": BetaCC(n_successes=c, n_failures=d, cost=cost2),
     }
-    smab = SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor)
+    smab = SmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {
                 "subsidy_factor": subsidy_factor,
             },
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -498,6 +510,7 @@ def test_smab_cc_get_state(a, b, c, d, cost1: NonNegativeFloat, cost2: NonNegati
                 st.just({"subsidy_factor": None}),
                 st.builds(lambda x: {"subsidy_factor": x}, st.floats(min_value=0, max_value=1)),
             ),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     )
 )
@@ -606,8 +619,8 @@ def test_smab_mo_update():
     mab.update(actions=["a1", "a1"], rewards=[[1, 0, 1], [1, 1, 0]])
 
 
-@given(st.lists(st.integers(min_value=1), min_size=6, max_size=6))
-def test_smab_mo_get_state(a_list):
+@given(st.lists(st.integers(min_value=1), min_size=6, max_size=6), st.sampled_from([None, 0.1]))
+def test_smab_mo_get_state(a_list, epsilon):
     a, b, c, d, e, f = a_list
 
     actions = {
@@ -626,12 +639,12 @@ def test_smab_mo_get_state(a_list):
             ]
         ),
     }
-    smab = SmabBernoulliMO(actions=actions)
+    smab = SmabBernoulliMO(actions=actions, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {},
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -665,6 +678,7 @@ def test_smab_mo_get_state(a_list):
                 min_size=2,
             ),
             "strategy": st.fixed_dictionaries({}),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     )
 )
@@ -767,8 +781,8 @@ def test_smab_mo_cc_predict():
         s.predict(n_samples=n_samples, forbidden_actions=forbidden)
 
 
-@given(st.lists(st.integers(min_value=1), min_size=8, max_size=8))
-def test_smab_mocc_get_state(a_list):
+@given(st.lists(st.integers(min_value=1), min_size=8, max_size=8), st.sampled_from([None, 0.1]))
+def test_smab_mo_cc_get_state(a_list, epsilon):
     a, b, c, d, e, f, g, h = a_list
 
     actions = {
@@ -789,12 +803,12 @@ def test_smab_mocc_get_state(a_list):
             cost=h,
         ),
     }
-    smab = SmabBernoulliMOCC(actions=actions)
+    smab = SmabBernoulliMOCC(actions=actions, epsilon=epsilon)
     expected_state = to_serializable_dict(
         {
             "actions": actions,
             "strategy": {},
-            "epsilon": None,
+            "epsilon": epsilon,
             "default_action": None,
         }
     )
@@ -829,6 +843,7 @@ def test_smab_mocc_get_state(a_list):
                 min_size=2,
             ),
             "strategy": st.fixed_dictionaries({}),
+            "epsilon": st.sampled_from([None, 0.1]),
         }
     )
 )