From eb2eebe1cb3234c5288fb75aca7b9beb137c0695 Mon Sep 17 00:00:00 2001
From: Shahar Bar <33932594+shaharbar1@users.noreply.github.com>
Date: Thu, 5 Sep 2024 15:28:40 +0300
Subject: [PATCH] Add offline policy evaluation module and update dependencies
### Changes
* Introduced `offline_policy_evaluator.py` with classes for propensity score estimation and offline policy evaluation.
* Introduced `offline_policy_estimator.py` with classes for offline policy estimation.
* Updated `pyproject.toml` to include new dependencies: `bokeh` and `optuna`. Also adjusted existing dependencies to compatible versions and added Python 3.12 support.
* Changed `.pre-commit-config.yaml` to use `nbstripout` instead of `nbdev_clean`.
* Added caching of dependencies on CI and CD.
* Added a class method to `PyBanditsBaseModel` in `base.py` that exposes the default values of arguments that were not passed to the model.
* Added test_offline_policy_evaluator.py and test_offline_policy_estimator.py as a test suite for the OfflinePolicyEvaluator.
* Added `get_non_abstract_classes`, `visualize_via_bokeh` and `in_jupyter_notebook` utility functions.
---
.github/workflows/continuous_delivery.yml | 15 +-
.github/workflows/continuous_integration.yml | 19 +-
.pre-commit-config.yaml | 6 +-
docs/src/tutorials/cmab.ipynb | 79 +-
docs/src/tutorials/simulation_cmab.ipynb | 299 +----
docs/src/tutorials/simulation_smab.ipynb | 38 +-
docs/src/tutorials/smab.ipynb | 36 +-
docs/tutorials/cmab.ipynb | 79 +-
docs/tutorials/mab.ipynb | 337 +-----
docs/tutorials/simulation_cmab.ipynb | 299 +----
docs/tutorials/simulation_smab.ipynb | 38 +-
docs/tutorials/smab.ipynb | 36 +-
docs/tutorials/smab_mo_cc.ipynb | 409 +------
pybandits/offline_policy_estimator.py | 807 +++++++++++++
pybandits/offline_policy_evaluator.py | 1127 ++++++++++++++++++
pybandits/pydantic_version_compatibility.py | 17 +-
pybandits/strategy.py | 8 +-
pybandits/utils.py | 80 +-
pyproject.toml | 35 +-
tests/test_cmab.py | 8 +-
tests/test_offline_policy_estimator.py | 162 +++
tests/test_offline_policy_evaluator.py | 300 +++++
22 files changed, 2741 insertions(+), 1493 deletions(-)
create mode 100644 pybandits/offline_policy_estimator.py
create mode 100644 pybandits/offline_policy_evaluator.py
create mode 100644 tests/test_offline_policy_estimator.py
create mode 100644 tests/test_offline_policy_evaluator.py
diff --git a/.github/workflows/continuous_delivery.yml b/.github/workflows/continuous_delivery.yml
index ab2679e..4059d01 100644
--- a/.github/workflows/continuous_delivery.yml
+++ b/.github/workflows/continuous_delivery.yml
@@ -29,10 +29,23 @@ jobs:
export PATH="$HOME/.poetry/bin:$PATH"
- name: Backup pyproject.toml
run: cp pyproject.toml pyproject.toml.bak
+ - name: Change pydantic version
+ run: |
+ poetry add pydantic@${{ matrix.pydantic-version }} --lock
+ - name: Cache Poetry virtualenv and dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cache/pypoetry
+ ~/.local/share/pypoetry/virtualenvs
+ key: ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-${{ hashFiles('poetry.lock') }}
+ restore-keys: |
+ ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-
- name: Install project dependencies with Poetry
run: |
- poetry add pydantic@${{ matrix.pydantic-version }}
poetry install
+ - name: Restore pyproject.toml
+ run: |
mv pyproject.toml.bak pyproject.toml
- name: Style check
run: |
diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 2853deb..3b9de13 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [ "3.8", "3.9", "3.10" ]
+ python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
pydantic-version: [ "1.10.*", "2.*" ]
steps:
@@ -35,9 +35,20 @@ jobs:
run: |
curl -sSL https://install.python-poetry.org | python3 -
export PATH="$HOME/.poetry/bin:$PATH"
+ - name: Change pydantic version
+ run: |
+ poetry add pydantic@${{ matrix.pydantic-version }} --lock
+ - name: Cache Poetry virtualenv and dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cache/pypoetry
+ ~/.local/share/pypoetry/virtualenvs
+ key: ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-${{ hashFiles('poetry.lock') }}
+ restore-keys: |
+ ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-
- name: Install project dependencies with Poetry
run: |
- poetry add pydantic@${{ matrix.pydantic-version }}
poetry install
- name: Style check
run: |
@@ -45,4 +56,8 @@ jobs:
poetry run pre-commit run --all-files
- name: Run tests
run: |
+ START_TIME=$(date +%s)
poetry run pytest -vv -k 'not time and not update_parallel'
+ END_TIME=$(date +%s)
+ DURATION=$((END_TIME - START_TIME))
+ echo "Tests completed in $DURATION seconds."
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c185f1b..d0b0d5f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
types_or: [ python, pyi, jupyter ]
require_serial: true
- - repo: https://github.com/fastai/nbdev
- rev: 2.3.11
+ - repo: https://github.com/kynan/nbstripout
+ rev: 0.7.1
hooks:
- - id: nbdev_clean
+ - id: nbstripout
diff --git a/docs/src/tutorials/cmab.ipynb b/docs/src/tutorials/cmab.ipynb
index 07b74fa..cb83a7e 100644
--- a/docs/src/tutorials/cmab.ipynb
+++ b/docs/src/tutorials/cmab.ipynb
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
@@ -56,31 +56,13 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "X: context matrix of shape (n_samples, n_features)\n",
- "[[-0.53211475 -0.40592956 0.05892565 -0.88067628 -0.84061481]\n",
- " [-0.95680954 -0.00540581 0.09148556 -0.82021004 -0.63425381]\n",
- " [-0.87792928 -0.51881823 -0.51767022 -0.05385187 -0.64499044]\n",
- " [-0.10569516 0.30847784 -0.353929 -0.94831998 -0.52175713]\n",
- " [-0.05088401 0.17155683 -0.4322128 -0.07509104 -0.78919832]\n",
- " [-0.88604157 0.55037109 0.42634479 -0.87179776 -0.69767766]\n",
- " [-0.0022063 0.99304089 0.76398198 -0.87343131 -0.12363411]\n",
- " [ 0.36371019 0.6660538 0.17177652 -0.08891719 -0.91070485]\n",
- " [-0.1056742 -0.72879406 -0.69367421 -0.8684397 0.70903817]\n",
- " [-0.15422305 0.31069811 -0.47487951 0.00853137 0.23793364]]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# context\n",
"n_samples = 1000\n",
@@ -92,7 +74,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -109,7 +91,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -126,18 +108,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Recommended action: ['action C' 'action C' 'action B' 'action B' 'action C' 'action C'\n",
- " 'action B' 'action C' 'action B' 'action C']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# predict action\n",
"pred_actions, _ = cmab.predict(X)\n",
@@ -153,17 +126,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulated rewards: [1 0 0 0 0 0 0 0 1 1]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# simulate reward from environment\n",
"simulated_rewards = np.random.randint(2, size=n_samples)\n",
@@ -179,31 +144,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# update model\n",
"cmab.update(X, actions=pred_actions, rewards=simulated_rewards)"
diff --git a/docs/src/tutorials/simulation_cmab.ipynb b/docs/src/tutorials/simulation_cmab.ipynb
index 1ce2423..5a972ee 100644
--- a/docs/src/tutorials/simulation_cmab.ipynb
+++ b/docs/src/tutorials/simulation_cmab.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -61,7 +61,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -80,77 +80,9 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Probability of positive reward for each group/action:\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " action A | \n",
- " action B | \n",
- " action C | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0.05 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- " 0.05 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- " 0.80 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " action A action B action C\n",
- "0 0.05 0.80 0.05\n",
- "1 0.80 0.05 0.05\n",
- "2 0.80 0.05 0.80"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# init probability of rewards from the environment\n",
"prob_rewards = pd.DataFrame(\n",
@@ -171,7 +103,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -181,24 +113,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Setup simulation completed.\n",
- "Simulated input probability rewards:\n",
- " action A action B action C\n",
- "group \n",
- "0 0.041176 0.835294 0.052941\n",
- "1 0.819277 0.036145 0.054217\n",
- "2 0.786585 0.042683 0.817073 \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# init simulation\n",
"sim = SimulationCmab(\n",
@@ -222,205 +139,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #1\n",
- "Start predict batch 1 ...\n",
- "Start update batch 1 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 11 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 10 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #2\n",
- "Start predict batch 2 ...\n",
- "Start update batch 2 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #3\n",
- "Start predict batch 3 ...\n",
- "Start update batch 3 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #4\n",
- "Start predict batch 4 ...\n",
- "Start update batch 4 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #5\n",
- "Start predict batch 5 ...\n",
- "Start update batch 5 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulation results (first 10 observations):\n",
- " action reward group selected_prob_reward max_prob_reward regret \\\n",
- "0 action C 0.0 1 0.05 0.8 0.75 \n",
- "1 action C 1.0 2 0.80 0.8 0.00 \n",
- "2 action B 1.0 0 0.80 0.8 0.00 \n",
- "3 action C 0.0 1 0.05 0.8 0.75 \n",
- "4 action C 0.0 1 0.05 0.8 0.75 \n",
- "5 action B 1.0 0 0.80 0.8 0.00 \n",
- "6 action A 0.0 0 0.05 0.8 0.75 \n",
- "7 action C 0.0 2 0.80 0.8 0.00 \n",
- "8 action C 0.0 1 0.05 0.8 0.75 \n",
- "9 action C 1.0 2 0.80 0.8 0.00 \n",
- "\n",
- " cum_regret \n",
- "0 0.75 \n",
- "1 0.75 \n",
- "2 0.75 \n",
- "3 1.50 \n",
- "4 2.25 \n",
- "5 2.25 \n",
- "6 3.00 \n",
- "7 3.00 \n",
- "8 3.75 \n",
- "9 3.75 \n",
- "\n",
- "Count of actions selected by the bandit: \n",
- " {'group 0': {'action B': 85, 'action A': 53, 'action C': 32}, 'group 1': {'action A': 109, 'action C': 31, 'action B': 26}, 'group 2': {'action A': 70, 'action C': 59, 'action B': 35}} \n",
- "\n",
- "Observed proportion of positive rewards for each action:\n",
- " {'group 0': {'action B': 0.788235294117647, 'action A': 0.03773584905660377, 'action C': 0.03125}, 'group 1': {'action A': 0.7981651376146789, 'action B': 0.07692307692307693, 'action C': 0.03225806451612903}, 'group 2': {'action A': 0.7142857142857143, 'action C': 0.8305084745762712, 'action B': 0.02857142857142857}} \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"sim.run()"
]
diff --git a/docs/src/tutorials/simulation_smab.ipynb b/docs/src/tutorials/simulation_smab.ipynb
index df7d1e8..e15d57f 100644
--- a/docs/src/tutorials/simulation_smab.ipynb
+++ b/docs/src/tutorials/simulation_smab.ipynb
@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -64,7 +64,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -95,35 +95,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulation results (first 10 observations):\n",
- " action reward\n",
- "0 Action B 0.0\n",
- "1 Action C 1.0\n",
- "2 Action C 0.0\n",
- "3 Action A 1.0\n",
- "4 Action B 1.0\n",
- "5 Action C 1.0\n",
- "6 Action A 1.0\n",
- "7 Action A 1.0\n",
- "8 Action B 0.0\n",
- "9 Action B 0.0 \n",
- "\n",
- "Count of actions selected by the bandit: \n",
- " {'Action C': 38670, 'Action B': 683, 'Action A': 647} \n",
- "\n",
- "Observed proportion of positive rewards for each action:\n",
- " {'Action A': 0.6120556414219475, 'Action B': 0.4978038067349927, 'Action C': 0.7995603827256271} \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# run simulation\n",
"sim.run()"
diff --git a/docs/src/tutorials/smab.ipynb b/docs/src/tutorials/smab.ipynb
index ed119e1..c4bc60c 100644
--- a/docs/src/tutorials/smab.ipynb
+++ b/docs/src/tutorials/smab.ipynb
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -50,7 +50,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
"tags": []
},
@@ -62,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -81,7 +81,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -98,17 +98,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Recommended action: ['Action C', 'Action C', 'Action C', 'Action B', 'Action B', 'Action C', 'Action B', 'Action C', 'Action A', 'Action B']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# predict actions\n",
"pred_actions, _ = smab.predict(n_samples=1000)\n",
@@ -124,19 +116,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Action A: n_successes=285, n_failures=31\n",
- "Action B: n_successes=123, n_failures=210\n",
- "Action C: n_successes=261, n_failures=90\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# simulate rewards from environment\n",
"n_successes, n_failures = {}, {}\n",
@@ -155,7 +137,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
diff --git a/docs/tutorials/cmab.ipynb b/docs/tutorials/cmab.ipynb
index 07b74fa..cb83a7e 100644
--- a/docs/tutorials/cmab.ipynb
+++ b/docs/tutorials/cmab.ipynb
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
@@ -56,31 +56,13 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "X: context matrix of shape (n_samples, n_features)\n",
- "[[-0.53211475 -0.40592956 0.05892565 -0.88067628 -0.84061481]\n",
- " [-0.95680954 -0.00540581 0.09148556 -0.82021004 -0.63425381]\n",
- " [-0.87792928 -0.51881823 -0.51767022 -0.05385187 -0.64499044]\n",
- " [-0.10569516 0.30847784 -0.353929 -0.94831998 -0.52175713]\n",
- " [-0.05088401 0.17155683 -0.4322128 -0.07509104 -0.78919832]\n",
- " [-0.88604157 0.55037109 0.42634479 -0.87179776 -0.69767766]\n",
- " [-0.0022063 0.99304089 0.76398198 -0.87343131 -0.12363411]\n",
- " [ 0.36371019 0.6660538 0.17177652 -0.08891719 -0.91070485]\n",
- " [-0.1056742 -0.72879406 -0.69367421 -0.8684397 0.70903817]\n",
- " [-0.15422305 0.31069811 -0.47487951 0.00853137 0.23793364]]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# context\n",
"n_samples = 1000\n",
@@ -92,7 +74,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -109,7 +91,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -126,18 +108,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Recommended action: ['action C' 'action C' 'action B' 'action B' 'action C' 'action C'\n",
- " 'action B' 'action C' 'action B' 'action C']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# predict action\n",
"pred_actions, _ = cmab.predict(X)\n",
@@ -153,17 +126,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulated rewards: [1 0 0 0 0 0 0 0 1 1]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# simulate reward from environment\n",
"simulated_rewards = np.random.randint(2, size=n_samples)\n",
@@ -179,31 +144,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# update model\n",
"cmab.update(X, actions=pred_actions, rewards=simulated_rewards)"
diff --git a/docs/tutorials/mab.ipynb b/docs/tutorials/mab.ipynb
index 22c5666..d139501 100644
--- a/docs/tutorials/mab.ipynb
+++ b/docs/tutorials/mab.ipynb
@@ -3,7 +3,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "e2595bf3-9767-4338-9a51-ce706dc306cf",
+ "id": "0",
"metadata": {},
"source": [
"# Stochastic Bernoulli Bandit"
@@ -11,8 +11,8 @@
},
{
"cell_type": "code",
- "execution_count": 1,
- "id": "8f8462e5-f38e-4b04-9002-07ababe3ee0c",
+ "execution_count": null,
+ "id": "1",
"metadata": {},
"outputs": [],
"source": [
@@ -25,21 +25,10 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "75d6f625",
+ "execution_count": null,
+ "id": "2",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'%.2f'"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# print 2 decimal places in the notebook\n",
"%precision %.2f"
@@ -48,7 +37,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "b6b37329-6a3b-4f2a-87a5-e0dcbbb1bb69",
+ "id": "3",
"metadata": {},
"source": [
"## 1. Initialization\n",
@@ -58,7 +47,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "2ca215bf-6321-4819-a539-ebf1f378436a",
+ "id": "4",
"metadata": {},
"source": [
"### 1.1 Initialize via class constructor\n",
@@ -68,8 +57,8 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "701111ff-b659-49b7-8cf5-8349536b4cd8",
+ "execution_count": null,
+ "id": "5",
"metadata": {},
"outputs": [],
"source": [
@@ -84,38 +73,10 @@
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "55112a02-8df2-4895-9414-ddabbfc8ecac",
+ "execution_count": null,
+ "id": "6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulli(\n",
- " actions={\n",
- " 'a1': Beta(n_successes=1, n_failures=1),\n",
- " 'a2': Beta(n_successes=1, n_failures=1),\n",
- " 'a3': Beta(n_successes=1, n_failures=1)\n",
- " },\n",
- " strategy=ClassicBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
@@ -123,7 +84,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "f2ee7bdc-3881-47a5-b7d4-84862f70e643",
+ "id": "7",
"metadata": {},
"source": [
"### 1.2 Initialize via utility function (for cold start)"
@@ -132,7 +93,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "564914fd-73cc-4854-8ec7-548970f794a6",
+ "id": "8",
"metadata": {},
"source": [
"You can initialize the bandit via the utility function `SmabBernoulliMOCC.cold_start()`. This is particulary useful in a cold start setting when there is no prior knowledge on the Beta distruibutions. In this case for all Betas `n_successes` and `n_failures` are set to `1`."
@@ -140,8 +101,8 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "dbfb0ddd-4c16-441f-8c68-16020e425d57",
+ "execution_count": null,
+ "id": "9",
"metadata": {},
"outputs": [],
"source": [
@@ -151,38 +112,10 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "fcc3649c-d08c-46db-a534-f61d97962c99",
+ "execution_count": null,
+ "id": "10",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulli(\n",
- " actions={\n",
- " 'a1': Beta(n_successes=1, n_failures=1),\n",
- " 'a3': Beta(n_successes=1, n_failures=1),\n",
- " 'a2': Beta(n_successes=1, n_failures=1)\n",
- " },\n",
- " strategy=ClassicBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
@@ -190,7 +123,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "aa91a5ed-83cc-4016-aa3e-17b8a102bb77",
+ "id": "11",
"metadata": {},
"source": [
"## 2. Function `predict()`"
@@ -198,46 +131,18 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "a735c03d-cde4-4147-a50d-4b82dd9c1792",
+ "execution_count": null,
+ "id": "12",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Help on method predict in module pybandits.smab:\n",
- "\n",
- "predict(n_samples: pydantic.types.PositiveInt = 1, forbidden_actions: Optional[Set[pybandits.base.ActionId]] = None) -> Tuple[List[pybandits.base.ActionId], List[Dict[pybandits.base.ActionId, pybandits.base.Probability]]] method of pybandits.smab.SmabBernoulli instance\n",
- " Predict actions.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " n_samples : int > 0, default=1\n",
- " Number of samples to predict.\n",
- " forbidden_actions : Optional[Set[ActionId]], default=None\n",
- " Set of forbidden actions. If specified, the model will discard the forbidden_actions and it will only\n",
- " consider the remaining allowed_actions. By default, the model considers all actions as allowed_actions.\n",
- " Note that: actions = allowed_actions U forbidden_actions.\n",
- " \n",
- " Returns\n",
- " -------\n",
- " actions: List[ActionId] of shape (n_samples,)\n",
- " The actions selected by the multi-armed bandit model.\n",
- " probs: List[Dict[ActionId, Probability]] of shape (n_samples,)\n",
- " The probabilities of getting a positive reward for each action.\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"help(mab.predict)"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "f3d9cb8b-7d9b-437b-bbc2-e7a55475a1fb",
+ "execution_count": null,
+ "id": "13",
"metadata": {},
"outputs": [],
"source": [
@@ -247,54 +152,28 @@
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "a9284b11-05ba-4cda-9597-b69e6d7632a3",
+ "execution_count": null,
+ "id": "14",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['a3', 'a1', 'a3', 'a1', 'a3']"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"actions"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "84cdbed4-9aa5-42e1-84db-1f8f72c52d93",
+ "execution_count": null,
+ "id": "15",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'a1': 0.68, 'a3': 0.77, 'a2': 0.51},\n",
- " {'a1': 0.85, 'a3': 0.18, 'a2': 0.82},\n",
- " {'a1': 0.68, 'a3': 0.82, 'a2': 0.42},\n",
- " {'a1': 0.98, 'a3': 0.72, 'a2': 0.22},\n",
- " {'a1': 0.72, 'a3': 0.83, 'a2': 0.13}]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"probs"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "bfc53fc8-b1bf-42ea-907a-fa5fb7173199",
+ "execution_count": null,
+ "id": "16",
"metadata": {},
"outputs": [],
"source": [
@@ -304,46 +183,20 @@
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "696d58f4-ca5f-41d4-983f-bc7a5351ab28",
+ "execution_count": null,
+ "id": "17",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['a2', 'a2', 'a2', 'a3', 'a2']"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"actions"
]
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "f5826785-a5c6-4c06-9bab-9f05134e783e",
+ "execution_count": null,
+ "id": "18",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'a3': 0.71, 'a2': 0.86},\n",
- " {'a3': 0.51, 'a2': 0.55},\n",
- " {'a3': 0.42, 'a2': 0.87},\n",
- " {'a3': 0.89, 'a2': 0.52},\n",
- " {'a3': 0.41, 'a2': 0.42}]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"probs"
]
@@ -351,7 +204,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "d89f7199-bec3-407d-92a9-bdf917c13de6",
+ "id": "19",
"metadata": {},
"source": [
"## 3. Function `update()`"
@@ -359,42 +212,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "140eb2fc-3659-4c13-86d1-ec5a575c79c1",
+ "execution_count": null,
+ "id": "20",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Help on method update in module pybandits.smab:\n",
- "\n",
- "update(actions: List[pybandits.base.ActionId], rewards: List[pybandits.base.BinaryReward]) method of pybandits.smab.SmabBernoulli instance\n",
- " Update the stochastic Bernoulli bandit given the list of selected actions and their corresponding binary\n",
- " rewards.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " actions : List[ActionId] of shape (n_samples,), e.g. ['a1', 'a2', 'a3', 'a4', 'a5']\n",
- " The selected action for each sample.\n",
- " rewards : List[Union[BinaryReward, List[BinaryReward]]] of shape (n_samples, n_objectives)\n",
- " The binary reward for each sample.\n",
- " If strategy is not MultiObjectiveBandit, rewards should be a list, e.g.\n",
- " rewards = [1, 0, 1, 1, 1, ...]\n",
- " If strategy is MultiObjectiveBandit, rewards should be a list of list, e.g. (with n_objectives=2):\n",
- " rewards = [[1, 1], [1, 0], [1, 1], [1, 0], [1, 1], ...]\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"help(mab.update)"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "2526ed6d-82d4-4485-bc6e-b5cb53dd78a5",
+ "execution_count": null,
+ "id": "21",
"metadata": {},
"outputs": [],
"source": [
@@ -404,38 +233,10 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "3bd0ab45-94e8-415b-adea-a089c54f6274",
+ "execution_count": null,
+ "id": "22",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulli(\n",
- " actions={\n",
- " 'a1': Beta(n_successes=1, n_failures=1),\n",
- " 'a3': Beta(n_successes=2, n_failures=1),\n",
- " 'a2': Beta(n_successes=3, n_failures=3)\n",
- " },\n",
- " strategy=ClassicBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"# update\n",
"mab.update(actions=actions, rewards=rewards)\n",
@@ -445,7 +246,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "9823d84c-862b-4bb6-ab36-024f34460595",
+ "id": "23",
"metadata": {},
"source": [
"## 4. Example of usage\n",
@@ -455,8 +256,8 @@
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "a785463d-d710-4844-80bf-42c09b0e0b45",
+ "execution_count": null,
+ "id": "24",
"metadata": {},
"outputs": [],
"source": [
@@ -476,38 +277,10 @@
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "034add3d-e6f3-471c-b8b9-30c286faf2cc",
+ "execution_count": null,
+ "id": "25",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulli(\n",
- " actions={\n",
- " 'a1': Beta(n_successes=337, n_failures=369),\n",
- " 'a3': Beta(n_successes=4448, n_failures=4315),\n",
- " 'a2': Beta(n_successes=246, n_failures=296)\n",
- " },\n",
- " strategy=ClassicBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m337\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m369\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m4448\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m4315\u001b[0m\u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m246\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m296\u001b[0m\u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
diff --git a/docs/tutorials/simulation_cmab.ipynb b/docs/tutorials/simulation_cmab.ipynb
index 1ce2423..5a972ee 100644
--- a/docs/tutorials/simulation_cmab.ipynb
+++ b/docs/tutorials/simulation_cmab.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -61,7 +61,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -80,77 +80,9 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Probability of positive reward for each group/action:\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " action A | \n",
- " action B | \n",
- " action C | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0.05 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- " 0.05 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0.80 | \n",
- " 0.05 | \n",
- " 0.80 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " action A action B action C\n",
- "0 0.05 0.80 0.05\n",
- "1 0.80 0.05 0.05\n",
- "2 0.80 0.05 0.80"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# init probability of rewards from the environment\n",
"prob_rewards = pd.DataFrame(\n",
@@ -171,7 +103,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -181,24 +113,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Setup simulation completed.\n",
- "Simulated input probability rewards:\n",
- " action A action B action C\n",
- "group \n",
- "0 0.041176 0.835294 0.052941\n",
- "1 0.819277 0.036145 0.054217\n",
- "2 0.786585 0.042683 0.817073 \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# init simulation\n",
"sim = SimulationCmab(\n",
@@ -222,205 +139,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #1\n",
- "Start predict batch 1 ...\n",
- "Start update batch 1 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 11 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 10 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #2\n",
- "Start predict batch 2 ...\n",
- "Start update batch 2 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #3\n",
- "Start predict batch 3 ...\n",
- "Start update batch 3 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n",
- "The number of effective samples is smaller than 25% for some parameters.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #4\n",
- "Start predict batch 4 ...\n",
- "Start update batch 4 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Iteration #5\n",
- "Start predict batch 5 ...\n",
- "Start update batch 5 ... \n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n",
- "Auto-assigning NUTS sampler...\n",
- "Initializing NUTS using adapt_diag...\n",
- "Sequential sampling (2 chains in 1 job)\n",
- "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n",
- "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulation results (first 10 observations):\n",
- " action reward group selected_prob_reward max_prob_reward regret \\\n",
- "0 action C 0.0 1 0.05 0.8 0.75 \n",
- "1 action C 1.0 2 0.80 0.8 0.00 \n",
- "2 action B 1.0 0 0.80 0.8 0.00 \n",
- "3 action C 0.0 1 0.05 0.8 0.75 \n",
- "4 action C 0.0 1 0.05 0.8 0.75 \n",
- "5 action B 1.0 0 0.80 0.8 0.00 \n",
- "6 action A 0.0 0 0.05 0.8 0.75 \n",
- "7 action C 0.0 2 0.80 0.8 0.00 \n",
- "8 action C 0.0 1 0.05 0.8 0.75 \n",
- "9 action C 1.0 2 0.80 0.8 0.00 \n",
- "\n",
- " cum_regret \n",
- "0 0.75 \n",
- "1 0.75 \n",
- "2 0.75 \n",
- "3 1.50 \n",
- "4 2.25 \n",
- "5 2.25 \n",
- "6 3.00 \n",
- "7 3.00 \n",
- "8 3.75 \n",
- "9 3.75 \n",
- "\n",
- "Count of actions selected by the bandit: \n",
- " {'group 0': {'action B': 85, 'action A': 53, 'action C': 32}, 'group 1': {'action A': 109, 'action C': 31, 'action B': 26}, 'group 2': {'action A': 70, 'action C': 59, 'action B': 35}} \n",
- "\n",
- "Observed proportion of positive rewards for each action:\n",
- " {'group 0': {'action B': 0.788235294117647, 'action A': 0.03773584905660377, 'action C': 0.03125}, 'group 1': {'action A': 0.7981651376146789, 'action B': 0.07692307692307693, 'action C': 0.03225806451612903}, 'group 2': {'action A': 0.7142857142857143, 'action C': 0.8305084745762712, 'action B': 0.02857142857142857}} \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"sim.run()"
]
diff --git a/docs/tutorials/simulation_smab.ipynb b/docs/tutorials/simulation_smab.ipynb
index df7d1e8..e15d57f 100644
--- a/docs/tutorials/simulation_smab.ipynb
+++ b/docs/tutorials/simulation_smab.ipynb
@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -64,7 +64,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -95,35 +95,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Simulation results (first 10 observations):\n",
- " action reward\n",
- "0 Action B 0.0\n",
- "1 Action C 1.0\n",
- "2 Action C 0.0\n",
- "3 Action A 1.0\n",
- "4 Action B 1.0\n",
- "5 Action C 1.0\n",
- "6 Action A 1.0\n",
- "7 Action A 1.0\n",
- "8 Action B 0.0\n",
- "9 Action B 0.0 \n",
- "\n",
- "Count of actions selected by the bandit: \n",
- " {'Action C': 38670, 'Action B': 683, 'Action A': 647} \n",
- "\n",
- "Observed proportion of positive rewards for each action:\n",
- " {'Action A': 0.6120556414219475, 'Action B': 0.4978038067349927, 'Action C': 0.7995603827256271} \n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# run simulation\n",
"sim.run()"
diff --git a/docs/tutorials/smab.ipynb b/docs/tutorials/smab.ipynb
index ed119e1..c4bc60c 100644
--- a/docs/tutorials/smab.ipynb
+++ b/docs/tutorials/smab.ipynb
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -50,7 +50,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
"tags": []
},
@@ -62,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -81,7 +81,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -98,17 +98,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Recommended action: ['Action C', 'Action C', 'Action C', 'Action B', 'Action B', 'Action C', 'Action B', 'Action C', 'Action A', 'Action B']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# predict actions\n",
"pred_actions, _ = smab.predict(n_samples=1000)\n",
@@ -124,19 +116,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Action A: n_successes=285, n_failures=31\n",
- "Action B: n_successes=123, n_failures=210\n",
- "Action C: n_successes=261, n_failures=90\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# simulate rewards from environment\n",
"n_successes, n_failures = {}, {}\n",
@@ -155,7 +137,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
diff --git a/docs/tutorials/smab_mo_cc.ipynb b/docs/tutorials/smab_mo_cc.ipynb
index 880654c..f92e04d 100644
--- a/docs/tutorials/smab_mo_cc.ipynb
+++ b/docs/tutorials/smab_mo_cc.ipynb
@@ -3,7 +3,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "e2595bf3-9767-4338-9a51-ce706dc306cf",
+ "id": "0",
"metadata": {},
"source": [
"# Stochastic Bernoulli Bandit (Multi-Objective with Cost-Control)"
@@ -11,8 +11,8 @@
},
{
"cell_type": "code",
- "execution_count": 1,
- "id": "8f8462e5-f38e-4b04-9002-07ababe3ee0c",
+ "execution_count": null,
+ "id": "1",
"metadata": {},
"outputs": [],
"source": [
@@ -25,21 +25,10 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "75d6f625",
+ "execution_count": null,
+ "id": "2",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'%.2f'"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# print 2 decimal places in the notebook\n",
"%precision %.2f"
@@ -47,7 +36,7 @@
},
{
"cell_type": "markdown",
- "id": "b6b37329-6a3b-4f2a-87a5-e0dcbbb1bb69",
+ "id": "3",
"metadata": {},
"source": [
"## 1. Initialization\n",
@@ -57,7 +46,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "2ca215bf-6321-4819-a539-ebf1f378436a",
+ "id": "4",
"metadata": {},
"source": [
"### 1.1 Initialize via class constructor\n",
@@ -67,8 +56,8 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "701111ff-b659-49b7-8cf5-8349536b4cd8",
+ "execution_count": null,
+ "id": "5",
"metadata": {},
"outputs": [],
"source": [
@@ -83,63 +72,17 @@
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "55112a02-8df2-4895-9414-ddabbfc8ecac",
+ "execution_count": null,
+ "id": "6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulliMOCC(\n",
- " actions={\n",
- " 'a1': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=30.0\n",
- " ),\n",
- " 'a2': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=10.0\n",
- " ),\n",
- " 'a3': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=20.0\n",
- " )\n",
- " },\n",
- " strategy=MultiObjectiveCostControlBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
},
{
"cell_type": "markdown",
- "id": "f2ee7bdc-3881-47a5-b7d4-84862f70e643",
+ "id": "7",
"metadata": {},
"source": [
"### 1.2 Initialize via utility function (for cold start)"
@@ -148,7 +91,7 @@
{
"attachments": {},
"cell_type": "markdown",
- "id": "564914fd-73cc-4854-8ec7-548970f794a6",
+ "id": "8",
"metadata": {},
"source": [
"You can initialize the bandit via the utility function `SmabBernoulliMOCC.cold_start()`. This is particulary useful in a cold start setting when there is no prior knowledge on the Beta distruibutions. In this case for all Betas `n_successes` and `n_failures` are set to `1`."
@@ -156,8 +99,8 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "dbfb0ddd-4c16-441f-8c68-16020e425d57",
+ "execution_count": null,
+ "id": "9",
"metadata": {},
"outputs": [],
"source": [
@@ -170,63 +113,17 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "fcc3649c-d08c-46db-a534-f61d97962c99",
+ "execution_count": null,
+ "id": "10",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulliMOCC(\n",
- " actions={\n",
- " 'a1': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=30.0\n",
- " ),\n",
- " 'a2': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=10.0\n",
- " ),\n",
- " 'a3': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=20.0\n",
- " )\n",
- " },\n",
- " strategy=MultiObjectiveCostControlBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
},
{
"cell_type": "markdown",
- "id": "aa91a5ed-83cc-4016-aa3e-17b8a102bb77",
+ "id": "11",
"metadata": {},
"source": [
"## 2. Function `predict()`"
@@ -234,46 +131,18 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "a735c03d-cde4-4147-a50d-4b82dd9c1792",
+ "execution_count": null,
+ "id": "12",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Help on method predict in module pybandits.smab:\n",
- "\n",
- "predict(n_samples: pydantic.types.PositiveInt = 1, forbidden_actions: Optional[Set[pybandits.base.ActionId]] = None) -> Tuple[List[pybandits.base.ActionId], List[Dict[pybandits.base.ActionId, pybandits.base.Probability]]] method of pybandits.smab.SmabBernoulliMOCC instance\n",
- " Predict actions.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " n_samples : int > 0, default=1\n",
- " Number of samples to predict.\n",
- " forbidden_actions : Optional[Set[ActionId]], default=None\n",
- " Set of forbidden actions. If specified, the model will discard the forbidden_actions and it will only\n",
- " consider the remaining allowed_actions. By default, the model considers all actions as allowed_actions.\n",
- " Note that: actions = allowed_actions U forbidden_actions.\n",
- " \n",
- " Returns\n",
- " -------\n",
- " actions: List[ActionId] of shape (n_samples,)\n",
- " The actions selected by the multi-armed bandit model.\n",
- " probs: List[Dict[ActionId, Probability]] of shape (n_samples,)\n",
- " The probabilities of getting a positive reward for each action.\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"help(mab.predict)"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "f3d9cb8b-7d9b-437b-bbc2-e7a55475a1fb",
+ "execution_count": null,
+ "id": "13",
"metadata": {},
"outputs": [],
"source": [
@@ -283,54 +152,28 @@
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "a9284b11-05ba-4cda-9597-b69e6d7632a3",
+ "execution_count": null,
+ "id": "14",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['a3', 'a3', 'a2', 'a3', 'a2']"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"actions"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "84cdbed4-9aa5-42e1-84db-1f8f72c52d93",
+ "execution_count": null,
+ "id": "15",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'a1': [0.75, 0.55], 'a2': [0.78, 0.29], 'a3': [0.79, 0.83]},\n",
- " {'a1': [0.95, 0.28], 'a2': [0.22, 0.23], 'a3': [0.99, 0.95]},\n",
- " {'a1': [0.22, 0.64], 'a2': [0.62, 0.50], 'a3': [0.30, 0.12]},\n",
- " {'a1': [0.19, 0.79], 'a2': [0.02, 0.70], 'a3': [0.27, 0.72]},\n",
- " {'a1': [0.38, 0.03], 'a2': [0.80, 0.55], 'a3': [0.79, 0.04]}]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"probs"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "bfc53fc8-b1bf-42ea-907a-fa5fb7173199",
+ "execution_count": null,
+ "id": "16",
"metadata": {},
"outputs": [],
"source": [
@@ -340,53 +183,27 @@
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "696d58f4-ca5f-41d4-983f-bc7a5351ab28",
+ "execution_count": null,
+ "id": "17",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['a3', 'a2', 'a2', 'a2', 'a2']"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"actions"
]
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "f5826785-a5c6-4c06-9bab-9f05134e783e",
+ "execution_count": null,
+ "id": "18",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'a2': [0.12, 0.45], 'a3': [0.38, 0.90]},\n",
- " {'a2': [0.10, 0.96], 'a3': [0.58, 0.20]},\n",
- " {'a2': [0.92, 0.85], 'a3': [0.31, 0.65]},\n",
- " {'a2': [0.60, 0.04], 'a3': [0.45, 0.97]},\n",
- " {'a2': [0.87, 0.51], 'a3': [0.74, 0.35]}]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"probs"
]
},
{
"cell_type": "markdown",
- "id": "d89f7199-bec3-407d-92a9-bdf917c13de6",
+ "id": "19",
"metadata": {},
"source": [
"## 3. Function `update()`"
@@ -394,42 +211,18 @@
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "140eb2fc-3659-4c13-86d1-ec5a575c79c1",
+ "execution_count": null,
+ "id": "20",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Help on method update in module pybandits.smab:\n",
- "\n",
- "update(actions: List[pybandits.base.ActionId], rewards: List[List[pybandits.base.BinaryReward]]) method of pybandits.smab.SmabBernoulliMOCC instance\n",
- " Update the stochastic Bernoulli bandit given the list of selected actions and their corresponding binary\n",
- " rewards.\n",
- " \n",
- " Parameters\n",
- " ----------\n",
- " actions : List[ActionId] of shape (n_samples,), e.g. ['a1', 'a2', 'a3', 'a4', 'a5']\n",
- " The selected action for each sample.\n",
- " rewards : List[Union[BinaryReward, List[BinaryReward]]] of shape (n_samples, n_objectives)\n",
- " The binary reward for each sample.\n",
- " If strategy is not MultiObjectiveBandit, rewards should be a list, e.g.\n",
- " rewards = [1, 0, 1, 1, 1, ...]\n",
- " If strategy is MultiObjectiveBandit, rewards should be a list of list, e.g. (with n_objectives=2):\n",
- " rewards = [[1, 1], [1, 0], [1, 1], [1, 0], [1, 1], ...]\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"help(mab.update)"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "2526ed6d-82d4-4485-bc6e-b5cb53dd78a5",
+ "execution_count": null,
+ "id": "21",
"metadata": {},
"outputs": [],
"source": [
@@ -439,56 +232,10 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "3bd0ab45-94e8-415b-adea-a089c54f6274",
+ "execution_count": null,
+ "id": "22",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulliMOCC(\n",
- " actions={\n",
- " 'a1': BetaMOCC(\n",
- " counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
- " cost=30.0\n",
- " ),\n",
- " 'a2': BetaMOCC(\n",
- " counters=[Beta(n_successes=7, n_failures=3), Beta(n_successes=7, n_failures=3)],\n",
- " cost=10.0\n",
- " ),\n",
- " 'a3': BetaMOCC(\n",
- " counters=[Beta(n_successes=3, n_failures=1), Beta(n_successes=3, n_failures=1)],\n",
- " cost=20.0\n",
- " )\n",
- " },\n",
- " strategy=MultiObjectiveCostControlBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m7\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m7\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"# update\n",
"mab.update(actions=actions, rewards=rewards)\n",
@@ -497,7 +244,7 @@
},
{
"cell_type": "markdown",
- "id": "9823d84c-862b-4bb6-ab36-024f34460595",
+ "id": "23",
"metadata": {},
"source": [
"## 4. Example of usage\n",
@@ -507,8 +254,8 @@
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "a785463d-d710-4844-80bf-42c09b0e0b45",
+ "execution_count": null,
+ "id": "24",
"metadata": {},
"outputs": [],
"source": [
@@ -531,56 +278,10 @@
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "034add3d-e6f3-471c-b8b9-30c286faf2cc",
+ "execution_count": null,
+ "id": "25",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SmabBernoulliMOCC(\n",
- " actions={\n",
- " 'a1': BetaMOCC(\n",
- " counters=[Beta(n_successes=450, n_failures=488), Beta(n_successes=450, n_failures=488)],\n",
- " cost=30.0\n",
- " ),\n",
- " 'a2': BetaMOCC(\n",
- " counters=[Beta(n_successes=8541, n_failures=8325), Beta(n_successes=8541, n_failures=8325)],\n",
- " cost=10.0\n",
- " ),\n",
- " 'a3': BetaMOCC(\n",
- " counters=[Beta(n_successes=1110, n_failures=1102), Beta(n_successes=1110, n_failures=1102)],\n",
- " cost=20.0\n",
- " )\n",
- " },\n",
- " strategy=MultiObjectiveCostControlBandit()\n",
- ")\n",
- "
\n"
- ],
- "text/plain": [
- "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n",
- " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m450\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m488\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m450\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m488\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m8541\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m8325\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m8541\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m8325\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m,\n",
- " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n",
- " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1110\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1102\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1110\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1102\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n",
- " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n",
- " \u001b[1m)\u001b[0m\n",
- " \u001b[1m}\u001b[0m,\n",
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n",
- "\u001b[1m)\u001b[0m\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"print(mab)"
]
diff --git a/pybandits/offline_policy_estimator.py b/pybandits/offline_policy_estimator.py
new file mode 100644
index 0000000..94a7817
--- /dev/null
+++ b/pybandits/offline_policy_estimator.py
@@ -0,0 +1,807 @@
+"""
+Comprehensive Offline Policy Evaluation (OPE) estimators.
+
+This module provides a complete set of estimators for OPE.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Optional, Tuple, Type
+
+import numpy as np
+from scipy.stats import bootstrap
+
+from pybandits.base import Float01, PyBanditsBaseModel
+from pybandits.pydantic_version_compatibility import (
+ PYDANTIC_VERSION_1,
+ PYDANTIC_VERSION_2,
+ NonNegativeFloat,
+ PositiveFloat,
+ PositiveInt,
+ PrivateAttr,
+ pydantic_version,
+ validate_call,
+)
+
+
+class BaseOfflinePolicyEstimator(PyBanditsBaseModel, ABC):
+ """Base class for all OPE estimators.
+
+ This class defines the interface for all OPE estimators and provides a common method for estimating the policy value.
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ alpha: Float01 = 0.05
+ n_bootstrap_samples: int = 10000
+ random_state: Optional[int] = None
+ _name: str = PrivateAttr()
+
+ @classmethod
+ def _check_array(
+ cls,
+ name: str,
+ data: Dict[str, np.ndarray],
+ ndim: PositiveInt,
+ dtype: type,
+ n_samples: PositiveInt,
+ n_actions: Optional[PositiveInt] = None,
+ ):
+ if name in data:
+ array = data[name]
+ if array.ndim != ndim:
+ raise ValueError(f"{name} must be a {ndim}D array.")
+ if array.shape[0] != n_samples:
+ raise ValueError(f"action and {name} must have the same length.")
+ if array.dtype != dtype:
+ raise ValueError(f"{name} must be a {dtype} array")
+ if ndim > 1:
+ if array.shape[1] != n_actions:
+ raise ValueError(f"{name} must have the same number of actions as the action array.")
+
+ @classmethod
+ def _check_sum(cls, name: str, data: Dict[str, np.ndarray]):
+ if name in data:
+ array = data[name]
+ if not array.sum(axis=-1).all():
+ raise ValueError(f"{name} must have at least one non-zero element on each column.")
+
+ @classmethod
+ def _check_inputs(cls, action: np.ndarray, **kwargs):
+ """
+ Check the inputs for the estimator.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+ """
+ if action.ndim != 1:
+ raise ValueError("action must be a 1D array.")
+ if action.dtype != int:
+ raise ValueError("action must be an integer array.")
+ n_samples = action.shape[0]
+ n_actions = np.unique(action).shape[0]
+
+ for name, dtype in zip(["reward", "propensity_score", "expected_importance_weight"], [int, float, float]):
+ cls._check_array(name, kwargs, 1, dtype, n_samples)
+
+ for name in ["estimated_policy", "expected_reward"]:
+ cls._check_array(name, kwargs, 2, float, n_samples, n_actions)
+
+ for name in ["propensity_score", "estimated_policy", "expected_importance_weight"]:
+ cls._check_sum(name, kwargs)
+
+ @validate_call(config=dict(arbitrary_types_allowed=True))
+ def estimate_policy_value_with_confidence_interval(self, **kwargs) -> Tuple[float, float, float, float]:
+ """
+ Estimate the policy value with a confidence interval.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+
+ Returns
+ -------
+ Tuple[float, float, float, float]
+ Estimated policy value, lower and upper bounds of the confidence interval, and bootstrap standard error.
+ """
+ self._check_inputs(**kwargs)
+ sample_reward = self.estimate_sample_rewards(**kwargs)
+ estimated_policy_value = sample_reward.mean()
+ bootstrap_result = bootstrap(
+ data=(sample_reward,),
+ statistic=np.mean,
+ confidence_level=1 - self.alpha,
+ n_resamples=self.n_bootstrap_samples,
+ random_state=self.random_state,
+ )
+ low, high = bootstrap_result.confidence_interval
+ std = bootstrap_result.standard_error
+ return estimated_policy_value, low, high, std
+
+ @abstractmethod
+ def estimate_sample_rewards(self, **kwargs) -> np.ndarray:
+ """
+ Estimate sample rewards.
+
+ Returns
+ -------
+ np.ndarray
+ Estimated sample rewards.
+ """
+ pass
+
+ @property
+ def name(self) -> str:
+ """
+ Get the name of the estimator.
+
+ Returns
+ -------
+ str
+ Name of the estimator.
+ """
+ return self._name
+
+
+class ReplayMethod(BaseOfflinePolicyEstimator):
+ """
+ Replay Method estimator.
+
+ This estimator is a simple baseline that estimates the policy value by averaging the rewards of the matched samples.
+
+ Reference: Unbiased Offline Evaluation of Contextual-bandit-based News Article Recommendation Algorithms (Li, Chu, Langford, and Wang, 2011)
+ https://arxiv.org/pdf/1003.5956
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+
+ """
+
+ _name = "rep"
+
+ def estimate_sample_rewards(
+ self, action: np.ndarray, reward: np.ndarray, estimated_policy: np.ndarray, **kwargs
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ estimated_policy : np.ndarray
+ Array of action distributions.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated sample rewards.
+ """
+ n_samples = action.shape[0]
+ matched_evaluation_policy = estimated_policy[np.arange(n_samples), action]
+ matched_action = matched_evaluation_policy == 1
+ sample_reward = (
+ reward * matched_action / matched_action.mean() if matched_action.any() else np.zeros_like(matched_action)
+ )
+ return sample_reward
+
+
+class GeneralizedInverseProbabilityWeighting(BaseOfflinePolicyEstimator, ABC):
+ """
+ Abstract generalization of the Inverse Probability Weighting (IPW) estimator.
+
+ Reference: Learning from Logged Implicit Exploration Data (Strehl, Langford, Li, and Kakade, 2010)
+ https://arxiv.org/pdf/1003.0120
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ @abstractmethod
+ def _get_importance_weights(self, **kwargs) -> np.ndarray:
+ """
+ Get the importance weights.
+
+ Returns
+ -------
+ np.ndarray
+ Array of importance weights.
+ """
+ pass
+
+ def estimate_sample_rewards(self, reward: np.ndarray, shrinkage_method: Optional[Callable], **kwargs) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ shrinkage_method : Optional[Callable]
+ Shrinkage method for the importance weights.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated sample rewards.
+ """
+ importance_weight = self._get_importance_weights(**kwargs)
+ importance_weight = shrinkage_method(importance_weight) if shrinkage_method is not None else importance_weight
+ sample_reward = reward * importance_weight
+ return sample_reward
+
+
+class InverseProbabilityWeighting(GeneralizedInverseProbabilityWeighting):
+ """
+ Inverse Probability Weighting (IPW) estimator.
+
+ Reference: Learning from Logged Implicit Exploration Data (Strehl, Langford, Li, and Kakade, 2010)
+ https://arxiv.org/pdf/1003.0120
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "ipw"
+
+ def estimate_sample_rewards(
+ self,
+ action: np.ndarray,
+ reward: np.ndarray,
+ propensity_score: np.ndarray,
+ estimated_policy: np.ndarray,
+ shrinkage_method: Optional[Callable] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ propensity_score : np.ndarray
+ Array of propensity scores.
+ estimated_policy : np.ndarray
+ Array of action distributions.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated sample rewards.
+ """
+ return super().estimate_sample_rewards(
+ reward=reward,
+ action=action,
+ propensity_score=propensity_score,
+ estimated_policy=estimated_policy,
+ shrinkage_method=shrinkage_method,
+ )
+
+ def _get_importance_weights(
+ self,
+ action: np.ndarray,
+ propensity_score: np.ndarray,
+ estimated_policy: np.ndarray,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Get the importance weights.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken
+ propensity_score : np.ndarray
+ Array of propensity scores.
+ estimated_policy : np.ndarray
+ Array of action distributions.
+
+ Returns
+ -------
+ importance_weight : np.ndarray
+ Array of importance weights.
+ """
+ n_samples = action.shape[0]
+ importance_weight = estimated_policy[np.arange(n_samples), action] / propensity_score
+ return importance_weight
+
+
+class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting):
+ """
+ Self-Normalized Inverse Propensity Score (SNIPS) estimator.
+
+ Reference: The Self-normalized Estimator for Counterfactual Learning (Swaminathan and Joachims, 2015)
+ https://papers.nips.cc/paper_files/paper/2015/file/39027dfad5138c9ca0c474d71db915c3-Paper.pdf
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "snips"
+
+ def estimate_sample_rewards(
+ self,
+ action: np.ndarray,
+ reward: np.ndarray,
+ propensity_score: np.ndarray,
+ estimated_policy: np.ndarray,
+ shrinkage_method: Optional[Callable] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ propensity_score : np.ndarray
+ Array of propensity scores.
+ estimated_policy : np.ndarray
+ Array of action distributions.
+ shrinkage_method : Optional[Callable]
+ Shrinkage method for the importance weights.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated sample rewards.
+ """
+
+ def self_normalized_shrink_weights(importance_weight: np.ndarray) -> np.ndarray:
+ importance_weight = (
+ shrinkage_method(importance_weight) if shrinkage_method is not None else importance_weight
+ )
+ return importance_weight / importance_weight.mean()
+
+ sample_reward = super().estimate_sample_rewards(
+ action=action,
+ reward=reward,
+ propensity_score=propensity_score,
+ estimated_policy=estimated_policy,
+ shrinkage_method=self_normalized_shrink_weights,
+ )
+ return sample_reward
+
+
+class DirectMethod(BaseOfflinePolicyEstimator):
+ """
+ Direct Method (DM) estimator.
+
+ This estimator weights the expected rewards by the evaluation policy to estimate the sample rewards.
+
+ Reference: The Offset Tree for Learning with Partial Labels (Beygelzimer and Langford, 2009)
+ https://arxiv.org/pdf/0812.4044
+
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "dm"
+
+ def estimate_sample_rewards(
+ self,
+ estimated_policy: np.ndarray,
+ expected_reward: np.ndarray,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ estimated_policy : np.ndarray
+ Array of action distributions.
+ expected_reward : np.ndarray
+ Array of expected rewards.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated sample rewards.
+ """
+ n_samples = expected_reward.shape[0]
+ base_expected_reward = expected_reward[np.arange(n_samples), :]
+ evaluation_policy = estimated_policy[np.arange(n_samples), :]
+ expected_reward = np.average(
+ base_expected_reward,
+ weights=evaluation_policy,
+ axis=1,
+ )
+ return expected_reward
+
+
+class GeneralizedDoublyRobust(BaseOfflinePolicyEstimator, ABC):
+ """
+ Abstract generalization of the Doubly Robust (DR) estimator.
+
+ Reference: Doubly Robust Policy Evaluation and Optimization (Dudík, Erhan, Langford, and Li, 2014)
+ https://arxiv.org/pdf/1503.02834
+
+ More Robust Doubly Robust Off-policy Evaluation (Farajtabar, Chow, and Ghavamzadeh, 2018)
+ https://arxiv.org/pdf/1802.03493
+
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _alternative_method_cls: Type[InverseProbabilityWeighting]
+ _dm: DirectMethod = PrivateAttr()
+ _other_method: BaseOfflinePolicyEstimator = PrivateAttr()
+
+ if pydantic_version == PYDANTIC_VERSION_1:
+
+ def __init__(self, **data):
+ super().__init__(**data)
+ self._dm = DirectMethod(
+ alpha=self.alpha, n_bootstrap_samples=self.n_bootstrap_samples, random_state=self.random_state
+ )
+ self._other_method = self._alternative_method_cls(
+ alpha=self.alpha, n_bootstrap_samples=self.n_bootstrap_samples, random_state=self.random_state
+ )
+
+ elif pydantic_version == PYDANTIC_VERSION_2:
+
+ def model_post_init(self, __context: Any) -> None:
+ self._dm = DirectMethod(
+ alpha=self.alpha, n_bootstrap_samples=self.n_bootstrap_samples, random_state=self.random_state
+ )
+ self._other_method = self._alternative_method_cls(
+ alpha=self.alpha, n_bootstrap_samples=self.n_bootstrap_samples, random_state=self.random_state
+ )
+
+ else:
+ raise ValueError(f"Unsupported pydantic version: {pydantic_version}")
+
+ def estimate_sample_rewards(
+ self,
+ action: np.ndarray,
+ reward: np.ndarray,
+ propensity_score: np.ndarray,
+ estimated_policy: np.ndarray,
+ expected_reward: np.ndarray,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ action : np.ndarray
+ Array of actions taken.
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ propensity_score : np.ndarray
+ Array of propensity scores.
+ estimated_policy : np.ndarray
+ Array of action distributions.
+ expected_reward : np.ndarray
+ Array of expected rewards.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated rewards.
+ """
+ dm_sample_reward = self._dm.estimate_sample_rewards(
+ action=action, estimated_policy=estimated_policy, expected_reward=expected_reward
+ )
+ other_sample_reward = self._other_method.estimate_sample_rewards(
+ action=action,
+ reward=reward - dm_sample_reward,
+ propensity_score=propensity_score,
+ estimated_policy=estimated_policy,
+ shrinkage_method=self._shrink_weights,
+ )
+ sample_reward = dm_sample_reward + other_sample_reward
+ return sample_reward
+
+ def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
+ return importance_weight
+
+
+class DoublyRobust(GeneralizedDoublyRobust):
+ """
+ Doubly Robust (DR) estimator.
+
+ Reference: Doubly Robust Policy Evaluation and Optimization (Dudík, Erhan, Langford, and Li, 2014)
+ https://arxiv.org/pdf/1503.02834
+
+ More Robust Doubly Robust Off-policy Evaluation (Farajtabar, Chow, and Ghavamzadeh, 2018)
+ https://arxiv.org/pdf/1802.03493
+
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "dr"
+ _alternative_method_cls = InverseProbabilityWeighting
+
+
+class SelfNormalizedDoublyRobust(GeneralizedDoublyRobust):
+ """
+ Self-Normalized Doubly Robust (SNDR) estimator.
+
+ This estimator uses the self-normalized importance weights to combine the DR and IPS estimators.
+
+ Reference: Intrinsically Efficient, Stable, and Bounded Off-Policy Evaluation for Reinforcement Learning (Kallus and Uehara, 2019)
+ https://arxiv.org/pdf/1906.03735
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "sndr"
+ _alternative_method_cls = SelfNormalizedInverseProbabilityWeighting
+
+
+class SwitchDoublyRobust(DoublyRobust):
+ """
+ Switch Doubly Robust (Switch-DR) estimator.
+
+ This estimator uses a switching rule based on the importance weight to combine the DR and DM estimators.
+
+ Reference: Optimal and Adaptive Off-policy Evaluation in Contextual Bandits (Wang, Agarwal, and Dudik, 2017)
+ https://arxiv.org/pdf/1507.02646
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : Optional[int], default=None
+ Random seed for bootstrap sampling.
+ switch_threshold : float, default=inf
+ Threshold for the importance weight to switch between the DR and DM estimators.
+ """
+
+ _name = "switch-dr"
+ switch_threshold: float = float("inf")
+
+ def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
+     switch_indicator = importance_weight <= self.switch_threshold  # Switch-DR keeps the correction only for weights within the threshold
+     return switch_indicator * importance_weight  # weights above the threshold are zeroed, so those samples rely on the DM term only
+
+
+class DoublyRobustWithOptimisticShrinkage(DoublyRobust):
+ """
+ Optimistic version of DRos estimator.
+
+ This estimator uses a shrinkage factor to shrink the importance weight in the native DR.
+
+ Reference: Doubly Robust Off-Policy Evaluation with Shrinkage (Su, Dimakopoulou, Krishnamurthy, and Dudik, 2020)
+ https://arxiv.org/pdf/1907.09623
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ shrinkage_factor : float, default=0.0
+ Shrinkage factor for the importance weights.
+ If set to 0 or infinity, the estimator is equivalent to the native DM or DR estimators, respectively.
+ """
+
+ shrinkage_factor: NonNegativeFloat = 0.0
+ _name = "dros-opt"
+
+ def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
+ if self.shrinkage_factor == 0:
+ return np.zeros_like(importance_weight)
+ elif self.shrinkage_factor == float("inf"):
+ return importance_weight
+ return self.shrinkage_factor * importance_weight / (importance_weight**2 + self.shrinkage_factor)
+
+
+class DoublyRobustWithPessimisticShrinkage(DoublyRobust):
+ """
+ Pessimistic version of DRos estimator.
+
+ This estimator uses a shrinkage factor to shrink the importance weight in the native DR.
+
+ Reference: Doubly Robust Off-Policy Evaluation with Shrinkage (Su, Dimakopoulou, Krishnamurthy, and Dudik, 2020)
+ https://arxiv.org/pdf/1907.09623
+
+ Parameters
+ ----------
+ alpha : Float01, default=0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, default=10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, default=None
+ Random seed for bootstrap sampling.
+ shrinkage_factor : PositiveFloat, default=inf
+ Upper bound used to clip the importance weights.
+ """
+
+ _name = "dros-pess"
+ shrinkage_factor: PositiveFloat = float("inf")
+
+ def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
+ importance_weight = np.minimum(self.shrinkage_factor, importance_weight)
+ return importance_weight
+
+
+class SubGaussianInverseProbabilityWeighting(InverseProbabilityWeighting):
+ """
+ SubGaussian Inverse Probability Weighting estimator.
+
+ Reference: Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning (Metelli, Russo, and Restelli, 2021)
+ https://proceedings.neurips.cc/paper_files/paper/2021/file/4476b929e30dd0c4e8bdbcc82c6ba23a-Paper.pdf
+
+ Parameters
+ ----------
+ alpha : Float01, defaults to 0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, defaults to 10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, defaults to None
+ Random seed for bootstrap sampling.
+ shrinkage_factor : Float01, defaults to 0.0
+ Shrinkage factor for the importance weights.
+
+ """
+
+ _name = "sg-ipw"
+ shrinkage_factor: Float01 = 0.0
+
+ def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
+ return importance_weight / (1 - self.shrinkage_factor + self.shrinkage_factor * importance_weight)
+
+
+class SubGaussianDoublyRobust(GeneralizedDoublyRobust):
+ """
+ SubGaussian Doubly Robust estimator.
+
+ Reference: Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning (Metelli, Russo, and Restelli, 2021)
+ https://proceedings.neurips.cc/paper_files/paper/2021/file/4476b929e30dd0c4e8bdbcc82c6ba23a-Paper.pdf
+
+ Parameters
+ ----------
+ alpha : Float01, defaults to 0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, defaults to 10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, defaults to None
+ Random seed for bootstrap sampling.
+ """
+
+ _name = "sg-dr"
+ _alternative_method_cls = SubGaussianInverseProbabilityWeighting
+
+
+class BalancedInverseProbabilityWeighting(GeneralizedInverseProbabilityWeighting):
+ """
+ Balanced Inverse Probability Weighting estimator.
+
+ Reference: Balanced Off-Policy Evaluation in General Action Spaces (Sondhi, Arbour, and Dimmery, 2020)
+ https://arxiv.org/pdf/1906.03694
+
+
+ Parameters
+ ----------
+ alpha : Float01, defaults to 0.05
+ Significance level for confidence interval estimation.
+ n_bootstrap_samples : int, defaults to 10000
+ Number of bootstrap samples for confidence interval estimation.
+ random_state : int, defaults to None
+ Random seed for bootstrap sampling.
+
+ ----------
+ Arjun Sondhi, David Arbour, and Drew Dimmery
+ "Balanced Off-Policy Evaluation in General Action Spaces.", 2020.
+ """
+
+ _name = "b-ipw"
+
+ def _get_importance_weights(self, expected_importance_weight: np.ndarray, **kwargs) -> np.ndarray:
+ """
+ Get the importance weights.
+
+ Parameters
+ ----------
+ expected_importance_weight : np.ndarray
+ Array of expected importance weights.
+
+ Returns
+ -------
+ expected_importance_weight : np.ndarray
+ Array of expected importance weights.
+ """
+ return expected_importance_weight
+
+ def estimate_sample_rewards(
+ self,
+ reward: np.ndarray,
+ expected_importance_weight: np.ndarray,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Estimate the sample rewards.
+
+ Parameters
+ ----------
+ reward : np.ndarray
+ Array of rewards corresponding to each action.
+ expected_importance_weight : np.ndarray
+ Array of expected importance weights.
+
+ Returns
+ -------
+ sample_reward : np.ndarray
+ Estimated rewards.
+ """
+ return super().estimate_sample_rewards(
+ reward=reward, expected_importance_weight=expected_importance_weight, shrinkage_method=None
+ )
diff --git a/pybandits/offline_policy_evaluator.py b/pybandits/offline_policy_evaluator.py
new file mode 100644
index 0000000..97d7f4b
--- /dev/null
+++ b/pybandits/offline_policy_evaluator.py
@@ -0,0 +1,1127 @@
+import os
+from copy import deepcopy
+from functools import partial
+from itertools import product
+from math import floor
+from multiprocessing import Pool, cpu_count
+from sys import version_info
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import numpy as np
+import optuna
+import pandas as pd
+from bokeh.models import ColumnDataSource, TabPanel
+from bokeh.plotting import figure
+from loguru import logger
+from optuna import Trial
+from sklearn.base import ClassifierMixin, TransformerMixin
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from tqdm import tqdm
+
+from pybandits.pydantic_version_compatibility import (
+ PYDANTIC_VERSION_1,
+ PYDANTIC_VERSION_2,
+ NonNegativeInt,
+ PositiveInt,
+ PrivateAttr,
+ field_validator,
+ model_validator,
+ pydantic_version,
+ validate_call,
+)
+
+if version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
+
+from pybandits import offline_policy_estimator
+from pybandits.base import ActionId, Float01, PyBanditsBaseModel
+from pybandits.mab import BaseMab
+from pybandits.offline_policy_estimator import BaseOfflinePolicyEstimator
+from pybandits.utils import (
+ extract_argument_names_from_function,
+ get_non_abstract_classes,
+ in_jupyter_notebook,
+ visualize_via_bokeh,
+)
+
+optuna.logging.enable_propagation() # Runs at import time: propagate Optuna logs to the root logger.
+optuna.logging.disable_default_handler() # Runs at import time: stop Optuna from also writing logs to sys.stderr.
+
+
+class _FunctionEstimator(PyBanditsBaseModel, ClassifierMixin, arbitrary_types_allowed=True):
+ """
+ This class provides functionality for model optimization using hyperparameter tuning via Optuna,
+ and prediction with optimized or default machine learning models.
+ It is used to estimate the propensity score and expected reward.
+
+
+ Parameters
+ ----------
+ estimator_type : Literal["logreg", "gbm", "rf", "mlp"]
+ The model type to optimize.
+
+ fast_fit : bool
+ Whether to skip the Optuna search and fit the model with its default parameters.
+
+ action_one_hot_encoder : OneHotEncoder
+ Fitted one hot encoder for action encoding.
+
+ n_trials : int
+ Number of trials for the Optuna optimization process.
+
+ verbose : bool
+ Whether to log detailed information during the optimization process.
+
+ study_name : Optional[str]
+ Name of the study to be created by Optuna.
+
+ multi_action_prediction : bool
+ Whether to predict for every action in X["unique_actions"] or only for the action of each round.
+
+ """
+
+ estimator_type: Literal["logreg", "gbm", "rf", "mlp"]
+ fast_fit: bool
+ action_one_hot_encoder: OneHotEncoder = OneHotEncoder(sparse=False)  # NOTE(review): newer sklearn uses sparse_output
+ n_trials: int
+ verbose: bool
+ study_name: Optional[str] = None
+ multi_action_prediction: bool
+ _model: Union[LogisticRegression, GradientBoostingClassifier, RandomForestClassifier, MLPClassifier] = PrivateAttr()
+ _model_mapping = {
+ "mlp": MLPClassifier,
+ "rf": RandomForestClassifier,
+ "logreg": LogisticRegression,
+ "gbm": GradientBoostingClassifier,
+ }
+
+ def _pre_process(self, batch: Dict[str, Any]) -> np.ndarray:
+ """
+ Preprocess the feature vectors to be used for regression model training.
+ This method concatenates the context vector and the one-hot encoded action ids.
+
+ Parameters
+ ----------
+ batch : Dict[str, Any]
+ The batch of data; only its "context" and "action_ids" entries are read.
+
+ Returns
+ -------
+ np.ndarray
+ A concatenated array of context and action context, shape (n_rounds, n_features_context + dim_action_context).
+ """
+ context = batch["context"]
+ action = batch["action_ids"]
+ return np.concatenate([context, self.action_one_hot_encoder.transform(action.reshape((-1, 1)))], axis=1)
+
+ def _sample_parameter_space(self, trial: Trial) -> Dict[str, Union[str, int, float]]:
+ """
+ Define the hyperparameter search space for a given model type in Optuna.
+
+ The search space is dynamically selected based on the model type being optimized.
+
+ Parameters
+ ----------
+ trial : optuna.Trial
+ A single trial in the Optuna optimization process.
+
+ Returns
+ -------
+ dict
+ A dictionary representing the search space for the model's hyperparameters.
+ """
+ # Optuna keys draws by name: a name reused within a trial returns the first draw, so "alpha" gets its own.
+ if self.estimator_type == "mlp":
+ return {
+ "hidden_layer_sizes": 2 ** trial.suggest_int("hidden_layer_sizes", 2, 6),
+ "activation": trial.suggest_categorical("activation", ["relu", "logistic", "tanh"]),
+ "solver": trial.suggest_categorical("solver", ["lbfgs", "sgd", "adam"]),
+ "alpha": np.sqrt(10) ** -trial.suggest_int("alpha", 0, 10),
+ "max_iter": 1000,
+ "learning_rate_init": np.sqrt(10) ** -trial.suggest_int("learning_rate_init", 0, 6),
+ }
+ elif self.estimator_type == "rf":
+ return {
+ "max_depth": trial.suggest_int("max_depth", 2, 5),
+ "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
+ "max_features": trial.suggest_int("max_features", 1, 3),
+ "n_estimators": trial.suggest_int("n_estimators", 10, 50),
+ "n_jobs": -1,
+ }
+ elif self.estimator_type == "logreg":
+ return {
+ "tol": trial.suggest_float("tol", 0.00001, 0.0001),
+ "C": trial.suggest_float("C", 0.05, 3),
+ "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]),
+ "max_iter": 1000,
+ "n_jobs": -1,
+ }
+ elif self.estimator_type == "gbm":
+ return {
+ "n_estimators": trial.suggest_int("n_estimators", 10, 100),
+ "learning_rate": np.sqrt(10) ** -trial.suggest_int("learning_rate_init", 0, 6),
+ "max_depth": trial.suggest_int("max_depth", 2, 10),
+ }
+
+ def _objective(self, trial: Trial, feature_set: np.ndarray, label: np.ndarray) -> float:
+ """
+ Objective function for Optuna optimization.
+
+ This function scores a candidate model with cross-validation and returns the mean score,
+ which the study created in `fit` maximizes.
+
+ Parameters
+ ----------
+ trial : Trial
+ A single trial in the Optuna optimization process.
+
+ feature_set : np.ndarray
+ The training dataset, containing context and encoded actions.
+
+ label : np.ndarray
+ The labels for the dataset.
+
+ Returns
+ -------
+ score : float
+ The score to be maximized by Optuna.
+ """
+ params = self._sample_parameter_space(trial)
+ model = self._model_mapping[self.estimator_type](**params)
+ score = cross_val_score(model, feature_set, label).mean()
+ trial.set_user_attr("model_params", params)
+
+ return score
+
+ def _optimize(self, feature_set: np.ndarray, label: np.ndarray, study: optuna.Study) -> dict:
+ """
+ Optimize the model's hyperparameters using Optuna.
+
+ Parameters
+ ----------
+ feature_set : np.ndarray
+ The training feature matrix (context concatenated with encoded actions).
+
+ label : np.ndarray
+ The labels for the rows of feature_set.
+
+ study : optuna.Study
+ The Optuna study object to store optimization results.
+
+ Returns
+ -------
+ best_params : dict
+ The best set of hyperparameters found by Optuna.
+ """
+ study.optimize(lambda trial: self._objective(trial, feature_set, label), n_trials=self.n_trials)
+ best_params = study.best_trial.user_attrs["model_params"]
+ if self.verbose:
+ logger.info(f"Optuna best model with optimized parameters for {self.estimator_type}:\n {best_params}")
+ return best_params
+
+ @validate_call(config=dict(arbitrary_types_allowed=True))
+ def fit(self, X: dict, y: np.ndarray) -> Self:
+ """
+ Fit the model using the given dataset X and labels y.
+
+ Parameters
+ ----------
+ X : dict
+ The dataset containing 'context' and 'action_ids' keys.
+ y : np.ndarray
+ The labels for the dataset.
+
+ Returns
+ -------
+ self : _FunctionEstimator
+ The fitted model.
+ """
+ feature_set = self._pre_process(X)
+ if self.fast_fit:
+ model_parameters = {}
+ else:
+ pruner = optuna.pruners.MedianPruner()
+ sampler = optuna.samplers.TPESampler(multivariate=True, group=True)
+ study = optuna.create_study(
+ direction="maximize", study_name=self.study_name, pruner=pruner, sampler=sampler
+ )
+ model_parameters = self._optimize(feature_set, y, study)
+
+ model = self._model_mapping[self.estimator_type](**model_parameters)
+ model.fit(feature_set, y)
+ self._model = model
+ return self
+
+ @validate_call
+ def predict(self, X: dict) -> np.ndarray:
+ """
+ Predict the class-1 probability (column 1 of predict_proba) for the given dataset X.
+
+ Parameters
+ ----------
+ X : dict
+ The dataset containing 'context' and 'action_ids' keys ('n_rounds' and 'unique_actions' in multi-action mode).
+
+ Returns
+ -------
+ prediction : np.ndarray
+ Class-1 probabilities: one per round, or one per round and unique action in multi-action mode.
+ """
+ if not self._model:
+ raise AttributeError("Model has not been fitted yet.")
+
+ if self.multi_action_prediction:
+ specific_action_X = X.copy()
+ prediction = np.empty((X["n_rounds"], len(X["unique_actions"])))
+ for action_index, action in enumerate(X["unique_actions"]):
+ specific_action_X["action_ids"] = np.array([action] * X["n_rounds"])
+ specific_action_feature_set = self._pre_process(specific_action_X)
+ specific_action_prediction = self._model.predict_proba(specific_action_feature_set)[:, 1]
+ prediction[:, action_index] = specific_action_prediction
+ else:
+ feature_set = self._pre_process(X)
+ prediction = self._model.predict_proba(feature_set)[:, 1]
+ return prediction
+
+
+class OfflinePolicyEvaluator(PyBanditsBaseModel, arbitrary_types_allowed=True):
+ """
+ Class to conduct OPE with multiple OPE estimators
+
+ Reference: Open Bandit Dataset and Pipeline: Towards Realistic and Reproducible Off-Policy Evaluation
+ https://arxiv.org/abs/2008.07146 https://github.com/st-tech/zr-obp
+
+ Parameters
+ ----------
+ logged_data : pd.DataFrame
+ Logging data set
+ split_prop: Float01
+ Proportion of the sorted unique batches used as training set; must be strictly between 0 and 1
+ propensity_score_model_type: Literal["logreg", "gbm", "rf", "mlp", "batch_empirical", "empirical", "propensity_score"]
+ Method used to compute/estimate propensity score pi_b (propensity_score, logging / behavioral policy).
+ expected_reward_model_type: Literal["logreg", "gbm", "rf", "mlp"]
+ Method used to estimate expected reward for each action a in the training set.
+ n_trials : Optional[int]
+ Number of trials for the Optuna optimization process.
+ fast_fit : bool
+ Whether to use the default parameter set for the function estimator models.
+ ope_estimators: Optional[List[BaseOfflinePolicyEstimator]]
+ List of OPE estimators used to evaluate the policy value of evaluation policy.
+ All available estimators are used if not specified.
+ batch_feature: str
+ Column name for batch as available in logged_data
+ action_feature: str
+ Column name for action as available in logged_data
+ reward_feature: Union[str, List[str]]
+ Column name for reward as available in logged_data
+ contextual_features: Optional[List[str]]
+ Column names for contextual features as available in logged_data
+ cost_feature: Optional[str]
+ Column name for cost as available in logged_data; used for bandit with cost control
+ group_feature: Optional[str]
+ Column name for group definition feature as available in logged_data; available from simulated data
+ to define samples with similar contextual profile
+ true_reward_feature: Optional[Union[str, List[str]]]
+ Column names for reward proba distribution features as available in simulated logged_data. Used to compute ground truth
+ propensity_score_feature : Optional[str]
+ Column name for propensity score as available in logged_data; used for evaluation of the policy value
+ verbose : bool
+ Whether to log detailed information during the optimization process.
+ """
+
+ logged_data: pd.DataFrame
+ split_prop: Float01
+ propensity_score_model_type: Literal[
+ "logreg", "gbm", "rf", "mlp", "batch_empirical", "empirical", "propensity_score"
+ ]
+ expected_reward_model_type: Literal["logreg", "gbm", "rf", "mlp"]
+ importance_weights_model_type: Literal["logreg", "gbm", "rf", "mlp"]  # model used by _estimate_importance_weight
+ scaler: Optional[Union[TransformerMixin, Dict[str, TransformerMixin]]] = None  # one transformer, or one per feature
+ n_trials: Optional[int] = 100
+ fast_fit: bool = False
+ ope_estimators: Optional[List[BaseOfflinePolicyEstimator]]
+ batch_feature: str
+ action_feature: str
+ reward_feature: Union[str, List[str]]
+ contextual_features: Optional[List[str]] = None
+ cost_feature: Optional[str] = None
+ group_feature: Optional[str] = None
+ true_reward_feature: Optional[Union[str, List[str]]] = None
+ propensity_score_feature: Optional[str] = None
+ verbose: bool = False
+ _train_data: Optional[Dict[str, Any]] = PrivateAttr()
+ _test_data: Optional[Dict[str, Any]] = PrivateAttr()
+ _estimated_expected_reward: Optional[Dict[str, np.ndarray]] = PrivateAttr(default=None)
+ _action_one_hot_encoder = OneHotEncoder(sparse=False)  # fitted in _extract_batches on all actions of logged_data
+ _propensity_score_epsilon = 1e-08  # lower clipping bound for model-estimated propensity scores
+
+ @field_validator("split_prop", mode="before")
+ @classmethod
+ def check_split_prop(cls, value):  # runs on the raw input and rejects the endpoints 0 and 1
+ if value == 0 or value == 1:
+ raise ValueError("split_prop should be strictly between 0 and 1")
+ return value
+
+ @field_validator("ope_estimators", mode="before")
+ @classmethod
+ def populate_ope_metrics(cls, value):  # None -> one default-constructed instance of every non-abstract estimator
+ return (
+ value if value is not None else [class_() for class_ in get_non_abstract_classes(offline_policy_estimator)]
+ )
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_batch_feature(cls, values):  # the batch column must exist and be datetime64[ns] or integer typed
+ if values["batch_feature"] not in values["logged_data"]:
+ raise AttributeError("Batch feature missing from logged data.")
+ if not (
+ pd.api.types.is_datetime64_ns_dtype(values["logged_data"][values["batch_feature"]])
+ or pd.api.types.is_integer_dtype(values["logged_data"][values["batch_feature"]])
+ ):
+ raise TypeError(f"Column {values['batch_feature']} should be either date or int type")
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_action_feature(cls, values):  # the action column must exist in logged_data
+ if values["action_feature"] not in values["logged_data"]:
+ raise AttributeError("Action feature missing from logged data.")
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_propensity_score_estimation_method(cls, values):  # "propensity_score" mode needs the score column name
+ if values["propensity_score_model_type"] == "propensity_score":
+ if cls._get_value_with_default("propensity_score_feature", values) is None:
+ raise ValueError(
+ "Propensity score feature should be defined when using it as propensity_score_model_type"
+ )
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_reward_features(cls, values):
+ """Normalize reward_feature / true_reward_feature to lists and check the columns exist in logged_data."""
+ reward_feature = values["reward_feature"]
+ reward_feature = reward_feature if isinstance(reward_feature, list) else [reward_feature]
+ if not all([reward in values["logged_data"] for reward in reward_feature]):
+ raise AttributeError("Reward feature missing from logged data.")
+ values["reward_feature"] = reward_feature
+ # true_reward_feature is optional: both a missing key and an explicit None mean that no
+ # ground truth is available. Only a real value is normalized and validated; an explicit
+ # None used to be iterated over below and raised TypeError.
+ true_reward_feature = values.get("true_reward_feature")
+ if true_reward_feature is not None:
+ true_reward_feature = (
+ true_reward_feature if isinstance(true_reward_feature, list) else [true_reward_feature]
+ )
+ if not all([true_reward in values["logged_data"] for true_reward in true_reward_feature]):
+ raise AttributeError("True reward feature missing from logged data.")
+ if len(reward_feature) != len(true_reward_feature):
+ raise ValueError("Reward and true reward features should have the same length")
+ values["true_reward_feature"] = true_reward_feature
+
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_optional_scalar_features(cls, values):  # each optional column name, when set, must exist in logged_data
+ for feature in [
+ "cost_feature",
+ "group_feature",
+ "propensity_score_feature",
+ ]:
+ value = cls._get_value_with_default(feature, values)
+ if value is not None and value not in values["logged_data"]:
+ raise AttributeError(f"{feature} missing from logged data.")
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_contextual_features(cls, values):  # every contextual feature, when given, must be a logged_data column
+ value = cls._get_value_with_default("contextual_features", values)
+ if value is not None and not set(value).issubset(values["logged_data"].columns):
+ raise AttributeError("contextual_features missing from logged data.")
+ return values
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_model_optimization(cls, values):
+ n_trials_value = cls._get_value_with_default("n_trials", values)
+ fast_fit_value = cls._get_value_with_default("fast_fit", values)
+ # Only the fitted model types below consume n_trials / fast_fit; the empirical and given scores do not.
+ if (n_trials_value is None or fast_fit_value is None) and values["propensity_score_model_type"] in [
+ "logreg",
+ "gbm",
+ "rf",
+ "mlp",
+ ]:
+ raise ValueError("The requested propensity score model requires n_trials and fast_fit to be well defined")
+ if (n_trials_value is None or fast_fit_value is None) and cls._check_argument_required_by_estimators(
+ "expected_reward", values["ope_estimators"]
+ ):
+ raise ValueError(
+ "The requested offline policy evaluation metrics model require estimation of the expected reward. "
+ "Thus, n_trials and fast_fit need to be well defined."
+ )
+ return values
+
+ @classmethod
+ def _check_argument_required_by_estimators(cls, argument: str, ope_estimators: List[BaseOfflinePolicyEstimator]):
+ """
+ Check if any estimator names `argument` in its estimate_sample_rewards or confidence-interval method.
+
+ Parameters
+ ----------
+ argument : str
+ Argument to check if required by OPE estimators.
+ ope_estimators : List[BaseOfflinePolicyEstimator]
+ List of OPE estimators.
+
+ Returns
+ -------
+ bool
+ True if argument is required by at least one OPE estimator, False otherwise.
+ """
+ return any(
+ [
+ argument
+ in extract_argument_names_from_function(ope_estimator.estimate_sample_rewards)
+ + extract_argument_names_from_function(ope_estimator.estimate_policy_value_with_confidence_interval)
+ for ope_estimator in ope_estimators
+ ]
+ )
+
+ if pydantic_version == PYDANTIC_VERSION_1:  # pydantic v1 path: the setup steps run from __init__
+
+ def __init__(self, **data):
+ super().__init__(**data)
+
+ # Extract batches for train and test set
+ self._extract_batches()
+
+ # Estimate propensity score in the train and test set
+ self._estimate_propensity_score()
+
+ # Estimate expected reward estimator and predict in the test set, when required by OPE estimators
+ if self._check_argument_required_by_estimators("expected_reward", self.ope_estimators):
+ self._estimate_expected_reward()
+
+ elif pydantic_version == PYDANTIC_VERSION_2:  # pydantic v2 path: the same setup steps run from model_post_init
+
+ def model_post_init(self, __context: Any) -> None:
+ # Extract batches for train and test set
+ self._extract_batches()
+
+ # Estimate propensity score in the train and test set
+ self._estimate_propensity_score()
+
+ # Estimate expected reward estimator and predict in the test set, when required by OPE estimators
+ if self._check_argument_required_by_estimators("expected_reward", self.ope_estimators):
+ self._estimate_expected_reward()
+
+ else:
+ raise ValueError(f"Unsupported pydantic version: {pydantic_version}")
+
+ def _extract_batches(self):
+ """
+ Create training and test sets in dictionary form and store them in _train_data / _test_data.
+ Rows with batch <= split batch go to the training set; scalers are fitted on the training set only.
+ """
+ logged_data = self.logged_data.sort_values(by=self.batch_feature)
+ unique_batch = logged_data[self.batch_feature].unique()
+ split_batch = unique_batch[int(floor(len(unique_batch) * self.split_prop))]
+
+ # add list of actions in dict in order to avoid test set with n_actions
+ # lower than nb of total actions
+ unique_actions = sorted(self.logged_data[self.action_feature].unique().tolist())
+ action_label_encoder = LabelEncoder()
+ for batch_idx in tqdm(range(2)):
+ # extract samples batch
+ if batch_idx == 0:
+ extracted_batch = self.logged_data[self.logged_data[self.batch_feature] <= split_batch]
+ else:
+ extracted_batch = self.logged_data[self.logged_data[self.batch_feature] > split_batch]
+ extracted_batch = extracted_batch.reset_index(drop=True)
+
+ # dict data set information for OPE
+ action_ids = extracted_batch[self.action_feature].values
+ if batch_idx == 0:
+ self._action_one_hot_encoder.fit(np.array(unique_actions).reshape((-1, 1)))
+ reward = extracted_batch[self.reward_feature].values
+
+ # if cost control bandit
+ if self.cost_feature is not None:
+ cost = extracted_batch[self.cost_feature].values
+ else:
+ cost = None
+
+ # if contextual information required
+ if self.contextual_features is not None:
+ if self.scaler is not None:
+ if isinstance(self.scaler, dict):  # one transformer per contextual feature
+ if batch_idx == 0:
+ x_scale = np.array(
+ np.concatenate(  # pd.concat only accepts pandas objects, not the scalers' arrays
+ [
+ self.scaler[feature].fit_transform(np.array(extracted_batch[[feature]]))
+ for feature in self.contextual_features
+ ],
+ axis=1,
+ )
+ )
+ else:
+ x_scale = np.array(
+ np.concatenate(  # test set: reuse the transformers fitted on the training set
+ [
+ self.scaler[feature].transform(np.array(extracted_batch[[feature]]))
+ for feature in self.contextual_features
+ ],
+ axis=1,
+ )
+ )
+ else:
+ if batch_idx == 0:
+ x_scale = self.scaler.fit_transform(np.array(extracted_batch[self.contextual_features]))
+ else:
+ x_scale = self.scaler.transform(np.array(extracted_batch[self.contextual_features]))
+ else:
+ x_scale = np.array(extracted_batch[self.contextual_features])
+ else:
+ x_scale = np.zeros((len(action_ids), 0)) # zero-columns 2d array to allow concatenation later
+
+ # extract data for policy information
+ policy_information_cols = [
+ self.batch_feature,
+ self.action_feature,
+ ] + self.reward_feature
+ if self.group_feature:
+ policy_information_cols.append(self.group_feature)
+
+ policy_information = extracted_batch[policy_information_cols]
+
+ # reward probability distribution as used during simulation process if available
+ ground_truth = extracted_batch[self.true_reward_feature] if self.true_reward_feature else None
+
+ # propensity_score may be available from simulation: propensity_score is added to the dict
+ propensity_score = (
+ extracted_batch[self.propensity_score_feature].values if self.propensity_score_feature else None
+ )
+ if batch_idx == 0:
+ action_label_encoder.fit(unique_actions)
+ actions = action_label_encoder.transform(action_ids)
+
+ # Store information in a dictionary as required by obp package
+ data_batch = {
+ "n_rounds": len(action_ids), # number of samples
+ "n_action": len(unique_actions), # number of actions
+ "unique_actions": unique_actions, # list of actions in the whole data set
+ "action_ids": action_ids, # action identifiers
+ "action": actions, # encoded action identifiers
+ "reward": reward, # samples' reward
+ "propensity_score": propensity_score, # propensity score, pi_b(a|x), vector
+ "context": x_scale, # the matrix of features i.e. context
+ "data": policy_information, # data array with informative features
+ "ground_truth": ground_truth, # true reward probability for each action and samples, list of list
+ "cost": cost, # samples' action cost for bandit with cost control
+ }
+ if batch_idx == 0:
+ self._train_data = data_batch
+ else:
+ self._test_data = data_batch
+
+ def _estimate_propensity_score_empirical(
+ self, batch: Dict[str, Any], groupby_cols: List[str], inner_groupby_cols: Optional[List[str]] = None
+ ) -> np.ndarray:
+ """
+ Empirical propensity score: sample share per group, normalized within inner_groupby_cols when given.
+
+ Parameters
+ ----------
+ batch: Dict[str, Any]
+ Dataset dictionary
+ groupby_cols : List[str]
+ Columns to group by
+ inner_groupby_cols : Optional[List[str]]
+ Columns to group by after the first groupby
+
+ Returns
+ -------
+ propensity_score : np.ndarray
+ propensity score looked up for each sample of the batch
+ """
+ inner_groupby_cols = [] if inner_groupby_cols is None else inner_groupby_cols
+ overall_groupby_cols = groupby_cols + inner_groupby_cols
+ # number of recommended actions per group and batch
+ grouped_data = batch["data"].groupby(overall_groupby_cols)[self.reward_feature[0]].count()
+
+ # proportion of recommended actions per group
+ if inner_groupby_cols:
+ empirical_distribution = pd.DataFrame(
+ grouped_data / grouped_data.groupby(inner_groupby_cols).sum()
+ ).reset_index()
+ else:
+ empirical_distribution = pd.DataFrame(grouped_data / grouped_data.sum()).reset_index()
+
+ empirical_distribution.columns = overall_groupby_cols + ["propensity_score"]
+
+ # deal with missing segment after group by
+ if len(overall_groupby_cols) > 1:
+ all_combinations = pd.DataFrame(
+ list(product(*[empirical_distribution[col].unique() for col in overall_groupby_cols])),
+ columns=overall_groupby_cols,
+ )
+
+ # Merge with the original dataframe, filling the propensity score of unobserved combinations with 0
+ empirical_distribution = pd.merge(
+ all_combinations, empirical_distribution, on=groupby_cols + inner_groupby_cols, how="left"
+ ).fillna(0)
+
+ # map every sample of the batch to the propensity score of its group
+ matching_df = pd.DataFrame({k: batch["data"][k] for k in overall_groupby_cols})
+ merged_df = pd.merge(
+ matching_df,
+ empirical_distribution[overall_groupby_cols + ["propensity_score"]],
+ how="left", # left join to ensure we get all rows from the batch
+ on=overall_groupby_cols,
+ )
+ propensity_score = merged_df["propensity_score"].values
+
+ return propensity_score
+
+ def _empirical_averaged_propensity_score(self, batch: Dict[str, Any]) -> np.ndarray:
+ """
+ Empirical propensity score: share of each action within each value of the batch feature.
+
+ Parameters
+ ----------
+ batch : Dict[str, Any]
+ dataset.
+
+ Returns
+ -------
+ propensity_score : np.ndarray
+ estimated propensity_score
+ """
+
+ return self._estimate_propensity_score_empirical(
+ batch=batch, groupby_cols=[self.action_feature], inner_groupby_cols=[self.batch_feature]
+ )
+
+ def _empirical_propensity_score(self, batch: Dict[str, Any]) -> np.ndarray:
+ """
+ Empirical propensity score: share of each action over the whole given dataset.
+
+ Parameters
+ ----------
+ batch : Dict[str, Any]
+ dataset.
+
+ Returns
+ -------
+ np.ndarray
+ estimated propensity_score
+ """
+
+ return self._estimate_propensity_score_empirical(batch=batch, groupby_cols=[self.action_feature])
+
+ def _estimate_propensity_score(self):
+ """
+ Compute/approximate propensity score based on different methods in the train and test set.
+ The result is stored under the "propensity_score" key of the train and test dictionaries.
+ """
+ if not self.contextual_features:
+ # if no contextual features, propensity score is directly defined by the action taken,
+ # thus uniformly set to 1
+ train_propensity_score = np.ones(self._train_data["n_rounds"])
+ test_propensity_score = np.ones(self._test_data["n_rounds"])
+ logger.warning(
+ f"No contextual features available, "
+ f"overriding the requested propensity_score_model_type={self.propensity_score_model_type} "
+ f"using uniform propensity score"
+ )
+ else:
+ if self.propensity_score_model_type == "batch_empirical":
+ if self.verbose:
+ logger.info("Data batch-empirical estimation of propensity score.")
+
+ # Empirical approach: propensity score pi_b computed as action means per samples batch
+ train_propensity_score = self._empirical_averaged_propensity_score(self._train_data)
+ test_propensity_score = self._empirical_averaged_propensity_score(self._test_data)
+
+ elif self.propensity_score_model_type == "empirical":
+ if self.verbose:
+ logger.info("Data empirical estimation of propensity score.")
+
+ # Empirical approach: propensity score pi_b computed as action means over the whole data set
+ train_propensity_score = self._empirical_propensity_score(self._train_data)
+ test_propensity_score = self._empirical_propensity_score(self._test_data)
+
+ elif self.propensity_score_model_type == "propensity_score":
+ if self.verbose:
+ logger.info("Data given value of propensity score.")
+
+ train_propensity_score = self._train_data["propensity_score"]
+ test_propensity_score = self._test_data["propensity_score"]
+
+ else: # self.propensity_score_model_type in ["gbm", "rf", "logreg", "mlp"]
+ if self.verbose:
+ logger.info(
+ f"Data prediction of propensity score based on {self.propensity_score_model_type} model."
+ )
+ propensity_score_estimator = _FunctionEstimator(
+ estimator_type=self.propensity_score_model_type,
+ fast_fit=self.fast_fit,
+ action_one_hot_encoder=self._action_one_hot_encoder,
+ n_trials=self.n_trials,
+ verbose=self.verbose,
+ study_name=f"{self.propensity_score_model_type}_propensity_score",
+ multi_action_prediction=False,
+ )
+ propensity_score_estimator.fit(X=self._train_data, y=self._train_data["action"])
+ train_propensity_score = np.clip(
+ propensity_score_estimator.predict(self._train_data), self._propensity_score_epsilon, 1
+ )
+ test_propensity_score = np.clip(
+ propensity_score_estimator.predict(self._test_data), self._propensity_score_epsilon, 1
+ )
+ self._train_data["propensity_score"] = train_propensity_score
+ self._test_data["propensity_score"] = test_propensity_score
+
+ def _estimate_expected_reward(self):
+ """
+ Fit one model per reward feature on the train set and predict the expected reward of every action on the test set.
+ """
+ if self.verbose:
+ logger.info(f"Data prediction of expected reward based on {self.expected_reward_model_type} model.")
+ estimated_expected_reward = {}
+ for reward_feature, reward in zip(self.reward_feature, self._train_data["reward"].T):
+ expected_reward_model = _FunctionEstimator(
+ estimator_type=self.expected_reward_model_type,
+ fast_fit=self.fast_fit,
+ action_one_hot_encoder=self._action_one_hot_encoder,
+ n_trials=self.n_trials,
+ verbose=self.verbose,
+ study_name=f"{self.expected_reward_model_type}_expected_reward",
+ multi_action_prediction=True,
+ )
+
+ expected_reward_model.fit(X=self._train_data, y=reward.T)
+
+ # predict in test set; results are keyed by reward feature name
+ estimated_expected_reward[reward_feature] = expected_reward_model.predict(self._test_data)
+ self._estimated_expected_reward = estimated_expected_reward
+
+ def _estimate_importance_weight(self, mab: BaseMab) -> np.ndarray:
+ """
+ Compute importance weights induced by the behavior and evaluation policies.
+
+ Reference: Balanced Off-Policy Evaluation in General Action Spaces (Sondhi, Arbour, and Dimmery, 2020)
+ https://arxiv.org/pdf/1906.03694
+
+ Parameters
+ ----------
+ mab : BaseMab
+ Multi-armed bandit to be evaluated
+
+ Returns
+ -------
+ expected_importance_weights : np.ndarray
+ odds p / (1 - p) of the test-set class-1 probability, class 1 being the actions selected by mab
+ """
+ if self.verbose:
+ logger.info(f"Data prediction of importance weights based on {self.importance_weights_model_type} model.")
+
+ importance_weights_model = _FunctionEstimator(
+ estimator_type=self.importance_weights_model_type,
+ fast_fit=self.fast_fit,
+ action_one_hot_encoder=self._action_one_hot_encoder,
+ n_trials=self.n_trials,
+ verbose=self.verbose,
+ study_name=f"{self.importance_weights_model_type}_importance_weights",
+ multi_action_prediction=False,
+ )
+ train_data = deepcopy(self._train_data)
+ mab_data = self._train_data["context"] if self.contextual_features else self._train_data["n_rounds"]
+ selected_actions = _mab_predict(mab, mab_data)
+ train_data["action_ids"] = np.concatenate((train_data["action_ids"], selected_actions), axis=0)
+ train_data["context"] = np.concatenate((train_data["context"], train_data["context"]), axis=0)
+ y = np.concatenate((np.zeros(len(selected_actions)), np.ones(len(selected_actions))), axis=0)
+ importance_weights_model.fit(X=train_data, y=y)
+
+ # predict in test set; NOTE(review): a predicted probability of exactly 1 divides by zero below
+ estimated_proba = importance_weights_model.predict(self._test_data)
+ expected_importance_weights = estimated_proba / (1 - estimated_proba)
+ return expected_importance_weights
+
+ def _estimate_policy(
+ self,
+ mab: BaseMab,
+ n_mc_experiments: PositiveInt = 1000,
+ n_cores: Optional[NonNegativeInt] = None,
+ ) -> np.ndarray:
+ """
+ Estimate policy via Monte Carlo (MC) sampling based on sampling distribution of each action a in the test set.
+
+ Reference: Estimation Considerations in Contextual Bandit
+ https://arxiv.org/pdf/1711.07077.pdf
+ Reference: Debiased Off-Policy Evaluation for Recommendation Systems
+ https://arxiv.org/pdf/2002.08536.pdf
+ Reference: CAB: Continuous Adaptive Blending for Policy Evaluation and Learning
+ https://arxiv.org/pdf/1811.02672.pdf
+
+ Parameters
+ ----------
+ mab : BaseMab
+ Multi-armed bandit to be evaluated
+ n_mc_experiments: PositiveInt
+ Number of MC sampling rounds. Default: 1000
+ n_cores: Optional[NonNegativeInt], all available cores if not specified
+ Number of cores used for multiprocessing; None or 0 falls back to cpu_count()
+
+ Returns
+ -------
+ estimated_policy: np.ndarray (nb samples, nb actions)
+ action probabilities for each action and samples
+ """
+ if self.verbose:
+ logger.info("Data prediction of expected policy based on Monte Carlo experiments.")
+ n_cores = n_cores or cpu_count()
+
+ # using MC, collect one entry of selected actions per MC experiment
+ mc_actions = []
+ mab_data = self._test_data["context"] if self.contextual_features else self._test_data["n_rounds"]
+ predict_func = partial(_mab_predict, mab, mab_data)
+ with Pool(processes=n_cores) as pool:
+ # predict best action for a new prior parameters draw
+ # using argmax(p(r|a, x)) with a in the list of actions
+ for mc_action in tqdm(pool.imap_unordered(predict_func, range(n_mc_experiments))):
+ mc_actions.append(mc_action)
+
+ # finalize the dataframe shape to #samples X #mc experiments
+ mc_actions = pd.DataFrame(mc_actions).T
+
+ # for each sample / each action, count the occurrence frequency during MC iteration
+ estimated_policy = np.zeros((self._test_data["n_rounds"], len(self._test_data["unique_actions"])))
+ mc_action_counts = mc_actions.apply(pd.Series.value_counts, axis=1).fillna(0)
+
+ for u in tqdm(range(self._test_data["n_rounds"])):
+ estimated_policy[u, :] = (
+ mc_action_counts.iloc[u, :].reindex(self._test_data["unique_actions"], fill_value=0).values
+ / mc_actions.shape[1]
+ )
+ return estimated_policy
+
+    def evaluate(
+        self,
+        mab: BaseMab,
+        n_mc_experiments: int = 1000,
+        save_path: Optional[str] = None,
+        visualize: bool = True,
+    ) -> pd.DataFrame:
+        """
+        Execute the OPE process with multiple estimators simultaneously.
+
+        Only the inputs that at least one of the configured estimators requires are computed. Every
+        (objective, estimator) pair contributes exactly one row to the returned frame.
+
+        Parameters
+        ----------
+        mab : BaseMab
+            Multi-armed bandit model to be evaluated
+        n_mc_experiments : int
+            Number of Monte Carlo experiments for policy estimation
+        save_path : Optional[str], defaults to None.
+            Path to save the results. Nothing is saved if not specified.
+        visualize : bool, defaults to True.
+            Whether to visualize the results of the OPE process
+
+        Returns
+        -------
+        estimated_policy_value_df : pd.DataFrame
+            Estimated policy values and confidence intervals, with the columns
+            "value", "lower", "upper", "std", "estimator" and "objective".
+
+        Raises
+        ------
+        ValueError
+            If visualization is requested without ``save_path`` outside of a Jupyter notebook.
+        """
+        if visualize and not save_path and not in_jupyter_notebook():
+            raise ValueError("save_path is required for visualization when not running in a Jupyter notebook")
+
+        # Define OPE keyword arguments that do not depend on the objective
+        kwargs = {}
+        if self._check_argument_required_by_estimators("action", self.ope_estimators):
+            kwargs["action"] = self._test_data["action"]
+        if self._check_argument_required_by_estimators("estimated_policy", self.ope_estimators):
+            kwargs["estimated_policy"] = self._estimate_policy(mab=mab, n_mc_experiments=n_mc_experiments)
+        if self._check_argument_required_by_estimators("propensity_score", self.ope_estimators):
+            kwargs["propensity_score"] = self._test_data["propensity_score"]
+        if self._check_argument_required_by_estimators("expected_importance_weight", self.ope_estimators):
+            kwargs["expected_importance_weight"] = self._estimate_importance_weight(mab)
+
+        # Conduct OPE by multiple estimators, one objective at a time
+        results = {"value": [], "lower": [], "upper": [], "std": [], "estimator": [], "objective": []}
+        for reward_feature in self.reward_feature:
+            if self.verbose:
+                logger.info(f"Offline Policy Evaluation for {reward_feature}.")
+
+            # Objective-specific arguments overwrite the ones of the previous objective
+            if self._check_argument_required_by_estimators("reward", self.ope_estimators):
+                kwargs["reward"] = self._test_data["reward"][:, self.reward_feature.index(reward_feature)]
+            if self._check_argument_required_by_estimators("expected_reward", self.ope_estimators):
+                kwargs["expected_reward"] = self._estimated_expected_reward[reward_feature]
+
+            # Summarize policy values and their confidence intervals estimated by OPE estimators
+            for ope_estimator in self.ope_estimators:
+                estimated_policy_value, low, high, std = ope_estimator.estimate_policy_value_with_confidence_interval(
+                    **kwargs,
+                )
+                results["value"].append(estimated_policy_value)
+                results["lower"].append(low)
+                results["upper"].append(high)
+                results["std"].append(std)
+                results["estimator"].append(ope_estimator.name)
+                results["objective"].append(reward_feature)
+
+        # ``results`` accumulates the rows of all objectives, hence the frame is built exactly once.
+        # Concatenating it per objective would repeat the rows of the objectives processed earlier.
+        multi_objective_estimated_policy_value_df = pd.DataFrame.from_dict(results)
+
+        if save_path:
+            multi_objective_estimated_policy_value_df.to_csv(os.path.join(save_path, "estimated_policy_value.csv"))
+
+        if visualize:
+            self._visualize_results(save_path, multi_objective_estimated_policy_value_df)
+
+        return multi_objective_estimated_policy_value_df
+
+    def update_and_evaluate(
+        self,
+        mab: BaseMab,
+        n_mc_experiments: int = 1000,
+        save_path: Optional[str] = None,
+        visualize: bool = True,
+        with_test: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Update the multi-armed bandit with the logged data and then run the OPE process on it.
+
+        The bandit is always updated with the train data; the test data is used for an additional
+        update only when ``with_test`` is set.
+
+        Parameters
+        ----------
+        mab : BaseMab
+            Multi-armed bandit model to be updated and evaluated
+        n_mc_experiments : int
+            Number of Monte Carlo experiments for policy estimation
+        save_path : Optional[str]
+            Path to save the results. Nothing is saved if not specified.
+        visualize : bool
+            Whether to visualize the results of the OPE process
+        with_test : bool
+            Whether to update the bandit model with the test data
+
+        Returns
+        -------
+        estimated_policy_value_df : pd.DataFrame
+            Estimated policy values
+        """
+        update_sets = [self._train_data]
+        if with_test:
+            update_sets.append(self._test_data)
+        for update_set in update_sets:
+            self._update_mab(mab, update_set)
+        return self.evaluate(mab, n_mc_experiments, save_path, visualize)
+
+    def _update_mab(self, mab: BaseMab, data: Dict[str, Any]):
+        """
+        Feed logged actions and rewards (and the context, for contextual bandits) to the bandit's update.
+
+        Parameters
+        ----------
+        mab : BaseMab
+            Multi-armed bandit model to be updated.
+        data : Dict[str, Any]
+            Data used to update the bandit model. The keys "action_ids" and "reward" are read,
+            as well as "context" when contextual features are configured.
+        """
+        if self.verbose:
+            logger.info(f"Offline policy update for {type(mab)}.")
+        update_kwargs = {}
+        if self.contextual_features:
+            update_kwargs["context"] = data["context"]
+        logged_actions = data["action_ids"].tolist()
+        # squeeze drops singleton dimensions of the reward array before the list conversion
+        logged_rewards = np.squeeze(data["reward"]).tolist()
+        mab.update(actions=logged_actions, rewards=logged_rewards, **update_kwargs)
+
+    def _visualize_results(self, save_path: Optional[str], multi_objective_estimated_policy_value_df: pd.DataFrame):
+        """
+        Render one bar chart tab per objective, showing the estimated policy values with their intervals.
+
+        Parameters
+        ----------
+        save_path : Optional[str]
+            Path to save the visualization results. Required if not running in a Jupyter notebook.
+        multi_objective_estimated_policy_value_df : pd.DataFrame
+            Estimated confidence intervals. The columns "objective", "estimator", "value", "lower"
+            and "upper" are read.
+        """
+        toolbar = "crosshair, pan, wheel_zoom, box_zoom, reset, hover, save"
+        hover_fields = [
+            ("Estimator", "@estimator"),
+            ("Estimated policy value", "@value"),
+            ("Lower CI", "@lower"),
+            ("Upper CI", "@upper"),
+        ]
+
+        panels = []
+        for objective, objective_df in multi_objective_estimated_policy_value_df.groupby("objective"):
+            source = ColumnDataSource(
+                data={
+                    "estimator": objective_df["estimator"],
+                    "value": objective_df["value"],
+                    "lower": objective_df["lower"],
+                    "upper": objective_df["upper"],
+                }
+            )
+            fig = figure(
+                title=f"Policy value estimates for {objective} objective",
+                x_axis_label="Estimator",
+                y_axis_label="Estimated policy value (\u00b1 CI)",
+                sizing_mode="inherit",
+                x_range=source.data["estimator"],
+                tools=toolbar,
+                tooltips=hover_fields,
+            )
+            # one bar per estimator
+            fig.vbar(x="estimator", top="value", width=0.9, source=source)
+
+            # confidence interval: a thin line plus a narrow black bar between the lower and upper bounds
+            fig.segment(
+                x0="estimator", y0="lower", x1="estimator", y1="upper", source=source, line_width=2, color="black"
+            )
+            fig.vbar(x="estimator", width=0.1, bottom="lower", top="upper", source=source, color="black")
+
+            fig.xgrid.grid_line_color = None
+
+            panels.append(TabPanel(child=fig, title=f"{objective}"))
+
+        output_path = None
+        if save_path:
+            output_path = os.path.join(save_path, "multi_objective_estimated_policy.html")
+        visualize_via_bokeh(tabs=panels, output_path=output_path)
+
+
+def _mab_predict(mab: BaseMab, mab_data: Union[np.ndarray, PositiveInt], mc_experiment: int = 0) -> List[ActionId]:
+    """
+    Predict the actions selected by the bandit for the given data.
+
+    Defined at module level and with the bandit and data as leading arguments, so that it can be
+    wrapped by ``functools.partial`` and mapped over experiment indices.
+
+    Parameters
+    ----------
+    mab : BaseMab
+        Multi-armed bandit model
+    mab_data : Union[np.ndarray, PositiveInt]
+        Data used for the prediction; a context array (passed as ``context``) or a number of
+        samples (passed as ``n_samples``).
+    mc_experiment : int
+        placeholder for multiprocessing; the value is not used
+
+    Returns
+    -------
+    actions: List[ActionId] of shape (n_samples,)
+        The actions selected by the multi-armed bandit model (first element of the ``predict`` output).
+    """
+    # isinstance also routes ndarray subclasses to the contextual prediction
+    if isinstance(mab_data, np.ndarray):
+        mab_output = mab.predict(context=mab_data)
+    else:
+        mab_output = mab.predict(n_samples=mab_data)
+    actions = mab_output[0]
+    return actions
diff --git a/pybandits/pydantic_version_compatibility.py b/pybandits/pydantic_version_compatibility.py
index a119264..76a8a07 100644
--- a/pybandits/pydantic_version_compatibility.py
+++ b/pybandits/pydantic_version_compatibility.py
@@ -27,7 +27,19 @@
from typing import Any, Callable, Dict, Literal, Optional, Union
from warnings import warn
-from pydantic import BaseModel, Field, NonNegativeFloat, PositiveInt, ValidationError, confloat, conint, constr
+from pydantic import (
+ BaseModel,
+ Field,
+ NonNegativeFloat,
+ NonNegativeInt,
+ PositiveFloat,
+ PositiveInt,
+ PrivateAttr,
+ ValidationError,
+ confloat,
+ conint,
+ constr,
+)
from pydantic.version import VERSION as _VERSION
# Define the pydantic versions
@@ -258,6 +270,8 @@ def _convert_config_param(config: Dict[str, Any], v2_name: str, v1_name: str) ->
"model_validator",
"validate_call",
"NonNegativeFloat",
+ "NonNegativeInt",
+ "PositiveFloat",
"PositiveInt",
"BaseModel",
"ValidationError",
@@ -265,4 +279,5 @@ def _convert_config_param(config: Dict[str, Any], v2_name: str, v1_name: str) ->
"conint",
"constr",
"Field",
+ "PrivateAttr",
]
diff --git a/pybandits/strategy.py b/pybandits/strategy.py
index a67be09..c1b33d8 100644
--- a/pybandits/strategy.py
+++ b/pybandits/strategy.py
@@ -19,14 +19,18 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-
from abc import ABC, abstractmethod
from random import random
+from sys import version_info
from typing import Any, Dict, List, Optional, Union
import numpy as np
from scipy.stats import ttest_ind_from_stats
-from typing_extensions import Self
+
+if version_info >= (3, 11):
+ from typing import Self
+else:
+ from typing_extensions import Self
from pybandits.base import ActionId, Float01, Probability, PyBanditsBaseModel
from pybandits.model import Beta, BetaMOCC, Model
diff --git a/pybandits/utils.py b/pybandits/utils.py
index d0577b5..3d703df 100644
--- a/pybandits/utils.py
+++ b/pybandits/utils.py
@@ -1,5 +1,12 @@
+import inspect
import json
-from typing import Any, Callable, Dict, List, Union
+from abc import ABC
+from types import ModuleType
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from bokeh.io import curdoc, output_file, output_notebook, save, show
+from bokeh.models import InlineStyleSheet, TabPanel, Tabs
+from IPython import get_ipython
from pybandits.pydantic_version_compatibility import validate_call
@@ -42,3 +49,74 @@ def extract_argument_names_from_function(function_handle: Callable, is_class_met
start_index = int(is_class_method)
argument_names = function_handle.__code__.co_varnames[start_index : function_handle.__code__.co_argcount]
return argument_names
+
+
+@validate_call(config=dict(arbitrary_types_allowed=True))
+def get_non_abstract_classes(module: ModuleType) -> List[type]:
+    """
+    Collect the concrete classes that are defined in a module.
+
+    A class is kept when it is defined in the module itself (imported classes are ignored),
+    ``inspect.isabstract`` is false for it and ``abc.ABC`` is not one of its direct bases.
+
+    Parameters
+    ----------
+    module : ModuleType
+        Module to inspect.
+
+    Returns
+    -------
+    List[type]
+        The matching classes, in the order returned by ``inspect.getmembers``.
+    """
+    return [
+        cls
+        for _, cls in inspect.getmembers(module, inspect.isclass)
+        if cls.__module__ == module.__name__ and not inspect.isabstract(cls) and ABC not in cls.__bases__
+    ]
+
+
+def in_jupyter_notebook() -> bool:
+    """
+    Tell whether the code runs inside a Jupyter notebook (or qtconsole).
+
+    The decision is based on the class name of the object returned by ``get_ipython()``.
+
+    Reference: https://stackoverflow.com/a/39662359
+
+    Returns
+    -------
+    bool
+        True for a "ZMQInteractiveShell" shell (Jupyter notebook or qtconsole); False for any other
+        shell, e.g. IPython in a terminal, and when ``get_ipython`` is not defined.
+    """
+    try:
+        shell_name = get_ipython().__class__.__name__
+    except NameError:
+        # Probably standard Python interpreter
+        return False
+    return shell_name == "ZMQInteractiveShell"
+
+
+def visualize_via_bokeh(output_path: Optional[str], tabs: List[TabPanel]):
+    """
+    Show the tabs inline in a Jupyter notebook, or save them to an HTML file otherwise.
+
+    Parameters
+    ----------
+    output_path : Optional[str]
+        Path to the output file. Required if not running in a Jupyter notebook.
+    tabs : List[TabPanel]
+        Tab panels to display.
+
+    Raises
+    ------
+    ValueError
+        If ``output_path`` is None when not running in a Jupyter notebook.
+    """
+    notebook = in_jupyter_notebook()
+    if notebook:
+        output_notebook()
+    elif output_path is None:
+        raise ValueError("output_path is required when not running in a Jupyter notebook.")
+    else:
+        output_file(output_path)
+
+    # Style sheet that lets the tab headers wrap onto several rows
+    css = """
+    :host(.bk-Tabs) .bk-header {
+        flex-wrap: wrap !important;
+    }
+    """
+    stylesheet = InlineStyleSheet(css=css)
+    curdoc().title = "Visual report"
+    if notebook:
+        show(Tabs(tabs=tabs, stylesheets=[stylesheet]))
+    else:
+        save(Tabs(tabs=tabs, sizing_mode="stretch_both", stylesheets=[stylesheet]))
diff --git a/pyproject.toml b/pyproject.toml
index 91bf9fa..65f6303 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pybandits"
-version = "1.0.2"
+version = "1.2.0"
description = "Python Multi-Armed Bandit Library"
authors = [
"Dario d'Andrea ",
@@ -11,28 +11,41 @@ authors = [
]
license = "MIT License"
readme = "README.md"
+homepage = "https://github.com/PlaytikaOSS/pybandits"
+repository = "https://github.com/PlaytikaOSS/pybandits"
+keywords = ["multi-armed bandit", "reinforcement-learning", "optimization"]
[tool.poetry.dependencies]
-python = ">=3.8.1,<3.12"
+python = ">=3.8.1,<3.13"
loguru = "^0.6"
-numpy = "^1.23"
+numpy = [
+ { version = "<1.25", python = "3.8.*" },
+ { version = ">=1.25", python = ">=3.9,<3.12" },
+ { version = ">=1.26", python = "3.12.*" },
+]
pydantic = ">=1.10"
-scipy = "^1.9"
-pymc = "^5.3"
+scipy = [
+ { version = ">=1.9,<1.13", python = ">=3.8,<3.12" },
+ { version = ">=1.11,<1.13", python = "3.12.*" },
+]
+pymc = [
+ { version = "^5.3", python = "3.8.*" },
+ { version = "^5.10", python = ">=3.9" },
+]
scikit-learn = "^1.1"
+optuna = "^3.6"
+bokeh = "^3.1"
[tool.poetry.group.dev.dependencies]
hypothesis = "^6.68.2"
-pytest = "^7.2.2"
+pytest = "^8.3.3"
tox = "^4.4.7"
-pandas = "^1.5.3"
+pandas = ">=1.5.3"
pre-commit = "^3.1.1"
-nbdev = "^2.3.12"
-rich = "^13.3.2"
-pyzmq = "25.0.0"
+nbstripout = "^0.7.1"
ipykernel = "^6.21.3"
jupyterlab = "^3.6.1"
-pytest-cov = "^4.0.0"
+pytest-cov = "^5.0.0"
pytest_mock = "^3.14.0"
ruff = "^0.5.6"
diff --git a/tests/test_cmab.py b/tests/test_cmab.py
index fdf2173..3b365ed 100644
--- a/tests/test_cmab.py
+++ b/tests/test_cmab.py
@@ -159,7 +159,7 @@ def test_cmab_init_with_wrong_blr_models(n_features, other_n_features, update_me
)
-@settings(deadline=60000)
+@settings(deadline=None)
@given(st.just(100), st.just(3), st.sampled_from(literal_update_methods))
def test_cmab_update(n_samples, n_features, update_method):
actions = np.random.choice(["a1", "a2"], size=n_samples).tolist()
@@ -200,7 +200,7 @@ def run_update(context):
run_update(context=context)
-@settings(deadline=10000)
+@settings(deadline=None)
@given(st.just(100), st.just(3), st.sampled_from(literal_update_methods))
def test_cmab_update_not_all_actions(n_samples, n_feat, update_method):
actions = np.random.choice(["a3", "a4"], size=n_samples).tolist()
@@ -547,7 +547,7 @@ def test_cmab_bai_predict(n_samples, n_features):
assert len(selected_actions) == len(probs) == len(weighted_sums) == n_samples
-@settings(deadline=10000)
+@settings(deadline=None)
@given(st.just(100), st.just(3), st.sampled_from(literal_update_methods))
def test_cmab_bai_update(n_samples, n_features, update_method):
actions = np.random.choice(["a1", "a2"], size=n_samples).tolist()
@@ -783,7 +783,7 @@ def test_cmab_cc_predict(n_samples, n_features):
assert len(selected_actions) == len(probs) == len(weighted_sums) == n_samples
-@settings(deadline=10000)
+@settings(deadline=None)
@given(st.just(100), st.just(3), st.sampled_from(literal_update_methods))
def test_cmab_cc_update(n_samples, n_features, update_method):
actions = np.random.choice(["a1", "a2"], size=n_samples).tolist()
diff --git a/tests/test_offline_policy_estimator.py b/tests/test_offline_policy_estimator.py
new file mode 100644
index 0000000..d345413
--- /dev/null
+++ b/tests/test_offline_policy_estimator.py
@@ -0,0 +1,162 @@
+from typing import Tuple
+from unittest import mock
+
+import numpy as np
+import pytest
+from hypothesis import assume, given
+from hypothesis import strategies as st
+from hypothesis.extra.numpy import arrays
+
+from pybandits import offline_policy_estimator
+from pybandits.offline_policy_estimator import BaseOfflinePolicyEstimator
+from pybandits.utils import get_non_abstract_classes
+
+
+@st.composite
+def invalid_inputs(draw, n_samples: int = 10, n_actions: int = 2):
+ """
+ Hypothesis strategy drawing estimator inputs of which a single argument is deliberately malformed.
+
+ The argument to corrupt is sampled from the six argument names. ``action`` is drawn with shape
+ (n_samples, 2) when it is the corrupted argument and with shape (n_samples,) otherwise. Arguments
+ that were not chosen are returned as None.
+
+ Returns
+ -------
+ Tuple of (action, reward, propensity_score, estimated_policy, expected_reward,
+ expected_importance_weight).
+ """
+ reward = None
+ propensity_score = None
+ estimated_policy = None
+ expected_reward = None
+ expected_importance_weight = None
+ # choose which argument gets the malformed value
+ bad_argument = draw(
+ st.sampled_from(
+ [
+ "action",
+ "reward",
+ "propensity_score",
+ "estimated_policy",
+ "expected_reward",
+ "expected_importance_weight",
+ ]
+ )
+ )
+ if bad_argument == "action":
+ action = draw(arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, n_actions - 1)))
+ else:
+ action = draw(arrays(dtype=int, shape=(n_samples,), elements=st.integers(0, n_actions - 1)))
+ # discard draws in which not every action value occurs
+ assume(np.unique(action).size == n_actions)
+ # NOTE(review): confirm that bad_argument == "action" cannot reach the ValueError fallback that ends this chain
+ # presumably every alternative below violates a check of _check_inputs (rank, dtype, length or values) - confirm
+ if bad_argument == "reward":
+ reward = draw(
+ st.one_of(
+ arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)),
+ arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)),
+ arrays(
+ dtype=int,
+ shape=(n_samples - 1,),
+ elements=st.integers(0, 1),
+ ),
+ arrays(
+ dtype=int,
+ shape=(n_samples + 1,),
+ elements=st.integers(0, 1),
+ ),
+ )
+ )
+ elif bad_argument in ("propensity_score", "expected_importance_weight"):
+ # both arguments share the same set of malformed candidates
+ random_value = draw(
+ st.one_of(
+ arrays(dtype=float, shape=(n_samples, 2), elements=st.floats(0, 1)),
+ arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 0)),
+ arrays(dtype=int, shape=(n_samples,), elements=st.integers(0, 1)),
+ arrays(
+ dtype=float,
+ shape=(n_samples - 1,),
+ elements=st.floats(0, 1),
+ ),
+ arrays(
+ dtype=float,
+ shape=(n_samples + 1,),
+ elements=st.floats(0, 1),
+ ),
+ )
+ )
+
+ if bad_argument == "propensity_score":
+ propensity_score = random_value
+ elif bad_argument == "expected_importance_weight":
+ expected_importance_weight = random_value
+ elif bad_argument == "estimated_policy":
+ estimated_policy = draw(
+ st.one_of(
+ arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)),
+ arrays(dtype=float, shape=(n_samples, 2), elements=st.floats(0, 0)),
+ arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)),
+ arrays(
+ dtype=float,
+ shape=(n_samples - 1, 1),
+ elements=st.floats(0, 1),
+ ),
+ arrays(
+ dtype=float,
+ shape=(n_samples + 1, 1),
+ elements=st.floats(0, 1),
+ ),
+ )
+ )
+ elif bad_argument == "expected_reward":
+ expected_reward = draw(
+ st.one_of(
+ arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)),
+ arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)),
+ arrays(
+ dtype=float,
+ shape=(n_samples - 1, 1),
+ elements=st.floats(0, 1),
+ ),
+ arrays(
+ dtype=float,
+ shape=(n_samples + 1, 1),
+ elements=st.floats(0, 1),
+ ),
+ )
+ )
+ else:
+ raise ValueError(f"Invalid bad_argument: {bad_argument}")
+ return action, reward, propensity_score, estimated_policy, expected_reward, expected_importance_weight
+
+
+@mock.patch.multiple(BaseOfflinePolicyEstimator, __abstractmethods__=set())
+@given(invalid_inputs())
+def test_shape_mismatches(
+    inputs: Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
+):
+    """Input validation of the base estimator must reject every malformed input combination."""
+    action, *optional_values = inputs
+    optional_names = (
+        "reward",
+        "propensity_score",
+        "estimated_policy",
+        "expected_reward",
+        "expected_importance_weight",
+    )
+    # forward only the arguments that were actually drawn
+    kwargs = {name: value for name, value in zip(optional_names, optional_values) if value is not None}
+    # the abstract methods are patched away by the decorator, so the base class can be instantiated
+    estimator = BaseOfflinePolicyEstimator()
+    with pytest.raises(ValueError):
+        estimator._check_inputs(action=action, **kwargs)
+
+
+@given(
+    arrays(dtype=int, shape=(10,), elements=st.integers(0, 1)),
+    arrays(dtype=int, shape=(10,), elements=st.integers(0, 1)),
+    arrays(dtype=float, shape=(10,), elements=st.floats(0.01, 1)),
+    arrays(dtype=float, shape=(10, 2), elements=st.floats(0.01, 1)),
+    arrays(dtype=float, shape=(10, 2), elements=st.floats(0, 1)),
+    arrays(dtype=float, shape=(10,), elements=st.floats(0.01, 1)),
+)
+def test_default_estimators(
+    action, reward, propensity_score, estimated_policy, expected_reward, expected_importance_weight
+):
+    """Every concrete estimator of the module, built with default arguments, runs on well-formed inputs."""
+    # nothing is exercised unless more than one distinct action value was drawn
+    if np.unique(action).size <= 1:
+        return
+    estimator_inputs = {
+        "action": action,
+        "reward": reward,
+        "propensity_score": propensity_score,
+        "estimated_policy": estimated_policy,
+        "expected_reward": expected_reward,
+        "expected_importance_weight": expected_importance_weight,
+    }
+    estimators = [estimator_class() for estimator_class in get_non_abstract_classes(offline_policy_estimator)]
+    for estimator in estimators:
+        estimator.estimate_policy_value_with_confidence_interval(**estimator_inputs)
diff --git a/tests/test_offline_policy_evaluator.py b/tests/test_offline_policy_evaluator.py
new file mode 100644
index 0000000..6067529
--- /dev/null
+++ b/tests/test_offline_policy_evaluator.py
@@ -0,0 +1,300 @@
+from tempfile import TemporaryDirectory
+from typing import Dict, List, Optional, Union, get_args, get_type_hints
+
+import numpy as np
+import pandas as pd
+import pytest
+from hypothesis import given, settings
+from hypothesis import strategies as st
+from matplotlib.pyplot import close
+from pydantic import PositiveInt
+from pytest_mock import MockerFixture
+from sklearn.base import TransformerMixin
+from sklearn.preprocessing import MinMaxScaler
+
+from pybandits.cmab import CmabBernoulli, CmabBernoulliCC
+from pybandits.offline_policy_estimator import BaseOfflinePolicyEstimator
+from pybandits.offline_policy_evaluator import OfflinePolicyEvaluator
+from pybandits.smab import (
+ SmabBernoulli,
+ SmabBernoulliCC,
+ SmabBernoulliMO,
+ SmabBernoulliMOCC,
+)
+
+
+@pytest.fixture(scope="module")
+def logged_data(n_samples=10, n_actions=2, n_batches=3, n_rewards=2, n_groups=2, n_features=3):
+    """
+    Random logged data frame with ``n_samples * n_batches`` rows.
+
+    Columns: "batch", "action_id", "cost", "group", "reward_<r>", "true_reward_<r>", "context_<i>"
+    and "propensity_score". Cost, true reward and propensity score are constant per action.
+    """
+    n_rows = n_samples * n_batches
+    unique_actions = [f"a{i}" for i in range(n_actions)]
+    action_ids = np.random.choice(unique_actions, n_rows)
+    batches = [batch for batch in range(n_batches) for _ in range(n_samples)]
+    rewards = [np.random.randint(2, size=n_rows) for _ in range(n_rewards)]
+
+    # one true reward per (action, objective) pair, broadcast to the rows through the logged action
+    action_true_rewards = {(action, r): np.random.rand() for action in unique_actions for r in range(n_rewards)}
+    true_rewards = [
+        np.array([action_true_rewards[(action, r)] for action in action_ids]).reshape(n_rows)
+        for r in range(n_rewards)
+    ]
+
+    groups = np.random.randint(n_groups, size=n_rows)
+    action_costs = {action: np.random.rand() for action in unique_actions}
+    costs = np.array([action_costs[action] for action in action_ids])
+    context = np.random.rand(n_rows, n_features)
+    action_propensity_score = {action: np.random.rand() for action in unique_actions}
+    propensity_score = np.array([action_propensity_score[action] for action in action_ids])
+
+    columns = {"batch": batches, "action_id": action_ids, "cost": costs, "group": groups}
+    columns.update({f"reward_{r}": rewards[r] for r in range(n_rewards)})
+    columns.update({f"true_reward_{r}": true_rewards[r] for r in range(n_rewards)})
+    columns.update({f"context_{i}": context[:, i] for i in range(n_features)})
+    columns["propensity_score"] = propensity_score
+    return pd.DataFrame(columns)
+
+
+def test_empty_logged_data(
+    split_prop=0.5,
+    n_trials=10,
+    verbose=False,
+    batch_feature="batch",
+    action_feature="action_id",
+    reward_feature="reward",
+    propensity_score_model_type="empirical",
+    expected_reward_model_type="logreg",
+    importance_weights_model_type="logreg",
+):
+    """Validate failure for empty logged_data: an AttributeError is expected."""
+    evaluator_arguments = {
+        "logged_data": pd.DataFrame(),
+        "split_prop": split_prop,
+        "propensity_score_model_type": propensity_score_model_type,
+        "expected_reward_model_type": expected_reward_model_type,
+        "importance_weights_model_type": importance_weights_model_type,
+        "n_trials": n_trials,
+        "ope_metrics": None,
+        "batch_feature": batch_feature,
+        "action_feature": action_feature,
+        "reward_feature": reward_feature,
+        "verbose": verbose,
+    }
+    with pytest.raises(AttributeError):
+        OfflinePolicyEvaluator(**evaluator_arguments)
+
+
+@pytest.mark.usefixtures("logged_data")
+@given(
+    split_prop=st.sampled_from([0.0, 1.0]),
+    n_trials=st.just(10),
+    ope_metrics=st.just(None),
+    verbose=st.just(False),
+    batch_feature=st.just("batch"),
+    action_feature=st.just("action_id"),
+    reward_feature=st.just("reward_0"),
+    propensity_score_model_type=st.just("empirical"),
+    expected_reward_model_type=st.just("logreg"),
+    importance_weights_model_type=st.just("logreg"),
+)
+def test_initialization_extreme_split_prop(
+    logged_data: MockerFixture,
+    split_prop: float,
+    n_trials: PositiveInt,
+    ope_metrics: Optional[List[BaseOfflinePolicyEstimator]],
+    verbose: bool,
+    batch_feature: str,
+    action_feature: str,
+    reward_feature: str,
+    propensity_score_model_type: str,
+    expected_reward_model_type: str,
+    importance_weights_model_type: str,
+):
+    """Validate failure for extreme split_prop values (0.0 and 1.0): a ValueError is expected."""
+    evaluator_arguments = {
+        "logged_data": logged_data,
+        "split_prop": split_prop,
+        "propensity_score_model_type": propensity_score_model_type,
+        "expected_reward_model_type": expected_reward_model_type,
+        "importance_weights_model_type": importance_weights_model_type,
+        "n_trials": n_trials,
+        "ope_metrics": ope_metrics,
+        "batch_feature": batch_feature,
+        "action_feature": action_feature,
+        "reward_feature": reward_feature,
+        # the reward column doubles as the true reward column in this test
+        "true_reward_feature": reward_feature,
+        "verbose": verbose,
+    }
+    with pytest.raises(ValueError):
+        OfflinePolicyEvaluator(**evaluator_arguments)
+
+
+def test_initialization_mismatches(
+    logged_data: MockerFixture,
+    split_prop=0.5,
+    n_trials=10,
+    ope_metrics=None,
+    verbose=False,
+    batch_feature="batch",
+    action_feature="action_id",
+    reward_feature="reward_0",
+    propensity_score_model_type="empirical",
+    expected_reward_model_type="logreg",
+    importance_weights_model_type="logreg",
+):
+    """Validate failure for invalid initialization parameters."""
+    # arguments shared by all the scenarios below
+    shared_arguments = {
+        "logged_data": logged_data,
+        "split_prop": split_prop,
+        "expected_reward_model_type": expected_reward_model_type,
+        "importance_weights_model_type": importance_weights_model_type,
+        "n_trials": n_trials,
+        "ope_metrics": ope_metrics,
+        "batch_feature": batch_feature,
+        "action_feature": action_feature,
+        "reward_feature": reward_feature,
+    }
+
+    # more true_reward_features than rewards
+    with pytest.raises(ValueError):
+        OfflinePolicyEvaluator(
+            propensity_score_model_type=propensity_score_model_type,
+            true_reward_feature=[reward_feature, reward_feature],
+            verbose=verbose,
+            **shared_arguments,
+        )
+
+    # missing propensity_score_feature
+    with pytest.raises(ValueError):
+        OfflinePolicyEvaluator(
+            propensity_score_model_type="propensity_score",
+            visualize=False,
+            **shared_arguments,
+        )
+
+    # missing context
+    with pytest.raises(AttributeError):
+        OfflinePolicyEvaluator(
+            propensity_score_model_type=propensity_score_model_type,
+            verbose=False,
+            contextual_features=["non_existent"],
+            **shared_arguments,
+        )
+
+
+@pytest.mark.usefixtures("logged_data")
+@settings(deadline=None)
+@given(
+    split_prop=st.just(0.5),
+    n_trials=st.just(10),
+    fast_fit=st.booleans(),
+    scaler=st.sampled_from([None, MinMaxScaler()]),
+    verbose=st.booleans(),
+    visualize=st.booleans(),
+    propensity_score_model_type=st.sampled_from(
+        get_args(get_type_hints(OfflinePolicyEvaluator)["propensity_score_model_type"])
+    ),
+    expected_reward_model_type=st.sampled_from(
+        get_args(get_type_hints(OfflinePolicyEvaluator)["expected_reward_model_type"])
+    ),
+    importance_weights_model_type=st.sampled_from(
+        get_args(get_type_hints(OfflinePolicyEvaluator)["importance_weights_model_type"])
+    ),
+    batch_feature=st.just("batch"),
+    action_feature=st.just("action_id"),
+    reward_feature=st.sampled_from(["reward_0", ["reward_0", "reward_1"]]),
+    context=st.booleans(),
+    group_feature=st.sampled_from(["group", None]),
+    cost_feature=st.sampled_from(["cost", None]),
+    propensity_score_feature=st.just("propensity_score"),
+    n_mc_experiments=st.just(2),
+    update=st.booleans(),
+)
+def test_running_configuration(
+    logged_data: MockerFixture,
+    split_prop: float,
+    n_trials: PositiveInt,
+    fast_fit: bool,
+    scaler: Optional[Union[TransformerMixin, Dict[str, TransformerMixin]]],
+    verbose: bool,
+    visualize: bool,
+    propensity_score_model_type: str,
+    expected_reward_model_type: str,
+    importance_weights_model_type: str,
+    batch_feature: str,
+    action_feature: str,
+    reward_feature: Union[str, List[str]],
+    context: bool,
+    group_feature: Optional[str],
+    cost_feature: Optional[str],
+    propensity_score_feature: Optional[str],
+    n_mc_experiments: int,
+    update: bool,
+):
+    """
+    Test various OfflinePolicyEvaluator configurations to validate that everything works.
+
+    A bandit matching the drawn configuration (contextual or not, with or without cost, single or
+    multi objective) is cold-started and then evaluated, optionally after an update.
+    """
+    multi_objective = isinstance(reward_feature, list)
+    if context and multi_objective:
+        return  # CmabMO and CmabMOCC are not supported yet
+
+    true_reward_feature = (
+        [f"true_{r}" for r in reward_feature] if multi_objective else f"true_{reward_feature}"
+    )
+    contextual_features = [col for col in logged_data.columns if col.startswith("context")] if context else None
+    unique_actions = logged_data["action_id"].unique()
+    if cost_feature:
+        # the cost is constant per action, hence the first logged value of each action is used
+        action_ids_cost = {
+            action_id: logged_data["cost"][logged_data["action_id"] == action_id].iloc[0]
+            for action_id in unique_actions
+        }
+
+    if context:
+        if cost_feature:
+            mab = CmabBernoulliCC.cold_start(action_ids_cost=action_ids_cost, n_features=len(contextual_features))
+        else:
+            mab = CmabBernoulli.cold_start(action_ids=set(unique_actions), n_features=len(contextual_features))
+    elif cost_feature:
+        if multi_objective:
+            mab = SmabBernoulliMOCC.cold_start(action_ids_cost=action_ids_cost, n_objectives=len(reward_feature))
+        else:
+            mab = SmabBernoulliCC.cold_start(action_ids_cost=action_ids_cost)
+    elif multi_objective:
+        mab = SmabBernoulliMO.cold_start(action_ids=set(unique_actions), n_objectives=len(reward_feature))
+    else:
+        mab = SmabBernoulli.cold_start(action_ids=set(unique_actions))
+
+    evaluator = OfflinePolicyEvaluator(
+        logged_data=logged_data,
+        split_prop=split_prop,
+        n_trials=n_trials,
+        fast_fit=fast_fit,
+        scaler=scaler,
+        ope_estimators=None,
+        verbose=verbose,
+        propensity_score_model_type=propensity_score_model_type,
+        expected_reward_model_type=expected_reward_model_type,
+        importance_weights_model_type=importance_weights_model_type,
+        batch_feature=batch_feature,
+        action_feature=action_feature,
+        reward_feature=reward_feature,
+        true_reward_feature=true_reward_feature,
+        contextual_features=contextual_features,
+        group_feature=group_feature,
+        cost_feature=cost_feature,
+        propensity_score_feature=propensity_score_feature,
+    )
+    execution_func = evaluator.update_and_evaluate if update else evaluator.evaluate
+    # results and plots are written to a throw-away directory
+    with TemporaryDirectory() as tmp_dir:
+        execution_func(mab=mab, visualize=visualize, n_mc_experiments=n_mc_experiments, save_path=tmp_dir)
+    if visualize:
+        close("all")  # close all figures to avoid memory leak