From eb2eebe1cb3234c5288fb75aca7b9beb137c0695 Mon Sep 17 00:00:00 2001 From: Shahar Bar <33932594+shaharbar1@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:28:40 +0300 Subject: [PATCH] Add offline policy evaluation module and update dependencies ### Changes * Introduced `offline_policy_evaluator.py` with classes for propensity score estimation and offline policy evaluation. * Introduced `offline_policy_estimator.py` with classes for offline policy estimation. * Updated `pyproject.toml` to include new dependencies: `bokeh` and `optuna`. Further adjusted existing dependencies to compatible versions and added python 3.12 support. * Changed .pre-commit-config.yaml to utilize nbstripout instead of nbdev_clean. * Added caching of dependencies on CI and CD. * Added class method to PyBanditsBaseModel on base.py to allow seeing default values for arguments that were not passed to the model. * Added test_offline_policy_evaluator.py and test_offline_policy_estimator.py as a test suite for the OfflinePolicyEvaluator. * Added `get_non_abstract_classes`, `visualize_via_bokeh` and `in_jupyter_notebook` utility functions. 
--- .github/workflows/continuous_delivery.yml | 15 +- .github/workflows/continuous_integration.yml | 19 +- .pre-commit-config.yaml | 6 +- docs/src/tutorials/cmab.ipynb | 79 +- docs/src/tutorials/simulation_cmab.ipynb | 299 +---- docs/src/tutorials/simulation_smab.ipynb | 38 +- docs/src/tutorials/smab.ipynb | 36 +- docs/tutorials/cmab.ipynb | 79 +- docs/tutorials/mab.ipynb | 337 +----- docs/tutorials/simulation_cmab.ipynb | 299 +---- docs/tutorials/simulation_smab.ipynb | 38 +- docs/tutorials/smab.ipynb | 36 +- docs/tutorials/smab_mo_cc.ipynb | 409 +------ pybandits/offline_policy_estimator.py | 807 +++++++++++++ pybandits/offline_policy_evaluator.py | 1127 ++++++++++++++++++ pybandits/pydantic_version_compatibility.py | 17 +- pybandits/strategy.py | 8 +- pybandits/utils.py | 80 +- pyproject.toml | 35 +- tests/test_cmab.py | 8 +- tests/test_offline_policy_estimator.py | 162 +++ tests/test_offline_policy_evaluator.py | 300 +++++ 22 files changed, 2741 insertions(+), 1493 deletions(-) create mode 100644 pybandits/offline_policy_estimator.py create mode 100644 pybandits/offline_policy_evaluator.py create mode 100644 tests/test_offline_policy_estimator.py create mode 100644 tests/test_offline_policy_evaluator.py diff --git a/.github/workflows/continuous_delivery.yml b/.github/workflows/continuous_delivery.yml index ab2679e..4059d01 100644 --- a/.github/workflows/continuous_delivery.yml +++ b/.github/workflows/continuous_delivery.yml @@ -29,10 +29,23 @@ jobs: export PATH="$HOME/.poetry/bin:$PATH" - name: Backup pyproject.toml run: cp pyproject.toml pyproject.toml.bak + - name: Change pydantic version + run: | + poetry add pydantic@${{ matrix.pydantic-version }} --lock + - name: Cache Poetry virtualenv and dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cache/pypoetry + ~/.local/share/pypoetry/virtualenvs + key: ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-${{ hashFiles('poetry.lock') }} + restore-keys: | + ${{ runner.os 
}}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}- - name: Install project dependencies with Poetry run: | - poetry add pydantic@${{ matrix.pydantic-version }} poetry install + - name: Restore pyproject.toml + run: | mv pyproject.toml.bak pyproject.toml - name: Style check run: | diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 2853deb..3b9de13 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] pydantic-version: [ "1.10.*", "2.*" ] steps: @@ -35,9 +35,20 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python3 - export PATH="$HOME/.poetry/bin:$PATH" + - name: Change pydantic version + run: | + poetry add pydantic@${{ matrix.pydantic-version }} --lock + - name: Cache Poetry virtualenv and dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cache/pypoetry + ~/.local/share/pypoetry/virtualenvs + key: ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}-${{ hashFiles('poetry.lock') }} + restore-keys: | + ${{ runner.os }}-poetry-${{ matrix.python-version }}-${{ matrix.pydantic-version }}- - name: Install project dependencies with Poetry run: | - poetry add pydantic@${{ matrix.pydantic-version }} poetry install - name: Style check run: | @@ -45,4 +56,8 @@ jobs: poetry run pre-commit run --all-files - name: Run tests run: | + START_TIME=$(date +%s) poetry run pytest -vv -k 'not time and not update_parallel' + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + echo "Tests completed in $DURATION seconds." 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c185f1b..d0b0d5f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: types_or: [ python, pyi, jupyter ] require_serial: true - - repo: https://github.com/fastai/nbdev - rev: 2.3.11 + - repo: https://github.com/kynan/nbstripout + rev: 0.7.1 hooks: - - id: nbdev_clean + - id: nbstripout diff --git a/docs/src/tutorials/cmab.ipynb b/docs/src/tutorials/cmab.ipynb index 07b74fa..cb83a7e 100644 --- a/docs/src/tutorials/cmab.ipynb +++ b/docs/src/tutorials/cmab.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false @@ -56,31 +56,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "X: context matrix of shape (n_samples, n_features)\n", - "[[-0.53211475 -0.40592956 0.05892565 -0.88067628 -0.84061481]\n", - " [-0.95680954 -0.00540581 0.09148556 -0.82021004 -0.63425381]\n", - " [-0.87792928 -0.51881823 -0.51767022 -0.05385187 -0.64499044]\n", - " [-0.10569516 0.30847784 -0.353929 -0.94831998 -0.52175713]\n", - " [-0.05088401 0.17155683 -0.4322128 -0.07509104 -0.78919832]\n", - " [-0.88604157 0.55037109 0.42634479 -0.87179776 -0.69767766]\n", - " [-0.0022063 0.99304089 0.76398198 -0.87343131 -0.12363411]\n", - " [ 0.36371019 0.6660538 0.17177652 -0.08891719 -0.91070485]\n", - " [-0.1056742 -0.72879406 -0.69367421 -0.8684397 0.70903817]\n", - " [-0.15422305 0.31069811 -0.47487951 0.00853137 0.23793364]]\n" - ] - } - ], + "outputs": [], "source": [ "# context\n", "n_samples = 1000\n", @@ -92,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ @@ -126,18 +108,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recommended action: ['action C' 'action C' 'action B' 'action B' 'action C' 'action C'\n", - " 'action B' 'action C' 'action B' 'action C']\n" - ] - } - ], + "outputs": [], "source": [ "# predict action\n", "pred_actions, _ = cmab.predict(X)\n", @@ -153,17 +126,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulated rewards: [1 0 0 0 0 0 0 0 1 1]\n" - ] - } - ], + "outputs": [], "source": [ "# simulate reward from environment\n", "simulated_rewards = np.random.randint(2, size=n_samples)\n", @@ -179,31 +144,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - } - ], + "outputs": [], "source": [ "# update 
model\n", "cmab.update(X, actions=pred_actions, rewards=simulated_rewards)" diff --git a/docs/src/tutorials/simulation_cmab.ipynb b/docs/src/tutorials/simulation_cmab.ipynb index 1ce2423..5a972ee 100644 --- a/docs/src/tutorials/simulation_cmab.ipynb +++ b/docs/src/tutorials/simulation_cmab.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -80,77 +80,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Probability of positive reward for each group/action:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
action Aaction Baction C
00.050.800.05
10.800.050.05
20.800.050.80
\n", - "
" - ], - "text/plain": [ - " action A action B action C\n", - "0 0.05 0.80 0.05\n", - "1 0.80 0.05 0.05\n", - "2 0.80 0.05 0.80" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# init probability of rewards from the environment\n", "prob_rewards = pd.DataFrame(\n", @@ -171,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -181,24 +113,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Setup simulation completed.\n", - "Simulated input probability rewards:\n", - " action A action B action C\n", - "group \n", - "0 0.041176 0.835294 0.052941\n", - "1 0.819277 0.036145 0.054217\n", - "2 0.786585 0.042683 0.817073 \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# init simulation\n", "sim = SimulationCmab(\n", @@ -222,205 +139,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #1\n", - "Start predict batch 1 ...\n", - "Start update batch 1 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 11 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 10 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #2\n", - "Start predict batch 2 ...\n", - "Start update batch 2 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #3\n", - "Start predict batch 3 ...\n", - "Start update batch 3 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #4\n", - "Start predict batch 4 ...\n", - "Start update batch 4 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #5\n", - "Start predict batch 5 ...\n", - "Start update batch 5 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulation results (first 10 observations):\n", - " action reward group selected_prob_reward max_prob_reward regret \\\n", - "0 action C 0.0 1 0.05 0.8 0.75 \n", - "1 action C 1.0 2 0.80 0.8 0.00 \n", - "2 action B 1.0 0 0.80 0.8 0.00 \n", - "3 action C 0.0 1 0.05 0.8 0.75 \n", - "4 action C 0.0 1 0.05 0.8 0.75 \n", - "5 action B 1.0 0 0.80 0.8 0.00 \n", - "6 action A 0.0 0 0.05 0.8 0.75 \n", - "7 action C 0.0 2 0.80 0.8 0.00 \n", - "8 action C 0.0 1 0.05 0.8 0.75 \n", - "9 action C 1.0 2 0.80 0.8 0.00 \n", - "\n", - " cum_regret \n", - "0 0.75 \n", - "1 0.75 \n", - "2 0.75 \n", - "3 1.50 \n", - "4 2.25 \n", - "5 2.25 \n", - "6 3.00 \n", - "7 3.00 \n", - "8 3.75 \n", - "9 3.75 \n", - "\n", - "Count of actions selected by the bandit: \n", - " {'group 0': {'action B': 85, 'action A': 53, 'action C': 32}, 'group 1': {'action A': 109, 'action C': 31, 'action B': 26}, 'group 2': {'action A': 70, 'action C': 59, 'action B': 35}} \n", - "\n", - 
"Observed proportion of positive rewards for each action:\n", - " {'group 0': {'action B': 0.788235294117647, 'action A': 0.03773584905660377, 'action C': 0.03125}, 'group 1': {'action A': 0.7981651376146789, 'action B': 0.07692307692307693, 'action C': 0.03225806451612903}, 'group 2': {'action A': 0.7142857142857143, 'action C': 0.8305084745762712, 'action B': 0.02857142857142857}} \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "sim.run()" ] diff --git a/docs/src/tutorials/simulation_smab.ipynb b/docs/src/tutorials/simulation_smab.ipynb index df7d1e8..e15d57f 100644 --- a/docs/src/tutorials/simulation_smab.ipynb +++ b/docs/src/tutorials/simulation_smab.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -95,35 +95,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulation results (first 10 observations):\n", - " action reward\n", - "0 Action B 0.0\n", - "1 Action C 1.0\n", - "2 Action C 0.0\n", - "3 Action A 1.0\n", - "4 Action B 1.0\n", - "5 Action C 1.0\n", - "6 Action A 1.0\n", - "7 Action A 1.0\n", - "8 Action B 0.0\n", - "9 Action B 0.0 \n", - "\n", - "Count of actions selected by the bandit: \n", - " {'Action C': 38670, 'Action B': 683, 'Action A': 647} \n", - "\n", - "Observed proportion of positive rewards for each action:\n", - " {'Action A': 0.6120556414219475, 'Action B': 0.4978038067349927, 'Action C': 
0.7995603827256271} \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# run simulation\n", "sim.run()" diff --git a/docs/src/tutorials/smab.ipynb b/docs/src/tutorials/smab.ipynb index ed119e1..c4bc60c 100644 --- a/docs/src/tutorials/smab.ipynb +++ b/docs/src/tutorials/smab.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "tags": [] }, @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -98,17 +98,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recommended action: ['Action C', 'Action C', 'Action C', 'Action B', 'Action B', 'Action C', 'Action B', 'Action C', 'Action A', 'Action B']\n" - ] - } - ], + "outputs": [], "source": [ "# predict actions\n", "pred_actions, _ = smab.predict(n_samples=1000)\n", @@ -124,19 +116,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Action A: n_successes=285, n_failures=31\n", - "Action B: n_successes=123, n_failures=210\n", - "Action C: n_successes=261, n_failures=90\n" - ] - } - ], + "outputs": [], "source": [ "# simulate rewards from environment\n", "n_successes, n_failures = {}, {}\n", @@ -155,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/tutorials/cmab.ipynb b/docs/tutorials/cmab.ipynb index 07b74fa..cb83a7e 100644 --- 
a/docs/tutorials/cmab.ipynb +++ b/docs/tutorials/cmab.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false @@ -56,31 +56,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "X: context matrix of shape (n_samples, n_features)\n", - "[[-0.53211475 -0.40592956 0.05892565 -0.88067628 -0.84061481]\n", - " [-0.95680954 -0.00540581 0.09148556 -0.82021004 -0.63425381]\n", - " [-0.87792928 -0.51881823 -0.51767022 -0.05385187 -0.64499044]\n", - " [-0.10569516 0.30847784 -0.353929 -0.94831998 -0.52175713]\n", - " [-0.05088401 0.17155683 -0.4322128 -0.07509104 -0.78919832]\n", - " [-0.88604157 0.55037109 0.42634479 -0.87179776 -0.69767766]\n", - " [-0.0022063 0.99304089 0.76398198 -0.87343131 -0.12363411]\n", - " [ 0.36371019 0.6660538 0.17177652 -0.08891719 -0.91070485]\n", - " [-0.1056742 -0.72879406 -0.69367421 -0.8684397 0.70903817]\n", - " [-0.15422305 0.31069811 -0.47487951 0.00853137 0.23793364]]\n" - ] - } - ], + "outputs": [], "source": [ "# context\n", "n_samples = 1000\n", @@ -92,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -126,18 +108,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recommended action: ['action C' 'action C' 'action B' 'action B' 'action C' 'action C'\n", - " 'action B' 'action C' 'action B' 'action C']\n" - ] - } - ], + "outputs": [], "source": [ "# predict action\n", "pred_actions, _ = cmab.predict(X)\n", @@ -153,17 +126,9 @@ }, { 
"cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulated rewards: [1 0 0 0 0 0 0 0 1 1]\n" - ] - } - ], + "outputs": [], "source": [ "# simulate reward from environment\n", "simulated_rewards = np.random.randint(2, size=n_samples)\n", @@ -179,31 +144,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - } - ], + "outputs": [], "source": [ "# update model\n", "cmab.update(X, actions=pred_actions, rewards=simulated_rewards)" diff --git a/docs/tutorials/mab.ipynb b/docs/tutorials/mab.ipynb index 22c5666..d139501 100644 --- a/docs/tutorials/mab.ipynb +++ b/docs/tutorials/mab.ipynb @@ -3,7 +3,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "e2595bf3-9767-4338-9a51-ce706dc306cf", + "id": "0", "metadata": {}, "source": [ "# Stochastic Bernoulli Bandit" @@ -11,8 +11,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": 
"8f8462e5-f38e-4b04-9002-07ababe3ee0c", + "execution_count": null, + "id": "1", "metadata": {}, "outputs": [], "source": [ @@ -25,21 +25,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "75d6f625", + "execution_count": null, + "id": "2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'%.2f'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# print 2 decimal places in the notebook\n", "%precision %.2f" @@ -48,7 +37,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "b6b37329-6a3b-4f2a-87a5-e0dcbbb1bb69", + "id": "3", "metadata": {}, "source": [ "## 1. Initialization\n", @@ -58,7 +47,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "2ca215bf-6321-4819-a539-ebf1f378436a", + "id": "4", "metadata": {}, "source": [ "### 1.1 Initialize via class constructor\n", @@ -68,8 +57,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "701111ff-b659-49b7-8cf5-8349536b4cd8", + "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ @@ -84,38 +73,10 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "55112a02-8df2-4895-9414-ddabbfc8ecac", + "execution_count": null, + "id": "6", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulli(\n",
-       "    actions={\n",
-       "        'a1': Beta(n_successes=1, n_failures=1),\n",
-       "        'a2': Beta(n_successes=1, n_failures=1),\n",
-       "        'a3': Beta(n_successes=1, n_failures=1)\n",
-       "    },\n",
-       "    strategy=ClassicBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(mab)" ] @@ -123,7 +84,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "f2ee7bdc-3881-47a5-b7d4-84862f70e643", + "id": "7", "metadata": {}, "source": [ "### 1.2 Initialize via utility function (for cold start)" @@ -132,7 +93,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "564914fd-73cc-4854-8ec7-548970f794a6", + "id": "8", "metadata": {}, "source": [ "You can initialize the bandit via the utility function `SmabBernoulliMOCC.cold_start()`. This is particulary useful in a cold start setting when there is no prior knowledge on the Beta distruibutions. In this case for all Betas `n_successes` and `n_failures` are set to `1`." 
@@ -140,8 +101,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "dbfb0ddd-4c16-441f-8c68-16020e425d57", + "execution_count": null, + "id": "9", "metadata": {}, "outputs": [], "source": [ @@ -151,38 +112,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "fcc3649c-d08c-46db-a534-f61d97962c99", + "execution_count": null, + "id": "10", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulli(\n",
-       "    actions={\n",
-       "        'a1': Beta(n_successes=1, n_failures=1),\n",
-       "        'a3': Beta(n_successes=1, n_failures=1),\n",
-       "        'a2': Beta(n_successes=1, n_failures=1)\n",
-       "    },\n",
-       "    strategy=ClassicBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(mab)" ] @@ -190,7 +123,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "aa91a5ed-83cc-4016-aa3e-17b8a102bb77", + "id": "11", "metadata": {}, "source": [ "## 2. 
Function `predict()`" @@ -198,46 +131,18 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "a735c03d-cde4-4147-a50d-4b82dd9c1792", + "execution_count": null, + "id": "12", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method predict in module pybandits.smab:\n", - "\n", - "predict(n_samples: pydantic.types.PositiveInt = 1, forbidden_actions: Optional[Set[pybandits.base.ActionId]] = None) -> Tuple[List[pybandits.base.ActionId], List[Dict[pybandits.base.ActionId, pybandits.base.Probability]]] method of pybandits.smab.SmabBernoulli instance\n", - " Predict actions.\n", - " \n", - " Parameters\n", - " ----------\n", - " n_samples : int > 0, default=1\n", - " Number of samples to predict.\n", - " forbidden_actions : Optional[Set[ActionId]], default=None\n", - " Set of forbidden actions. If specified, the model will discard the forbidden_actions and it will only\n", - " consider the remaining allowed_actions. By default, the model considers all actions as allowed_actions.\n", - " Note that: actions = allowed_actions U forbidden_actions.\n", - " \n", - " Returns\n", - " -------\n", - " actions: List[ActionId] of shape (n_samples,)\n", - " The actions selected by the multi-armed bandit model.\n", - " probs: List[Dict[ActionId, Probability]] of shape (n_samples,)\n", - " The probabilities of getting a positive reward for each action.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "help(mab.predict)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "f3d9cb8b-7d9b-437b-bbc2-e7a55475a1fb", + "execution_count": null, + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -247,54 +152,28 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "a9284b11-05ba-4cda-9597-b69e6d7632a3", + "execution_count": null, + "id": "14", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a3', 'a1', 'a3', 'a1', 'a3']" - ] - }, - "execution_count": 9, - "metadata": {}, - 
"output_type": "execute_result" - } - ], + "outputs": [], "source": [ "actions" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "84cdbed4-9aa5-42e1-84db-1f8f72c52d93", + "execution_count": null, + "id": "15", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'a1': 0.68, 'a3': 0.77, 'a2': 0.51},\n", - " {'a1': 0.85, 'a3': 0.18, 'a2': 0.82},\n", - " {'a1': 0.68, 'a3': 0.82, 'a2': 0.42},\n", - " {'a1': 0.98, 'a3': 0.72, 'a2': 0.22},\n", - " {'a1': 0.72, 'a3': 0.83, 'a2': 0.13}]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "probs" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "bfc53fc8-b1bf-42ea-907a-fa5fb7173199", + "execution_count": null, + "id": "16", "metadata": {}, "outputs": [], "source": [ @@ -304,46 +183,20 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "696d58f4-ca5f-41d4-983f-bc7a5351ab28", + "execution_count": null, + "id": "17", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a2', 'a2', 'a2', 'a3', 'a2']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "actions" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "f5826785-a5c6-4c06-9bab-9f05134e783e", + "execution_count": null, + "id": "18", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'a3': 0.71, 'a2': 0.86},\n", - " {'a3': 0.51, 'a2': 0.55},\n", - " {'a3': 0.42, 'a2': 0.87},\n", - " {'a3': 0.89, 'a2': 0.52},\n", - " {'a3': 0.41, 'a2': 0.42}]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "probs" ] @@ -351,7 +204,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "d89f7199-bec3-407d-92a9-bdf917c13de6", + "id": "19", "metadata": {}, "source": [ "## 3. 
Function `update()`" @@ -359,42 +212,18 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "140eb2fc-3659-4c13-86d1-ec5a575c79c1", + "execution_count": null, + "id": "20", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method update in module pybandits.smab:\n", - "\n", - "update(actions: List[pybandits.base.ActionId], rewards: List[pybandits.base.BinaryReward]) method of pybandits.smab.SmabBernoulli instance\n", - " Update the stochastic Bernoulli bandit given the list of selected actions and their corresponding binary\n", - " rewards.\n", - " \n", - " Parameters\n", - " ----------\n", - " actions : List[ActionId] of shape (n_samples,), e.g. ['a1', 'a2', 'a3', 'a4', 'a5']\n", - " The selected action for each sample.\n", - " rewards : List[Union[BinaryReward, List[BinaryReward]]] of shape (n_samples, n_objectives)\n", - " The binary reward for each sample.\n", - " If strategy is not MultiObjectiveBandit, rewards should be a list, e.g.\n", - " rewards = [1, 0, 1, 1, 1, ...]\n", - " If strategy is MultiObjectiveBandit, rewards should be a list of list, e.g. (with n_objectives=2):\n", - " rewards = [[1, 1], [1, 0], [1, 1], [1, 0], [1, 1], ...]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "help(mab.update)" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "2526ed6d-82d4-4485-bc6e-b5cb53dd78a5", + "execution_count": null, + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -404,38 +233,10 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "3bd0ab45-94e8-415b-adea-a089c54f6274", + "execution_count": null, + "id": "22", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulli(\n",
-       "    actions={\n",
-       "        'a1': Beta(n_successes=1, n_failures=1),\n",
-       "        'a3': Beta(n_successes=2, n_failures=1),\n",
-       "        'a2': Beta(n_successes=3, n_failures=3)\n",
-       "    },\n",
-       "    strategy=ClassicBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m2\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# update\n", "mab.update(actions=actions, rewards=rewards)\n", @@ -445,7 +246,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "9823d84c-862b-4bb6-ab36-024f34460595", + "id": "23", "metadata": {}, "source": [ "## 4. Example of usage\n", @@ -455,8 +256,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "a785463d-d710-4844-80bf-42c09b0e0b45", + "execution_count": null, + "id": "24", "metadata": {}, "outputs": [], "source": [ @@ -476,38 +277,10 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "034add3d-e6f3-471c-b8b9-30c286faf2cc", + "execution_count": null, + "id": "25", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulli(\n",
-       "    actions={\n",
-       "        'a1': Beta(n_successes=337, n_failures=369),\n",
-       "        'a3': Beta(n_successes=4448, n_failures=4315),\n",
-       "        'a2': Beta(n_successes=246, n_failures=296)\n",
-       "    },\n",
-       "    strategy=ClassicBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulli\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m337\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m369\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m4448\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m4315\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m246\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m296\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[33mstrategy\u001b[0m=\u001b[1;35mClassicBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(mab)" ] diff --git a/docs/tutorials/simulation_cmab.ipynb b/docs/tutorials/simulation_cmab.ipynb index 1ce2423..5a972ee 100644 --- a/docs/tutorials/simulation_cmab.ipynb +++ b/docs/tutorials/simulation_cmab.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -80,77 +80,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Probability of positive reward for each group/action:\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
action Aaction Baction C
00.050.800.05
10.800.050.05
20.800.050.80
\n", - "
" - ], - "text/plain": [ - " action A action B action C\n", - "0 0.05 0.80 0.05\n", - "1 0.80 0.05 0.05\n", - "2 0.80 0.05 0.80" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# init probability of rewards from the environment\n", "prob_rewards = pd.DataFrame(\n", @@ -171,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -181,24 +113,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Setup simulation completed.\n", - "Simulated input probability rewards:\n", - " action A action B action C\n", - "group \n", - "0 0.041176 0.835294 0.052941\n", - "1 0.819277 0.036145 0.054217\n", - "2 0.786585 0.042683 0.817073 \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# init simulation\n", "sim = SimulationCmab(\n", @@ -222,205 +139,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #1\n", - "Start predict batch 1 ...\n", - "Start update batch 1 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 11 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 10 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #2\n", - "Start predict batch 2 ...\n", - "Start update batch 2 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 5 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #3\n", - "Start predict batch 3 ...\n", - "Start update batch 3 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 9 seconds.\n", - "The number of effective samples is smaller than 25% for some parameters.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #4\n", - "Start predict batch 4 ...\n", - "Start update batch 4 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration #5\n", - "Start predict batch 5 ...\n", - "Start update batch 5 ... 
\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 4 seconds.\n", - "Auto-assigning NUTS sampler...\n", - "Initializing NUTS using adapt_diag...\n", - "Sequential sampling (2 chains in 1 job)\n", - "NUTS: [beta4, beta3, beta2, beta1, beta0, alpha]\n", - "Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 3 seconds.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulation results (first 10 observations):\n", - " action reward group selected_prob_reward max_prob_reward regret \\\n", - "0 action C 0.0 1 0.05 0.8 0.75 \n", - "1 action C 1.0 2 0.80 0.8 0.00 \n", - "2 action B 1.0 0 0.80 0.8 0.00 \n", - "3 action C 0.0 1 0.05 0.8 0.75 \n", - "4 action C 0.0 1 0.05 0.8 0.75 \n", - "5 action B 1.0 0 0.80 0.8 0.00 \n", - "6 action A 0.0 0 0.05 0.8 0.75 \n", - "7 action C 0.0 2 0.80 0.8 0.00 \n", - "8 action C 0.0 1 0.05 0.8 0.75 \n", - "9 action C 1.0 2 0.80 0.8 0.00 \n", - "\n", - " cum_regret \n", - "0 0.75 \n", - "1 0.75 \n", - "2 0.75 \n", - "3 1.50 \n", - "4 2.25 \n", - "5 2.25 \n", - "6 3.00 \n", - "7 3.00 \n", - "8 3.75 \n", - "9 3.75 \n", - "\n", - "Count of actions selected by the bandit: \n", - " {'group 0': {'action B': 85, 'action A': 53, 'action C': 32}, 'group 1': {'action A': 109, 'action C': 31, 'action B': 26}, 'group 2': {'action A': 70, 'action C': 59, 'action B': 35}} \n", - "\n", - 
"Observed proportion of positive rewards for each action:\n", - " {'group 0': {'action B': 0.788235294117647, 'action A': 0.03773584905660377, 'action C': 0.03125}, 'group 1': {'action A': 0.7981651376146789, 'action B': 0.07692307692307693, 'action C': 0.03225806451612903}, 'group 2': {'action A': 0.7142857142857143, 'action C': 0.8305084745762712, 'action B': 0.02857142857142857}} \n", - "\n" - ] - } - ], + "outputs": [], "source": [ "sim.run()" ] diff --git a/docs/tutorials/simulation_smab.ipynb b/docs/tutorials/simulation_smab.ipynb index df7d1e8..e15d57f 100644 --- a/docs/tutorials/simulation_smab.ipynb +++ b/docs/tutorials/simulation_smab.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -95,35 +95,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Simulation results (first 10 observations):\n", - " action reward\n", - "0 Action B 0.0\n", - "1 Action C 1.0\n", - "2 Action C 0.0\n", - "3 Action A 1.0\n", - "4 Action B 1.0\n", - "5 Action C 1.0\n", - "6 Action A 1.0\n", - "7 Action A 1.0\n", - "8 Action B 0.0\n", - "9 Action B 0.0 \n", - "\n", - "Count of actions selected by the bandit: \n", - " {'Action C': 38670, 'Action B': 683, 'Action A': 647} \n", - "\n", - "Observed proportion of positive rewards for each action:\n", - " {'Action A': 0.6120556414219475, 'Action B': 0.4978038067349927, 'Action C': 0.7995603827256271} \n", - "\n" 
- ] - } - ], + "outputs": [], "source": [ "# run simulation\n", "sim.run()" diff --git a/docs/tutorials/smab.ipynb b/docs/tutorials/smab.ipynb index ed119e1..c4bc60c 100644 --- a/docs/tutorials/smab.ipynb +++ b/docs/tutorials/smab.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "tags": [] }, @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -98,17 +98,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recommended action: ['Action C', 'Action C', 'Action C', 'Action B', 'Action B', 'Action C', 'Action B', 'Action C', 'Action A', 'Action B']\n" - ] - } - ], + "outputs": [], "source": [ "# predict actions\n", "pred_actions, _ = smab.predict(n_samples=1000)\n", @@ -124,19 +116,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Action A: n_successes=285, n_failures=31\n", - "Action B: n_successes=123, n_failures=210\n", - "Action C: n_successes=261, n_failures=90\n" - ] - } - ], + "outputs": [], "source": [ "# simulate rewards from environment\n", "n_successes, n_failures = {}, {}\n", @@ -155,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/tutorials/smab_mo_cc.ipynb b/docs/tutorials/smab_mo_cc.ipynb index 880654c..f92e04d 100644 --- a/docs/tutorials/smab_mo_cc.ipynb +++ 
b/docs/tutorials/smab_mo_cc.ipynb @@ -3,7 +3,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "e2595bf3-9767-4338-9a51-ce706dc306cf", + "id": "0", "metadata": {}, "source": [ "# Stochastic Bernoulli Bandit (Multi-Objective with Cost-Control)" @@ -11,8 +11,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "8f8462e5-f38e-4b04-9002-07ababe3ee0c", + "execution_count": null, + "id": "1", "metadata": {}, "outputs": [], "source": [ @@ -25,21 +25,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "75d6f625", + "execution_count": null, + "id": "2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'%.2f'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# print 2 decimal places in the notebook\n", "%precision %.2f" @@ -47,7 +36,7 @@ }, { "cell_type": "markdown", - "id": "b6b37329-6a3b-4f2a-87a5-e0dcbbb1bb69", + "id": "3", "metadata": {}, "source": [ "## 1. Initialization\n", @@ -57,7 +46,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "2ca215bf-6321-4819-a539-ebf1f378436a", + "id": "4", "metadata": {}, "source": [ "### 1.1 Initialize via class constructor\n", @@ -67,8 +56,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "701111ff-b659-49b7-8cf5-8349536b4cd8", + "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ @@ -83,63 +72,17 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "55112a02-8df2-4895-9414-ddabbfc8ecac", + "execution_count": null, + "id": "6", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulliMOCC(\n",
-       "    actions={\n",
-       "        'a1': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=30.0\n",
-       "        ),\n",
-       "        'a2': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=10.0\n",
-       "        ),\n",
-       "        'a3': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=20.0\n",
-       "        )\n",
-       "    },\n",
-       "    strategy=MultiObjectiveCostControlBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", 
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(mab)" ] }, { "cell_type": "markdown", - "id": "f2ee7bdc-3881-47a5-b7d4-84862f70e643", + "id": "7", "metadata": {}, "source": [ "### 1.2 Initialize via utility function (for cold start)" @@ -148,7 +91,7 @@ { "attachments": {}, "cell_type": "markdown", - "id": "564914fd-73cc-4854-8ec7-548970f794a6", + "id": "8", "metadata": {}, "source": [ "You can initialize the bandit via the utility function `SmabBernoulliMOCC.cold_start()`. This is particulary useful in a cold start setting when there is no prior knowledge on the Beta distruibutions. In this case for all Betas `n_successes` and `n_failures` are set to `1`." @@ -156,8 +99,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "dbfb0ddd-4c16-441f-8c68-16020e425d57", + "execution_count": null, + "id": "9", "metadata": {}, "outputs": [], "source": [ @@ -170,63 +113,17 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "fcc3649c-d08c-46db-a534-f61d97962c99", + "execution_count": null, + "id": "10", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulliMOCC(\n",
-       "    actions={\n",
-       "        'a1': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=30.0\n",
-       "        ),\n",
-       "        'a2': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=10.0\n",
-       "        ),\n",
-       "        'a3': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=20.0\n",
-       "        )\n",
-       "    },\n",
-       "    strategy=MultiObjectiveCostControlBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", 
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(mab)" ] }, { "cell_type": "markdown", - "id": "aa91a5ed-83cc-4016-aa3e-17b8a102bb77", + "id": "11", "metadata": {}, "source": [ "## 2. Function `predict()`" @@ -234,46 +131,18 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "a735c03d-cde4-4147-a50d-4b82dd9c1792", + "execution_count": null, + "id": "12", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method predict in module pybandits.smab:\n", - "\n", - "predict(n_samples: pydantic.types.PositiveInt = 1, forbidden_actions: Optional[Set[pybandits.base.ActionId]] = None) -> Tuple[List[pybandits.base.ActionId], List[Dict[pybandits.base.ActionId, pybandits.base.Probability]]] method of pybandits.smab.SmabBernoulliMOCC instance\n", - " Predict actions.\n", - " \n", - " Parameters\n", - " ----------\n", - " n_samples : int > 0, default=1\n", - " Number of samples to predict.\n", - " forbidden_actions : Optional[Set[ActionId]], default=None\n", - " Set of forbidden actions. If specified, the model will discard the forbidden_actions and it will only\n", - " consider the remaining allowed_actions. 
By default, the model considers all actions as allowed_actions.\n", - " Note that: actions = allowed_actions U forbidden_actions.\n", - " \n", - " Returns\n", - " -------\n", - " actions: List[ActionId] of shape (n_samples,)\n", - " The actions selected by the multi-armed bandit model.\n", - " probs: List[Dict[ActionId, Probability]] of shape (n_samples,)\n", - " The probabilities of getting a positive reward for each action.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "help(mab.predict)" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "f3d9cb8b-7d9b-437b-bbc2-e7a55475a1fb", + "execution_count": null, + "id": "13", "metadata": {}, "outputs": [], "source": [ @@ -283,54 +152,28 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "a9284b11-05ba-4cda-9597-b69e6d7632a3", + "execution_count": null, + "id": "14", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a3', 'a3', 'a2', 'a3', 'a2']" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "actions" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "84cdbed4-9aa5-42e1-84db-1f8f72c52d93", + "execution_count": null, + "id": "15", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'a1': [0.75, 0.55], 'a2': [0.78, 0.29], 'a3': [0.79, 0.83]},\n", - " {'a1': [0.95, 0.28], 'a2': [0.22, 0.23], 'a3': [0.99, 0.95]},\n", - " {'a1': [0.22, 0.64], 'a2': [0.62, 0.50], 'a3': [0.30, 0.12]},\n", - " {'a1': [0.19, 0.79], 'a2': [0.02, 0.70], 'a3': [0.27, 0.72]},\n", - " {'a1': [0.38, 0.03], 'a2': [0.80, 0.55], 'a3': [0.79, 0.04]}]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "probs" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "bfc53fc8-b1bf-42ea-907a-fa5fb7173199", + "execution_count": null, + "id": "16", "metadata": {}, "outputs": [], "source": [ @@ -340,53 +183,27 @@ }, { "cell_type": "code", 
- "execution_count": 12, - "id": "696d58f4-ca5f-41d4-983f-bc7a5351ab28", + "execution_count": null, + "id": "17", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a3', 'a2', 'a2', 'a2', 'a2']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "actions" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "f5826785-a5c6-4c06-9bab-9f05134e783e", + "execution_count": null, + "id": "18", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'a2': [0.12, 0.45], 'a3': [0.38, 0.90]},\n", - " {'a2': [0.10, 0.96], 'a3': [0.58, 0.20]},\n", - " {'a2': [0.92, 0.85], 'a3': [0.31, 0.65]},\n", - " {'a2': [0.60, 0.04], 'a3': [0.45, 0.97]},\n", - " {'a2': [0.87, 0.51], 'a3': [0.74, 0.35]}]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "probs" ] }, { "cell_type": "markdown", - "id": "d89f7199-bec3-407d-92a9-bdf917c13de6", + "id": "19", "metadata": {}, "source": [ "## 3. Function `update()`" @@ -394,42 +211,18 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "140eb2fc-3659-4c13-86d1-ec5a575c79c1", + "execution_count": null, + "id": "20", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method update in module pybandits.smab:\n", - "\n", - "update(actions: List[pybandits.base.ActionId], rewards: List[List[pybandits.base.BinaryReward]]) method of pybandits.smab.SmabBernoulliMOCC instance\n", - " Update the stochastic Bernoulli bandit given the list of selected actions and their corresponding binary\n", - " rewards.\n", - " \n", - " Parameters\n", - " ----------\n", - " actions : List[ActionId] of shape (n_samples,), e.g. 
['a1', 'a2', 'a3', 'a4', 'a5']\n", - " The selected action for each sample.\n", - " rewards : List[Union[BinaryReward, List[BinaryReward]]] of shape (n_samples, n_objectives)\n", - " The binary reward for each sample.\n", - " If strategy is not MultiObjectiveBandit, rewards should be a list, e.g.\n", - " rewards = [1, 0, 1, 1, 1, ...]\n", - " If strategy is MultiObjectiveBandit, rewards should be a list of list, e.g. (with n_objectives=2):\n", - " rewards = [[1, 1], [1, 0], [1, 1], [1, 0], [1, 1], ...]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "help(mab.update)" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "2526ed6d-82d4-4485-bc6e-b5cb53dd78a5", + "execution_count": null, + "id": "21", "metadata": {}, "outputs": [], "source": [ @@ -439,56 +232,10 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "3bd0ab45-94e8-415b-adea-a089c54f6274", + "execution_count": null, + "id": "22", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulliMOCC(\n",
-       "    actions={\n",
-       "        'a1': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1, n_failures=1), Beta(n_successes=1, n_failures=1)],\n",
-       "            cost=30.0\n",
-       "        ),\n",
-       "        'a2': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=7, n_failures=3), Beta(n_successes=7, n_failures=3)],\n",
-       "            cost=10.0\n",
-       "        ),\n",
-       "        'a3': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=3, n_failures=1), Beta(n_successes=3, n_failures=1)],\n",
-       "            cost=20.0\n",
-       "        )\n",
-       "    },\n",
-       "    strategy=MultiObjectiveCostControlBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m7\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m7\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m3\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m3\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", 
- " \u001b[33mstrategy\u001b[0m=\u001b[1;35mMultiObjectiveCostControlBandit\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# update\n", "mab.update(actions=actions, rewards=rewards)\n", @@ -497,7 +244,7 @@ }, { "cell_type": "markdown", - "id": "9823d84c-862b-4bb6-ab36-024f34460595", + "id": "23", "metadata": {}, "source": [ "## 4. Example of usage\n", @@ -507,8 +254,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "a785463d-d710-4844-80bf-42c09b0e0b45", + "execution_count": null, + "id": "24", "metadata": {}, "outputs": [], "source": [ @@ -531,56 +278,10 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "034add3d-e6f3-471c-b8b9-30c286faf2cc", + "execution_count": null, + "id": "25", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
SmabBernoulliMOCC(\n",
-       "    actions={\n",
-       "        'a1': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=450, n_failures=488), Beta(n_successes=450, n_failures=488)],\n",
-       "            cost=30.0\n",
-       "        ),\n",
-       "        'a2': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=8541, n_failures=8325), Beta(n_successes=8541, n_failures=8325)],\n",
-       "            cost=10.0\n",
-       "        ),\n",
-       "        'a3': BetaMOCC(\n",
-       "            counters=[Beta(n_successes=1110, n_failures=1102), Beta(n_successes=1110, n_failures=1102)],\n",
-       "            cost=20.0\n",
-       "        )\n",
-       "    },\n",
-       "    strategy=MultiObjectiveCostControlBandit()\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mSmabBernoulliMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mactions\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'a1'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m450\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m488\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m450\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m488\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m30\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a2'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m8541\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m8325\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m8541\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m8325\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m10\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[32m'a3'\u001b[0m: \u001b[1;35mBetaMOCC\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mcounters\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1110\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1102\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mBeta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mn_successes\u001b[0m=\u001b[1;36m1110\u001b[0m, \u001b[33mn_failures\u001b[0m=\u001b[1;36m1102\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mcost\u001b[0m=\u001b[1;36m20\u001b[0m\u001b[1;36m.0\u001b[0m\n", - " 
class BaseOfflinePolicyEstimator(PyBanditsBaseModel, ABC):
    """Base class for all OPE estimators.

    Defines the interface shared by every OPE estimator (``estimate_sample_rewards``)
    and implements input validation plus the bootstrap confidence interval
    around the estimated policy value.

    Parameters
    ----------
    alpha : Float01, default=0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, default=10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : int, default=None
        Random seed for bootstrap sampling.
    """

    alpha: Float01 = 0.05
    n_bootstrap_samples: int = 10000
    random_state: Optional[int] = None
    _name: str = PrivateAttr()

    @classmethod
    def _check_array(
        cls,
        name: str,
        data: Dict[str, np.ndarray],
        ndim: PositiveInt,
        dtype: type,
        n_samples: PositiveInt,
        n_actions: Optional[PositiveInt] = None,
    ):
        """
        Validate the array stored under ``name`` in ``data``; do nothing if it is absent.

        Parameters
        ----------
        name : str
            Key of the array inside ``data``.
        data : Dict[str, np.ndarray]
            Keyword arguments passed to the estimator.
        ndim : PositiveInt
            Required number of dimensions.
        dtype : type
            Required kind of dtype (``int`` or ``float``).
        n_samples : PositiveInt
            Required length of the first axis.
        n_actions : Optional[PositiveInt]
            Required length of the second axis; only checked when ``ndim > 1``.

        Raises
        ------
        ValueError
            If the array violates any of the requirements above.
        """
        if name not in data:
            return
        array = data[name]
        if array.ndim != ndim:
            raise ValueError(f"{name} must be a {ndim}D array.")
        if array.shape[0] != n_samples:
            raise ValueError(f"action and {name} must have the same length.")
        # Compare dtype *kinds* so that e.g. int32 / float32 arrays are accepted as int / float.
        if array.dtype.kind != np.dtype(dtype).kind:
            raise ValueError(f"{name} must be a {dtype} array")
        if ndim > 1 and array.shape[1] != n_actions:
            raise ValueError(f"{name} must have the same number of actions as the action array.")

    @classmethod
    def _check_sum(cls, name: str, data: Dict[str, np.ndarray]):
        """
        Reject arrays whose sum over the last axis contains a zero; do nothing if ``name`` is absent.

        For a 2D array this checks every row; for a 1D array it checks the total over all samples.

        Raises
        ------
        ValueError
            If any sum over the last axis equals zero.
        """
        if name not in data:
            return
        if not data[name].sum(axis=-1).all():
            raise ValueError(f"{name} must have at least one non-zero element on each column.")

    @classmethod
    def _check_inputs(cls, action: np.ndarray, **kwargs):
        """
        Check the inputs for the estimator.

        Parameters
        ----------
        action : np.ndarray
            Array of actions taken. Must be a 1D integer array.
        **kwargs
            Optional arrays, validated only when present:
            ``reward`` (1D int), ``propensity_score`` and ``expected_importance_weight`` (1D float),
            ``estimated_policy`` and ``expected_reward`` (2D float, one column per distinct action).

        Raises
        ------
        ValueError
            If any of the provided arrays has an unexpected shape, dtype or a zero sum.
        """
        if action.ndim != 1:
            raise ValueError("action must be a 1D array.")
        # Accept every signed integer width, not only the platform default.
        if action.dtype.kind != np.dtype(int).kind:
            raise ValueError("action must be an integer array.")
        n_samples = action.shape[0]
        # The number of actions is inferred from the distinct logged actions.
        n_actions = np.unique(action).shape[0]

        one_dimensional = (("reward", int), ("propensity_score", float), ("expected_importance_weight", float))
        for name, dtype in one_dimensional:
            cls._check_array(name, kwargs, 1, dtype, n_samples)

        for name in ("estimated_policy", "expected_reward"):
            cls._check_array(name, kwargs, 2, float, n_samples, n_actions)

        for name in ("propensity_score", "estimated_policy", "expected_importance_weight"):
            cls._check_sum(name, kwargs)

    @validate_call(config=dict(arbitrary_types_allowed=True))
    def estimate_policy_value_with_confidence_interval(self, **kwargs) -> Tuple[float, float, float, float]:
        """
        Estimate the policy value with a confidence interval.

        Parameters
        ----------
        action : np.ndarray
            Array of actions taken.
        **kwargs
            Remaining arrays required by the concrete estimator's ``estimate_sample_rewards``.

        Returns
        -------
        Tuple[float, float, float, float]
            Estimated policy value, lower bound and upper bound of the bootstrap confidence interval,
            and the bootstrap standard error.
        """
        self._check_inputs(**kwargs)
        sample_reward = self.estimate_sample_rewards(**kwargs)
        estimated_policy_value = sample_reward.mean()
        bootstrap_result = bootstrap(
            data=(sample_reward,),
            statistic=np.mean,
            confidence_level=1 - self.alpha,
            n_resamples=self.n_bootstrap_samples,
            random_state=self.random_state,
        )
        low, high = bootstrap_result.confidence_interval
        std = bootstrap_result.standard_error
        return estimated_policy_value, low, high, std

    @abstractmethod
    def estimate_sample_rewards(self, **kwargs) -> np.ndarray:
        """
        Estimate sample rewards.

        Returns
        -------
        np.ndarray
            Estimated sample rewards.
        """
        pass

    @property
    def name(self) -> str:
        """
        Get the name of the estimator.

        Returns
        -------
        str
            Name of the estimator.
        """
        return self._name
class GeneralizedInverseProbabilityWeighting(BaseOfflinePolicyEstimator, ABC):
    """
    Abstract generalization of the Inverse Probability Weighting (IPW) estimator.

    Subclasses provide the importance weights through ``_get_importance_weights``;
    this class multiplies them (optionally shrunk) with the observed rewards.

    Reference: Learning from Logged Implicit Exploration Data (Strehl, Langford, Li, and Kakade, 2010)
    https://arxiv.org/pdf/1003.0120

    Parameters
    ----------
    alpha : Float01, default=0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, default=10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : int, default=None
        Random seed for bootstrap sampling.
    """

    @abstractmethod
    def _get_importance_weights(self, **kwargs) -> np.ndarray:
        """
        Get the importance weights.

        Returns
        -------
        np.ndarray
            Array of importance weights.
        """
        pass

    def estimate_sample_rewards(
        self, reward: np.ndarray, shrinkage_method: Optional[Callable] = None, **kwargs
    ) -> np.ndarray:
        """
        Estimate the sample rewards.

        Parameters
        ----------
        reward : np.ndarray
            Array of rewards corresponding to each action.
        shrinkage_method : Optional[Callable], default=None
            Shrinkage method for the importance weights. When None, the weights are used as is.
        **kwargs
            Forwarded unchanged to ``_get_importance_weights``.

        Returns
        -------
        sample_reward : np.ndarray
            Estimated sample rewards.
        """
        importance_weight = self._get_importance_weights(**kwargs)
        if shrinkage_method is not None:
            importance_weight = shrinkage_method(importance_weight)
        return reward * importance_weight
class SelfNormalizedInverseProbabilityWeighting(InverseProbabilityWeighting):
    """
    Self-Normalized Inverse Propensity Score (SNIPS) estimator.

    Same as IPW, except that the (optionally shrunk) importance weights are divided by their mean
    before being multiplied with the rewards.

    Reference: The Self-normalized Estimator for Counterfactual Learning (Swaminathan and Joachims, 2015)
    https://papers.nips.cc/paper_files/paper/2015/file/39027dfad5138c9ca0c474d71db915c3-Paper.pdf

    Parameters
    ----------
    alpha : Float01, default=0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, default=10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : int, default=None
        Random seed for bootstrap sampling.
    """

    _name = "snips"

    def estimate_sample_rewards(
        self,
        action: np.ndarray,
        reward: np.ndarray,
        propensity_score: np.ndarray,
        estimated_policy: np.ndarray,
        shrinkage_method: Optional[Callable] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Estimate the sample rewards using self-normalized importance weights.

        Parameters
        ----------
        action : np.ndarray
            Array of actions taken.
        reward : np.ndarray
            Array of rewards corresponding to each action.
        propensity_score : np.ndarray
            Array of propensity scores.
        estimated_policy : np.ndarray
            Array of action distributions.
        shrinkage_method : Optional[Callable]
            Shrinkage method applied to the importance weights before normalization.

        Returns
        -------
        sample_reward : np.ndarray
            Estimated sample rewards.
        """

        def _shrink_then_normalize(weights: np.ndarray) -> np.ndarray:
            # Apply the caller's shrinkage first, then rescale so the weights average to one.
            shrunk = weights if shrinkage_method is None else shrinkage_method(weights)
            return shrunk / shrunk.mean()

        return super().estimate_sample_rewards(
            action=action,
            reward=reward,
            propensity_score=propensity_score,
            estimated_policy=estimated_policy,
            shrinkage_method=_shrink_then_normalize,
        )
class GeneralizedDoublyRobust(BaseOfflinePolicyEstimator, ABC):
    """
    Abstract generalization of the Doubly Robust (DR) estimator.

    Combines a direct-method term with an importance-weighted correction computed by
    ``_alternative_method_cls``. Subclasses select that class and may override
    ``_shrink_weights`` to shrink the importance weights used in the correction.

    Reference: Doubly Robust Policy Evaluation and Optimization (Dudík, Erhan, Langford, and Li, 2014)
    https://arxiv.org/pdf/1503.02834

    More Robust Doubly Robust Off-policy Evaluation (Farajtabar, Chow, and Ghavamzadeh, 2018)
    https://arxiv.org/pdf/1802.03493


    Parameters
    ----------
    alpha : Float01, default=0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, default=10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : int, default=None
        Random seed for bootstrap sampling.
    """

    _alternative_method_cls: Type[InverseProbabilityWeighting]
    _dm: DirectMethod = PrivateAttr()
    _other_method: BaseOfflinePolicyEstimator = PrivateAttr()

    # The hook that runs after field validation differs between pydantic major versions.
    if pydantic_version == PYDANTIC_VERSION_1:

        def __init__(self, **data):
            super().__init__(**data)
            self._init_sub_estimators()

    elif pydantic_version == PYDANTIC_VERSION_2:

        def model_post_init(self, __context: Any) -> None:
            self._init_sub_estimators()

    else:
        raise ValueError(f"Unsupported pydantic version: {pydantic_version}")

    def _init_sub_estimators(self) -> None:
        """Build the direct-method and importance-weighting estimators with this estimator's settings."""
        shared_settings = dict(
            alpha=self.alpha, n_bootstrap_samples=self.n_bootstrap_samples, random_state=self.random_state
        )
        self._dm = DirectMethod(**shared_settings)
        self._other_method = self._alternative_method_cls(**shared_settings)

    def estimate_sample_rewards(
        self,
        action: np.ndarray,
        reward: np.ndarray,
        propensity_score: np.ndarray,
        estimated_policy: np.ndarray,
        expected_reward: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """
        Estimate the sample rewards.

        Each sample's estimate is the direct-method value under the evaluated policy plus the
        importance-weighted residual between the observed reward and the expected reward of the
        action that was actually logged.

        Parameters
        ----------
        action : np.ndarray
            Array of actions taken.
        reward : np.ndarray
            Array of rewards corresponding to each action.
        propensity_score : np.ndarray
            Array of propensity scores.
        estimated_policy : np.ndarray
            Array of action distributions.
        expected_reward : np.ndarray
            Array of expected rewards.

        Returns
        -------
        sample_reward : np.ndarray
            Estimated rewards.
        """
        dm_sample_reward = self._dm.estimate_sample_rewards(
            action=action, estimated_policy=estimated_policy, expected_reward=expected_reward
        )
        # The residual must use the model's prediction for the *logged* action; using the
        # policy-averaged prediction would make the correction non-zero in expectation even when
        # the reward model is exact, losing the doubly robust property.
        logged_expected_reward = expected_reward[np.arange(action.shape[0]), action]
        other_sample_reward = self._other_method.estimate_sample_rewards(
            action=action,
            reward=reward - logged_expected_reward,
            propensity_score=propensity_score,
            estimated_policy=estimated_policy,
            shrinkage_method=self._shrink_weights,
        )
        return dm_sample_reward + other_sample_reward

    def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
        """Return the importance weights unchanged; subclasses override this to apply shrinkage."""
        return importance_weight
class SwitchDoublyRobust(DoublyRobust):
    """
    Switch Doubly Robust (Switch-DR) estimator.

    This estimator uses a switching rule based on the importance weight: samples whose weight does
    not exceed ``switch_threshold`` keep their weight, while larger weights are set to zero.

    Reference: Optimal and Adaptive Off-policy Evaluation in Contextual Bandits (Wang, Agarwal, and Dudik, 2017)
    https://arxiv.org/pdf/1507.02646

    Parameters
    ----------
    alpha : Float01, default=0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, default=10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : Optional[int], default=None
        Random seed for bootstrap sampling.
    switch_threshold : float, default=inf
        Largest importance weight that is still used; weights above it are zeroed.
        With the default of infinity no weight is zeroed.
    """

    _name = "switch-dr"
    switch_threshold: float = float("inf")

    def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
        """
        Zero out the importance weights that exceed ``switch_threshold``.

        Parameters
        ----------
        importance_weight : np.ndarray
            Array of importance weights.

        Returns
        -------
        np.ndarray
            Importance weights with every entry above the threshold replaced by zero.
        """
        # np.where instead of ``indicator * weight`` so that infinite weights become 0 rather than NaN.
        return np.where(importance_weight <= self.switch_threshold, importance_weight, 0.0)
class SubGaussianInverseProbabilityWeighting(InverseProbabilityWeighting):
    """
    SubGaussian Inverse Probability Weighing estimator.

    Importance weights ``w`` are replaced by ``w / (1 - shrinkage_factor + shrinkage_factor * w)``
    before being multiplied with the rewards.

    Reference: Subgaussian and Differentiable Importance Sampling for Off-Policy Evaluation and Learning (Metelli, Russo, and Restelli, 2021)
    https://proceedings.neurips.cc/paper_files/paper/2021/file/4476b929e30dd0c4e8bdbcc82c6ba23a-Paper.pdf

    Parameters
    ----------
    alpha : Float01, defaults to 0.05
        Significance level for confidence interval estimation.
    n_bootstrap_samples : int, defaults to 10000
        Number of bootstrap samples for confidence interval estimation.
    random_state : int, defaults to None
        Random seed for bootstrap sampling.
    shrinkage_factor : Float01, defaults to 0.0
        Shrinkage factor for the importance weights. With 0 the weights are left unchanged.

    """

    _name = "sg-ipw"
    shrinkage_factor: Float01 = 0.0

    def estimate_sample_rewards(
        self,
        action: np.ndarray,
        reward: np.ndarray,
        propensity_score: np.ndarray,
        estimated_policy: np.ndarray,
        shrinkage_method: Optional[Callable] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Estimate the sample rewards using sub-Gaussian shrunk importance weights.

        Parameters
        ----------
        action : np.ndarray
            Array of actions taken.
        reward : np.ndarray
            Array of rewards corresponding to each action.
        propensity_score : np.ndarray
            Array of propensity scores.
        estimated_policy : np.ndarray
            Array of action distributions.
        shrinkage_method : Optional[Callable]
            Additional shrinkage applied to the importance weights before the sub-Gaussian one.

        Returns
        -------
        sample_reward : np.ndarray
            Estimated sample rewards.
        """

        def sub_gaussian_shrink_weights(importance_weight: np.ndarray) -> np.ndarray:
            # Honor a caller-supplied shrinkage first, then apply this estimator's own correction.
            if shrinkage_method is not None:
                importance_weight = shrinkage_method(importance_weight)
            return self._shrink_weights(importance_weight)

        return super().estimate_sample_rewards(
            action=action,
            reward=reward,
            propensity_score=propensity_score,
            estimated_policy=estimated_policy,
            shrinkage_method=sub_gaussian_shrink_weights,
        )

    def _shrink_weights(self, importance_weight: np.ndarray) -> np.ndarray:
        """
        Apply the sub-Gaussian correction to the importance weights.

        Parameters
        ----------
        importance_weight : np.ndarray
            Array of importance weights.

        Returns
        -------
        np.ndarray
            Array of shrunk importance weights.
        """
        return importance_weight / (1 - self.shrinkage_factor + self.shrinkage_factor * importance_weight)
+ + ---------- + Arjun Sondhi, David Arbour, and Drew Dimmery + "Balanced Off-Policy Evaluation in General Action Spaces.", 2020. + """ + + _name = "b-ipw" + + def _get_importance_weights(self, expected_importance_weight: np.ndarray, **kwargs) -> np.ndarray: + """ + Get the importance weights. + + Parameters + ---------- + expected_importance_weight : np.ndarray + Array of expected importance weights. + + Returns + ------- + expected_importance_weight : np.ndarray + Array of expected importance weights. + """ + return expected_importance_weight + + def estimate_sample_rewards( + self, + reward: np.ndarray, + expected_importance_weight: np.ndarray, + **kwargs, + ) -> np.ndarray: + """ + Estimate the sample rewards. + + Parameters + ---------- + reward : np.ndarray + Array of rewards corresponding to each action. + expected_importance_weight : np.ndarray + Array of expected importance weights. + + Returns + ------- + sample_reward : np.ndarray + Estimated rewards. + """ + return super().estimate_sample_rewards( + reward=reward, expected_importance_weight=expected_importance_weight, shrinkage_method=None + ) diff --git a/pybandits/offline_policy_evaluator.py b/pybandits/offline_policy_evaluator.py new file mode 100644 index 0000000..97d7f4b --- /dev/null +++ b/pybandits/offline_policy_evaluator.py @@ -0,0 +1,1127 @@ +import os +from copy import deepcopy +from functools import partial +from itertools import product +from math import floor +from multiprocessing import Pool, cpu_count +from sys import version_info +from typing import Any, Dict, List, Literal, Optional, Union + +import numpy as np +import optuna +import pandas as pd +from bokeh.models import ColumnDataSource, TabPanel +from bokeh.plotting import figure +from loguru import logger +from optuna import Trial +from sklearn.base import ClassifierMixin, TransformerMixin +from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from 
class _FunctionEstimator(PyBanditsBaseModel, ClassifierMixin, arbitrary_types_allowed=True):
    """
    Classifier wrapper with optional Optuna hyperparameter tuning.

    The wrapped scikit-learn classifier is trained on the concatenation of the
    context matrix and the one-hot encoded action. It is used to estimate the
    propensity score, the expected reward and the importance weights.

    Parameters
    ----------
    estimator_type : Literal["logreg", "gbm", "rf", "mlp"]
        The model type to optimize.
    fast_fit : bool
        If True, skip the Optuna search and fit the model with its default parameters.
    action_one_hot_encoder : OneHotEncoder
        Fitted one hot encoder for action encoding.
    n_trials : int
        Number of trials for the Optuna optimization process.
    verbose : bool
        Whether to log detailed information during the optimization process.
    study_name : Optional[str]
        Name of the study to be created by Optuna.
    multi_action_prediction : bool
        Whether to predict for all actions or only for real action.
    """

    estimator_type: Literal["logreg", "gbm", "rf", "mlp"]
    fast_fit: bool
    action_one_hot_encoder: OneHotEncoder = OneHotEncoder(sparse=False)
    n_trials: int
    verbose: bool
    study_name: Optional[str] = None
    multi_action_prediction: bool
    _model: Union[LogisticRegression, GradientBoostingClassifier, RandomForestClassifier, MLPClassifier] = PrivateAttr()
    _model_mapping = {
        "mlp": MLPClassifier,
        "rf": RandomForestClassifier,
        "logreg": LogisticRegression,
        "gbm": GradientBoostingClassifier,
    }

    def _pre_process(self, batch: Dict[str, Any]) -> np.ndarray:
        """
        Build the feature matrix used for training and prediction.

        Parameters
        ----------
        batch : Dict[str, Any]
            Dataset dictionary; the "context" and "action_ids" entries are read.

        Returns
        -------
        np.ndarray
            Context columns followed by the one-hot encoded action columns.
        """
        context = batch["context"]
        action = batch["action_ids"]
        return np.concatenate([context, self.action_one_hot_encoder.transform(action.reshape((-1, 1)))], axis=1)

    def _sample_parameter_space(self, trial: Trial) -> Dict[str, Union[str, int, float]]:
        """
        Sample one hyperparameter configuration for ``estimator_type``.

        Parameters
        ----------
        trial : optuna.Trial
            A single trial in the Optuna optimization process.

        Returns
        -------
        dict
            Keyword arguments for the constructor of the selected model class.
        """
        if self.estimator_type == "mlp":
            return {
                "hidden_layer_sizes": 2 ** trial.suggest_int("hidden_layer_sizes", 2, 6),
                "activation": trial.suggest_categorical("activation", ["relu", "logistic", "tanh"]),
                "solver": trial.suggest_categorical("solver", ["lbfgs", "sgd", "adam"]),
                # Each hyperparameter needs its own Optuna name: re-using "learning_rate_init"
                # here would make the trial hand back the same sampled value for both.
                "alpha": np.sqrt(10) ** -trial.suggest_int("alpha", 0, 10),
                "max_iter": 1000,
                "learning_rate_init": np.sqrt(10) ** -trial.suggest_int("learning_rate_init", 0, 6),
            }
        elif self.estimator_type == "rf":
            return {
                "max_depth": trial.suggest_int("max_depth", 2, 5),
                "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
                "max_features": trial.suggest_int("max_features", 1, 3),
                "n_estimators": trial.suggest_int("n_estimators", 10, 50),
                "n_jobs": -1,
            }
        elif self.estimator_type == "logreg":
            return {
                "tol": trial.suggest_float("tol", 0.00001, 0.0001),
                "C": trial.suggest_float("C", 0.05, 3),
                "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]),
                "max_iter": 1000,
                "n_jobs": -1,
            }
        elif self.estimator_type == "gbm":
            return {
                "n_estimators": trial.suggest_int("n_estimators", 10, 100),
                "learning_rate": np.sqrt(10) ** -trial.suggest_int("learning_rate_init", 0, 6),
                "max_depth": trial.suggest_int("max_depth", 2, 10),
            }

    def _objective(self, trial: Trial, feature_set: np.ndarray, label: np.ndarray) -> float:
        """
        Objective function for Optuna optimization.

        A model built from the sampled parameters is scored with
        ``cross_val_score`` and the mean score is returned. The sampled
        parameters are stored on the trial under the "model_params" user
        attribute so they can be recovered from the best trial.

        Parameters
        ----------
        trial : Trial
            A single trial in the Optuna optimization process.
        feature_set : np.ndarray
            The training dataset, containing context and encoded actions.
        label : np.ndarray
            The labels for the dataset.

        Returns
        -------
        score : float
            Mean cross-validation score, to be maximized by Optuna.
        """
        params = self._sample_parameter_space(trial)
        model = self._model_mapping[self.estimator_type](**params)
        score = cross_val_score(model, feature_set, label).mean()
        trial.set_user_attr("model_params", params)

        return score

    def _optimize(self, feature_set: np.ndarray, label: np.ndarray, study: optuna.Study) -> dict:
        """
        Optimize the model's hyperparameters using Optuna.

        Parameters
        ----------
        feature_set : np.ndarray
            The pre-processed training features.
        label : np.ndarray
            The labels for the dataset.
        study : optuna.Study
            The Optuna study object to store optimization results.

        Returns
        -------
        best_params : dict
            The model parameters stored on the best trial of the study.
        """
        study.optimize(lambda trial: self._objective(trial, feature_set, label), n_trials=self.n_trials)

        best_params = study.best_trial.user_attrs["model_params"]
        if self.verbose:
            logger.info(f"Optuna best model with optimized parameters for {self.estimator_type}:\n {best_params}")

        return best_params

    @validate_call(config=dict(arbitrary_types_allowed=True))
    def fit(self, X: dict, y: np.ndarray) -> Self:
        """
        Fit the model using the given dataset X and labels y.

        Parameters
        ----------
        X : dict
            The dataset containing 'context' and 'action_ids' keys.
        y : np.ndarray
            The labels for the dataset.

        Returns
        -------
        self : _FunctionEstimator
            The fitted model.
        """
        feature_set = self._pre_process(X)
        if self.fast_fit:
            # Default constructor arguments of the selected model class.
            model_parameters = {}
        else:
            pruner = optuna.pruners.MedianPruner()
            sampler = optuna.samplers.TPESampler(multivariate=True, group=True)
            study = optuna.create_study(
                direction="maximize", study_name=self.study_name, pruner=pruner, sampler=sampler
            )
            model_parameters = self._optimize(feature_set, y, study)

        model = self._model_mapping[self.estimator_type](**model_parameters)
        model.fit(feature_set, y)
        self._model = model
        return self

    @validate_call
    def predict(self, X: dict) -> np.ndarray:
        """
        Predict the probability of the second class (column 1 of ``predict_proba``).

        Parameters
        ----------
        X : dict
            The dataset containing 'context' and 'action_ids' keys. When
            ``multi_action_prediction`` is True, 'n_rounds' and 'unique_actions'
            are read as well.

        Returns
        -------
        prediction : np.ndarray
            One value per round, or one column per entry of 'unique_actions'
            when ``multi_action_prediction`` is True.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        # The private attribute has no default, so it does not exist before ``fit``;
        # ``getattr`` with a default lets the explicit error below be raised. An identity
        # check is used rather than truthiness of the estimator object.
        model = getattr(self, "_model", None)
        if model is None:
            raise AttributeError("Model has not been fitted yet.")

        if self.multi_action_prediction:
            specific_action_X = X.copy()
            prediction = np.empty((X["n_rounds"], len(X["unique_actions"])))
            for action_index, action in enumerate(X["unique_actions"]):
                # Score every round as if ``action`` had been taken.
                specific_action_X["action_ids"] = np.array([action] * X["n_rounds"])
                specific_action_feature_set = self._pre_process(specific_action_X)
                specific_action_prediction = model.predict_proba(specific_action_feature_set)[:, 1]
                prediction[:, action_index] = specific_action_prediction
        else:
            feature_set = self._pre_process(X)
            prediction = model.predict_proba(feature_set)[:, 1]
        return prediction
propensity_score_model_type: Literal["logreg", "gbm", "rf", "mlp", "batch_empirical", "empirical", "propensity_score"] + Method used to compute/estimate propensity score pi_b (propensity_score, logging / behavioral policy). + expected_reward_model_type: Literal["logreg", "gbm", "rf", "mlp"] + Method used to estimate expected reward for each action a in the training set. + n_trials : Optional[int] + Number of trials for the Optuna optimization process. + fast_fit : bool + Whether to use the default parameter set for the function estimator models. + ope_estimators: Optional[List[BaseOfflinePolicyEstimator]] + List of OPE estimators used to evaluate the policy value of evaluation policy. + All available estimators are if not specified. + batch_feature: str + Column name for batch as available in logged_data + action_feature: str + Column name for action as available in logged_data + reward_feature: Union[str, List[str]] + Column name for reward as available in logged_data + contextual_features: Optional[List[str]] + Column names for contextual features as available in logged_data + cost_feature: Optional[str] + Column name for cost as available in logged_data; used for bandit with cost control + group_feature: Optional[str] + Column name for group definition feature as available in logged_data; available from simulated data + to define samples with similar contextual profile + true_reward_feature: Optional[Union[str, List[str]]] + Column names for reward proba distribution features as available in simulated logged_data. Used to compute ground truth + propensity_score_feature : Optional[str] + Column name for propensity score as available in logged_data; used for evaluation of the policy value + verbose : bool + Whether to log detailed information during the optimization process. 
+ """ + + logged_data: pd.DataFrame + split_prop: Float01 + propensity_score_model_type: Literal[ + "logreg", "gbm", "rf", "mlp", "batch_empirical", "empirical", "propensity_score" + ] + expected_reward_model_type: Literal["logreg", "gbm", "rf", "mlp"] + importance_weights_model_type: Literal["logreg", "gbm", "rf", "mlp"] + scaler: Optional[Union[TransformerMixin, Dict[str, TransformerMixin]]] = None + n_trials: Optional[int] = 100 + fast_fit: bool = False + ope_estimators: Optional[List[BaseOfflinePolicyEstimator]] + batch_feature: str + action_feature: str + reward_feature: Union[str, List[str]] + contextual_features: Optional[List[str]] = None + cost_feature: Optional[str] = None + group_feature: Optional[str] = None + true_reward_feature: Optional[Union[str, List[str]]] = None + propensity_score_feature: Optional[str] = None + verbose: bool = False + _train_data: Optional[Dict[str, Any]] = PrivateAttr() + _test_data: Optional[Dict[str, Any]] = PrivateAttr() + _estimated_expected_reward: Optional[Dict[str, np.ndarray]] = PrivateAttr(default=None) + _action_one_hot_encoder = OneHotEncoder(sparse=False) + _propensity_score_epsilon = 1e-08 + + @field_validator("split_prop", mode="before") + @classmethod + def check_split_prop(cls, value): + if value == 0 or value == 1: + raise ValueError("split_prop should be strictly between 0 and 1") + return value + + @field_validator("ope_estimators", mode="before") + @classmethod + def populate_ope_metrics(cls, value): + return ( + value if value is not None else [class_() for class_ in get_non_abstract_classes(offline_policy_estimator)] + ) + + @model_validator(mode="before") + @classmethod + def check_batch_feature(cls, values): + if values["batch_feature"] not in values["logged_data"]: + raise AttributeError("Batch feature missing from logged data.") + if not ( + pd.api.types.is_datetime64_ns_dtype(values["logged_data"][values["batch_feature"]]) + or 
@model_validator(mode="before")
@classmethod
def check_reward_features(cls, values):
    """
    Validate and normalize the reward column names.

    ``reward_feature`` and, when provided, ``true_reward_feature`` are each
    converted to a list and checked against the columns of ``logged_data``.
    A ``true_reward_feature`` that is absent or explicitly ``None`` is left
    untouched.

    Parameters
    ----------
    values : dict
        Raw constructor arguments.

    Returns
    -------
    values : dict
        The same mapping, with the reward entries stored as lists.

    Raises
    ------
    AttributeError
        If a reward or true reward column is missing from the logged data.
    ValueError
        If the reward and true reward lists differ in length.
    """
    logged_data = values["logged_data"]

    reward_feature = values["reward_feature"]
    if not isinstance(reward_feature, list):
        reward_feature = [reward_feature]
    if not all(reward in logged_data for reward in reward_feature):
        raise AttributeError("Reward feature missing from logged data.")
    values["reward_feature"] = reward_feature

    # ``None`` means "no ground truth available"; iterating it would raise a TypeError.
    true_reward_feature = values.get("true_reward_feature")
    if true_reward_feature is not None:
        if not isinstance(true_reward_feature, list):
            true_reward_feature = [true_reward_feature]
        if not all(true_reward in logged_data for true_reward in true_reward_feature):
            raise AttributeError("True reward feature missing from logged data.")
        if len(reward_feature) != len(true_reward_feature):
            raise ValueError("Reward and true reward features should have the same length")
        values["true_reward_feature"] = true_reward_feature

    return values
@model_validator(mode="before")
@classmethod
def check_model_optimization(cls, values):
    """
    Ensure the tuning settings exist whenever a fitted model is needed.

    ``n_trials`` and ``fast_fit`` are passed to the function estimators, so
    they must both be defined when the propensity score is produced by a
    model ("logreg", "gbm", "rf", "mlp") or when any OPE estimator takes an
    ``expected_reward`` argument.

    Parameters
    ----------
    values : dict
        Raw constructor arguments.

    Returns
    -------
    values : dict
        The unchanged input mapping.

    Raises
    ------
    ValueError
        If ``n_trials`` or ``fast_fit`` is undefined while a model has to be fitted.
    """
    n_trials_value = cls._get_value_with_default("n_trials", values)
    fast_fit_value = cls._get_value_with_default("fast_fit", values)
    if n_trials_value is not None and fast_fit_value is not None:
        return values

    # Only the model-based propensity estimators use n_trials / fast_fit.
    if values["propensity_score_model_type"] in ["logreg", "gbm", "rf", "mlp"]:
        raise ValueError("The requested propensity score model requires n_trials and fast_fit to be well defined")

    # This validator sees the raw input: a missing/None list stands for
    # "all available estimators", built the same way as in populate_ope_metrics.
    ope_estimators = values.get("ope_estimators")
    if ope_estimators is None:
        ope_estimators = [class_() for class_ in get_non_abstract_classes(offline_policy_estimator)]
    if cls._check_argument_required_by_estimators("expected_reward", ope_estimators):
        raise ValueError(
            "The requested offline policy evaluation metrics model require estimation of the expected reward. "
            "Thus, n_trials and fast_fit need to be well defined."
        )
    return values
def _scale_context(self, extracted_batch: pd.DataFrame, fit: bool) -> np.ndarray:
    """Return the context matrix of a batch, scaled with ``self.scaler`` when one is set."""
    if self.contextual_features is None:
        # zero-columns 2d array to allow concatenation later
        return np.zeros((len(extracted_batch), 0))
    if self.scaler is None:
        return np.array(extracted_batch[self.contextual_features])

    # Scalers are fitted on the training batch and only applied to the test batch.
    method_name = "fit_transform" if fit else "transform"
    if isinstance(self.scaler, dict):
        # One scaler per feature. The transformed columns are arrays, which
        # pd.concat rejects, so they are stacked with NumPy.
        return np.concatenate(
            [
                getattr(self.scaler[feature], method_name)(np.array(extracted_batch[[feature]]))
                for feature in self.contextual_features
            ],
            axis=1,
        )
    return getattr(self.scaler, method_name)(np.array(extracted_batch[self.contextual_features]))

def _extract_batches(self):
    """
    Create training and test sets in dictionary form.

    Rows whose batch value is lower than or equal to the split batch form
    the training set; the remaining rows form the test set. The results are
    stored in ``self._train_data`` and ``self._test_data``. The action
    encoders are fitted on the actions of the whole logged data set.
    """
    logged_data = self.logged_data.sort_values(by=self.batch_feature)
    unique_batch = logged_data[self.batch_feature].unique()
    split_batch = unique_batch[int(floor(len(unique_batch) * self.split_prop))]

    # add list of actions in dict in order to avoid test set with n_actions
    # lower than nb of total actions
    unique_actions = sorted(self.logged_data[self.action_feature].unique().tolist())
    self._action_one_hot_encoder.fit(np.array(unique_actions).reshape((-1, 1)))
    action_label_encoder = LabelEncoder()
    action_label_encoder.fit(unique_actions)

    # extract data for policy information
    policy_information_cols = [
        self.batch_feature,
        self.action_feature,
    ] + self.reward_feature
    if self.group_feature:
        policy_information_cols.append(self.group_feature)

    for batch_idx in tqdm(range(2)):
        is_train = batch_idx == 0

        # extract samples batch
        if is_train:
            extracted_batch = self.logged_data[self.logged_data[self.batch_feature] <= split_batch]
        else:
            extracted_batch = self.logged_data[self.logged_data[self.batch_feature] > split_batch]
        extracted_batch = extracted_batch.reset_index(drop=True)

        # dict data set information for OPE
        action_ids = extracted_batch[self.action_feature].values
        reward = extracted_batch[self.reward_feature].values

        # if cost control bandit
        cost = extracted_batch[self.cost_feature].values if self.cost_feature is not None else None

        # contextual information, scaled when a scaler is configured
        x_scale = self._scale_context(extracted_batch, fit=is_train)

        policy_information = extracted_batch[policy_information_cols]

        # reward probability distribution as used during simulation process if available
        ground_truth = extracted_batch[self.true_reward_feature] if self.true_reward_feature else None

        # propensity_score may be available from simulation: propensity_score is added to the dict
        propensity_score = (
            extracted_batch[self.propensity_score_feature].values if self.propensity_score_feature else None
        )
        actions = action_label_encoder.transform(action_ids)

        # Store information in a dictionary as required by obp package
        data_batch = {
            "n_rounds": len(action_ids),  # number of samples
            "n_action": len(unique_actions),  # number of actions
            "unique_actions": unique_actions,  # list of actions in the whole data set
            "action_ids": action_ids,  # action identifiers
            "action": actions,  # encoded action identifiers
            "reward": reward,  # samples' reward
            "propensity_score": propensity_score,  # propensity score, pi_b(a|x), vector
            "context": x_scale,  # the matrix of features i.e. context
            "data": policy_information,  # data array with informative features
            "ground_truth": ground_truth,  # true reward probability for each action and samples, list of list
            "cost": cost,  # samples' action cost for bandit with cost control
        }
        if is_train:
            self._train_data = data_batch
        else:
            self._test_data = data_batch
def _estimate_propensity_score(self):
    """
    Compute/approximate propensity score based on different methods in the train and test set.
    Different approaches may be evaluated when logging policy is unknown.

    The result is written to the "propensity_score" entry of both
    ``self._train_data`` and ``self._test_data``. Without contextual features
    the requested method is overridden and every score is set to 1.
    """
    if not self.contextual_features:
        # if no contextual features, propensity score is directly defined by the action taken,
        # thus uniformly set to 1
        train_propensity_score = np.ones(self._train_data["n_rounds"])
        test_propensity_score = np.ones(self._test_data["n_rounds"])
        logger.warning(
            f"No contextual features available, "
            f"overriding the requested propensity_score_model_type={self.propensity_score_model_type} "
            f"using uniform propensity score"
        )
    elif self.propensity_score_model_type == "batch_empirical":
        if self.verbose:
            logger.info("Data batch-empirical estimation of propensity score.")

        # Empirical approach: action frequencies computed within each batch
        train_propensity_score = self._empirical_averaged_propensity_score(self._train_data)
        test_propensity_score = self._empirical_averaged_propensity_score(self._test_data)

    elif self.propensity_score_model_type == "empirical":
        if self.verbose:
            logger.info("Data empirical estimation of propensity score.")

        # Empirical approach: action frequencies computed over the whole data set
        train_propensity_score = self._empirical_propensity_score(self._train_data)
        test_propensity_score = self._empirical_propensity_score(self._test_data)

    elif self.propensity_score_model_type == "propensity_score":
        if self.verbose:
            logger.info("Data given value of propensity score.")

        train_propensity_score = self._train_data["propensity_score"]
        test_propensity_score = self._test_data["propensity_score"]

    else:  # self.propensity_score_model_type in ["gbm", "rf", "logreg", "mlp"]
        if self.verbose:
            logger.info(
                f"Data prediction of propensity score based on {self.propensity_score_model_type} model."
            )
        propensity_score_estimator = _FunctionEstimator(
            estimator_type=self.propensity_score_model_type,
            fast_fit=self.fast_fit,
            action_one_hot_encoder=self._action_one_hot_encoder,
            n_trials=self.n_trials,
            verbose=self.verbose,
            study_name=f"{self.propensity_score_model_type}_propensity_score",
            multi_action_prediction=False,
        )
        propensity_score_estimator.fit(X=self._train_data, y=self._train_data["action"])
        # Predictions are bounded away from zero with the class-level epsilon.
        train_propensity_score = np.clip(
            propensity_score_estimator.predict(self._train_data), self._propensity_score_epsilon, 1
        )
        test_propensity_score = np.clip(
            propensity_score_estimator.predict(self._test_data), self._propensity_score_epsilon, 1
        )
    self._train_data["propensity_score"] = train_propensity_score
    self._test_data["propensity_score"] = test_propensity_score
+ + Reference: Balanced Off-Policy Evaluation in General Action Spaces (Sondhi, Arbour, and Dimmery, 2020) + https://arxiv.org/pdf/1906.03694 + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit to be evaluated + + Return + ------ + expected_importance_weights : np.ndarray + estimated importance weights + """ + if self.verbose: + logger.info(f"Data prediction of importance weights based on {self.importance_weights_model_type} model.") + + importance_weights_model = _FunctionEstimator( + estimator_type=self.importance_weights_model_type, + fast_fit=self.fast_fit, + action_one_hot_encoder=self._action_one_hot_encoder, + n_trials=self.n_trials, + verbose=self.verbose, + study_name=f"{self.importance_weights_model_type}_importance_weights", + multi_action_prediction=False, + ) + train_data = deepcopy(self._train_data) + mab_data = self._train_data["context"] if self.contextual_features else self._train_data["n_rounds"] + selected_actions = _mab_predict(mab, mab_data) + train_data["action_ids"] = np.concatenate((train_data["action_ids"], selected_actions), axis=0) + train_data["context"] = np.concatenate((train_data["context"], train_data["context"]), axis=0) + y = np.concatenate((np.zeros(len(selected_actions)), np.ones(len(selected_actions))), axis=0) + importance_weights_model.fit(X=train_data, y=y) + + # predict in test set + estimated_proba = importance_weights_model.predict(self._test_data) + expected_importance_weights = estimated_proba / (1 - estimated_proba) + return expected_importance_weights + + def _estimate_policy( + self, + mab: BaseMab, + n_mc_experiments: PositiveInt = 1000, + n_cores: Optional[NonNegativeInt] = None, + ) -> np.ndarray: + """ + Estimate policy via Monte Carlo (MC) sampling based on sampling distribution of each action a in the test set. 
+ + Reference: Estimation Considerations in Contextual Bandit + https://arxiv.org/pdf/1711.07077.pdf + Reference: Debiased Off-Policy Evaluation for Recommendation Systems + https://arxiv.org/pdf/2002.08536.pdf + Reference: CAB: Continuous Adaptive Blending for Policy Evaluation and Learning + https://arxiv.org/pdf/1811.02672.pdf + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit to be evaluated + n_mc_experiments: PositiveInt + Number of MC sampling rounds. Default: 1000 + n_cores: Optional[NonNegativeInt], all available cores if not specified + Number of cores used for multiprocessing + + Returns + ------- + estimated_policy: np.ndarray (nb samples, nb actions) + action probabilities for each sample and action + """ + if self.verbose: + logger.info("Data prediction of expected policy based on Monte Carlo experiments.") + n_cores = n_cores or cpu_count() + + # using MC, create a (#mc experiments X #samples) best actions matrix + mc_actions = [] + mab_data = self._test_data["context"] if self.contextual_features else self._test_data["n_rounds"] + predict_func = partial(_mab_predict, mab, mab_data) + with Pool(processes=n_cores) as pool: + # predict best action for a new prior parameters draw + # using argmax(p(r|a, x)) with a in the list of actions + for mc_action in tqdm(pool.imap_unordered(predict_func, range(n_mc_experiments))): + mc_actions.append(mc_action) + + # finalize the dataframe shape to #samples X #mc experiments + mc_actions = pd.DataFrame(mc_actions).T + + # for each sample / each action, count the occurrence frequency during MC iteration + estimated_policy = np.zeros((self._test_data["n_rounds"], len(self._test_data["unique_actions"]))) + mc_action_counts = mc_actions.apply(pd.Series.value_counts, axis=1).fillna(0) + + for u in tqdm(range(self._test_data["n_rounds"])): + estimated_policy[u, :] = ( + mc_action_counts.iloc[u, :].reindex(self._test_data["unique_actions"], fill_value=0).values + / mc_actions.shape[1] + ) + return estimated_policy + + def evaluate( 
+ self, + mab: BaseMab, + n_mc_experiments: int = 1000, + save_path: Optional[str] = None, + visualize: bool = True, + ) -> pd.DataFrame: + """ + Execute the OPE process with multiple estimators simultaneously. + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit model to be evaluated + n_mc_experiments : int + Number of Monte Carlo experiments for policy estimation + save_path : Optional[str], defaults to None. + Path to save the results. Nothing is saved if not specified. + visualize : bool, defaults to True. + Whether to visualize the results of the OPE process + + Returns + ------- + estimated_policy_value_df : pd.DataFrame + Estimated policy values and confidence intervals + """ + if visualize and not save_path and not in_jupyter_notebook(): + raise ValueError("save_path is required for visualization when not running in a Jupyter notebook") + + # Define OPE keyword arguments + kwargs = {} + if self._check_argument_required_by_estimators("action", self.ope_estimators): + kwargs["action"] = self._test_data["action"] + if self._check_argument_required_by_estimators("estimated_policy", self.ope_estimators): + kwargs["estimated_policy"] = self._estimate_policy(mab=mab, n_mc_experiments=n_mc_experiments) + if self._check_argument_required_by_estimators("propensity_score", self.ope_estimators): + kwargs["propensity_score"] = self._test_data["propensity_score"] + if self._check_argument_required_by_estimators("expected_importance_weight", self.ope_estimators): + kwargs["expected_importance_weight"] = self._estimate_importance_weight(mab) + + # Instantiate class to conduct OPE by multiple estimators simultaneously + multi_objective_estimated_policy_value_df = pd.DataFrame() + results = {"value": [], "lower": [], "upper": [], "std": [], "estimator": [], "objective": []} + for reward_feature in self.reward_feature: + if self.verbose: + logger.info(f"Offline Policy Evaluation for {reward_feature}.") + + if self._check_argument_required_by_estimators("reward", 
self.ope_estimators): + kwargs["reward"] = self._test_data["reward"][:, self.reward_feature.index(reward_feature)] + if self._check_argument_required_by_estimators("expected_reward", self.ope_estimators): + kwargs["expected_reward"] = self._estimated_expected_reward[reward_feature] + + # Summarize policy values and their confidence intervals estimated by OPE estimators + for ope_estimator in self.ope_estimators: + estimated_policy_value, low, high, std = ope_estimator.estimate_policy_value_with_confidence_interval( + **kwargs, + ) + results["value"].append(estimated_policy_value) + results["lower"].append(low) + results["upper"].append(high) + results["std"].append(std) + results["estimator"].append(ope_estimator.name) + results["objective"].append(reward_feature) + + multi_objective_estimated_policy_value_df = pd.concat( + [multi_objective_estimated_policy_value_df, pd.DataFrame.from_dict(results)], + axis=0, + ) + if save_path: + multi_objective_estimated_policy_value_df.to_csv(os.path.join(save_path, "estimated_policy_value.csv")) + + if visualize: + self._visualize_results(save_path, multi_objective_estimated_policy_value_df) + + return multi_objective_estimated_policy_value_df + + def update_and_evaluate( + self, + mab: BaseMab, + n_mc_experiments: int = 1000, + save_path: Optional[str] = None, + visualize: bool = True, + with_test: bool = False, + ) -> pd.DataFrame: + """ + Execute update of the multi-armed bandit based on the logged data, + followed by the OPE process with multiple estimators simultaneously. + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit model to be updated and evaluated + n_mc_experiments : int + Number of Monte Carlo experiments for policy estimation + save_path : Optional[str] + Path to save the results. Nothing is saved if not specified. 
+ visualize : bool + Whether to visualize the results of the OPE process + with_test : bool + Whether to update the bandit model with the test data + + Returns + ------- + estimated_policy_value_df : pd.DataFrame + Estimated policy values + """ + self._update_mab(mab, self._train_data) + if with_test: + self._update_mab(mab, self._test_data) + estimated_policy_value_df = self.evaluate(mab, n_mc_experiments, save_path, visualize) + return estimated_policy_value_df + + def _update_mab(self, mab: BaseMab, data: Dict[str, Any]): + """ + Update the multi-armed bandit model based on the logged data. + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit model to be updated. + data : Dict[str, Any] + Data used to update the bandit model. + """ + if self.verbose: + logger.info(f"Offline policy update for {type(mab)}.") + kwargs = {"context": data["context"]} if self.contextual_features else {} + mab.update(actions=data["action_ids"].tolist(), rewards=np.squeeze(data["reward"]).tolist(), **kwargs) + + def _visualize_results(self, save_path: Optional[str], multi_objective_estimated_policy_value_df: pd.DataFrame): + """ + Visualize the results of the OPE process. + + Parameters + ---------- + save_path : Optional[str] + Path to save the visualization results. Required if not running in a Jupyter notebook. 
+ multi_objective_estimated_policy_value_df : pd.DataFrame + Estimated policy values and confidence intervals + """ + + tabs = [] + grouped_df = multi_objective_estimated_policy_value_df.groupby("objective") + tools = "crosshair, pan, wheel_zoom, box_zoom, reset, hover, save" + + tooltips = [ + ("Estimator", "@estimator"), + ("Estimated policy value", "@value"), + ("Lower CI", "@lower"), + ("Upper CI", "@upper"), + ] + for group_name, estimated_interval_df in grouped_df: + source = ColumnDataSource( + data=dict( + estimator=estimated_interval_df["estimator"], + value=estimated_interval_df["value"], + lower=estimated_interval_df["lower"], + upper=estimated_interval_df["upper"], + ) + ) + fig = figure( + title=f"Policy value estimates for {group_name} objective", + x_axis_label="Estimator", + y_axis_label="Estimated policy value (\u00b1 CI)", + sizing_mode="inherit", + x_range=source.data["estimator"], + tools=tools, + tooltips=tooltips, + ) + fig.vbar(x="estimator", top="value", width=0.9, source=source) + + # Add error bars for confidence intervals + fig.segment( + x0="estimator", y0="lower", x1="estimator", y1="upper", source=source, line_width=2, color="black" + ) # error bar line + fig.vbar( + x="estimator", width=0.1, bottom="lower", top="upper", source=source, color="black" + ) # error bar cap + + fig.xgrid.grid_line_color = None + + tabs.append(TabPanel(child=fig, title=f"{group_name}")) + + output_path = os.path.join(save_path, "multi_objective_estimated_policy.html") if save_path else None + visualize_via_bokeh(tabs=tabs, output_path=output_path) + + +def _mab_predict(mab: BaseMab, mab_data: Union[np.ndarray, PositiveInt], mc_experiment: int = 0) -> List[ActionId]: + """ + Predict the actions selected by the bandit in the test set + + Parameters + ---------- + mab : BaseMab + Multi-armed bandit model + mab_data : Union[np.ndarray, PositiveInt] + test data passed to the bandit model for prediction; context or number of samples. 
+ mc_experiment : int + placeholder for multiprocessing + + Returns + ------- + actions: List[ActionId] of shape (n_samples,) + The actions selected by the multi-armed bandit model. + """ + mab_output = mab.predict(context=mab_data) if type(mab_data) is np.ndarray else mab.predict(n_samples=mab_data) + actions = mab_output[0] + return actions diff --git a/pybandits/pydantic_version_compatibility.py b/pybandits/pydantic_version_compatibility.py index a119264..76a8a07 100644 --- a/pybandits/pydantic_version_compatibility.py +++ b/pybandits/pydantic_version_compatibility.py @@ -27,7 +27,19 @@ from typing import Any, Callable, Dict, Literal, Optional, Union from warnings import warn -from pydantic import BaseModel, Field, NonNegativeFloat, PositiveInt, ValidationError, confloat, conint, constr +from pydantic import ( + BaseModel, + Field, + NonNegativeFloat, + NonNegativeInt, + PositiveFloat, + PositiveInt, + PrivateAttr, + ValidationError, + confloat, + conint, + constr, +) from pydantic.version import VERSION as _VERSION # Define the pydantic versions @@ -258,6 +270,8 @@ def _convert_config_param(config: Dict[str, Any], v2_name: str, v1_name: str) -> "model_validator", "validate_call", "NonNegativeFloat", + "NonNegativeInt", + "PositiveFloat", "PositiveInt", "BaseModel", "ValidationError", @@ -265,4 +279,5 @@ def _convert_config_param(config: Dict[str, Any], v2_name: str, v1_name: str) -> "conint", "constr", "Field", + "PrivateAttr", ] diff --git a/pybandits/strategy.py b/pybandits/strategy.py index a67be09..c1b33d8 100644 --- a/pybandits/strategy.py +++ b/pybandits/strategy.py @@ -19,14 +19,18 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
- from abc import ABC, abstractmethod from random import random +from sys import version_info from typing import Any, Dict, List, Optional, Union import numpy as np from scipy.stats import ttest_ind_from_stats -from typing_extensions import Self + +if version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self from pybandits.base import ActionId, Float01, Probability, PyBanditsBaseModel from pybandits.model import Beta, BetaMOCC, Model diff --git a/pybandits/utils.py b/pybandits/utils.py index d0577b5..3d703df 100644 --- a/pybandits/utils.py +++ b/pybandits/utils.py @@ -1,5 +1,12 @@ +import inspect import json -from typing import Any, Callable, Dict, List, Union +from abc import ABC +from types import ModuleType +from typing import Any, Callable, Dict, List, Optional, Union + +from bokeh.io import curdoc, output_file, output_notebook, save, show +from bokeh.models import InlineStyleSheet, TabPanel, Tabs +from IPython import get_ipython from pybandits.pydantic_version_compatibility import validate_call @@ -42,3 +49,74 @@ def extract_argument_names_from_function(function_handle: Callable, is_class_met start_index = int(is_class_method) argument_names = function_handle.__code__.co_varnames[start_index : function_handle.__code__.co_argcount] return argument_names + + +@validate_call(config=dict(arbitrary_types_allowed=True)) +def get_non_abstract_classes(module: ModuleType) -> List[type]: + non_abc_classes = [] + for name, obj in inspect.getmembers(module, inspect.isclass): + # Ensure the class is defined in the module and not imported + if obj.__module__ == module.__name__: + # Check if the class is not an abstract class (i.e., doesn't inherit from abc.ABC) + if not inspect.isabstract(obj) and ABC not in obj.__bases__: + non_abc_classes.append(obj) + return non_abc_classes + + +def in_jupyter_notebook() -> bool: + """ + Check if the code is running in a Jupyter notebook. 
+ + Reference: https://stackoverflow.com/a/39662359 + + Returns + ------- + bool + True if the code is running in a Jupyter notebook, False otherwise. + """ + + try: + shell = get_ipython().__class__.__name__ + + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + + else: + return False # Other type (likely shouldn't happen) + + except NameError: + return False # Probably standard Python interpreter + + +def visualize_via_bokeh(output_path: Optional[str], tabs: List[TabPanel]): + """ + Visualize output to either a Jupyter notebook or an HTML file. + + Parameters + ---------- + output_path : Optional[str] + Path to the output file. Required if not running in a Jupyter notebook. + """ + + if in_jupyter_notebook(): + output_notebook() + else: + if output_path is None: + raise ValueError("output_path is required when not running in a Jupyter notebook.") + output_file(output_path) + + # Add an inline stylesheet to the Bokeh layout so the tab headers can wrap + css = """ + :host(.bk-Tabs) .bk-header { + flex-wrap: wrap !important; + } + """ + stylesheet = InlineStyleSheet(css=css) + curdoc().title = "Visual report" + if in_jupyter_notebook(): + show(Tabs(tabs=tabs, stylesheets=[stylesheet])) + else: + save(Tabs(tabs=tabs, sizing_mode="stretch_both", stylesheets=[stylesheet])) diff --git a/pyproject.toml b/pyproject.toml index 91bf9fa..65f6303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pybandits" -version = "1.0.2" +version = "1.2.0" description = "Python Multi-Armed Bandit Library" authors = [ "Dario d'Andrea ", @@ -11,28 +11,41 @@ authors = [ ] license = "MIT License" readme = "README.md" +homepage = "https://github.com/PlaytikaOSS/pybandits" +repository = "https://github.com/PlaytikaOSS/pybandits" +keywords = ["multi-armed bandit", "reinforcement-learning", "optimization"] [tool.poetry.dependencies] -python = ">=3.8.1,<3.12" 
+python = ">=3.8.1,<3.13" loguru = "^0.6" -numpy = "^1.23" +numpy = [ + { version = "<1.25", python = "3.8.*" }, + { version = ">=1.25", python = ">=3.9,<3.12" }, + { version = ">=1.26", python = "3.12.*" }, +] pydantic = ">=1.10" -scipy = "^1.9" -pymc = "^5.3" +scipy = [ + { version = ">=1.9,<1.13", python = ">=3.8,<3.12" }, + { version = ">=1.11,<1.13", python = "3.12.*" }, +] +pymc = [ + { version = "^5.3", python = "3.8.*" }, + { version = "^5.10", python = ">=3.9" }, +] scikit-learn = "^1.1" +optuna = "^3.6" +bokeh = "^3.1" [tool.poetry.group.dev.dependencies] hypothesis = "^6.68.2" -pytest = "^7.2.2" +pytest = "^8.3.3" tox = "^4.4.7" -pandas = "^1.5.3" +pandas = ">=1.5.3" pre-commit = "^3.1.1" -nbdev = "^2.3.12" -rich = "^13.3.2" -pyzmq = "25.0.0" +nbstripout = "^0.7.1" ipykernel = "^6.21.3" jupyterlab = "^3.6.1" -pytest-cov = "^4.0.0" +pytest-cov = "^5.0.0" pytest_mock = "^3.14.0" ruff = "^0.5.6" diff --git a/tests/test_cmab.py b/tests/test_cmab.py index fdf2173..3b365ed 100644 --- a/tests/test_cmab.py +++ b/tests/test_cmab.py @@ -159,7 +159,7 @@ def test_cmab_init_with_wrong_blr_models(n_features, other_n_features, update_me ) -@settings(deadline=60000) +@settings(deadline=None) @given(st.just(100), st.just(3), st.sampled_from(literal_update_methods)) def test_cmab_update(n_samples, n_features, update_method): actions = np.random.choice(["a1", "a2"], size=n_samples).tolist() @@ -200,7 +200,7 @@ def run_update(context): run_update(context=context) -@settings(deadline=10000) +@settings(deadline=None) @given(st.just(100), st.just(3), st.sampled_from(literal_update_methods)) def test_cmab_update_not_all_actions(n_samples, n_feat, update_method): actions = np.random.choice(["a3", "a4"], size=n_samples).tolist() @@ -547,7 +547,7 @@ def test_cmab_bai_predict(n_samples, n_features): assert len(selected_actions) == len(probs) == len(weighted_sums) == n_samples -@settings(deadline=10000) +@settings(deadline=None) @given(st.just(100), st.just(3), 
st.sampled_from(literal_update_methods)) def test_cmab_bai_update(n_samples, n_features, update_method): actions = np.random.choice(["a1", "a2"], size=n_samples).tolist() @@ -783,7 +783,7 @@ def test_cmab_cc_predict(n_samples, n_features): assert len(selected_actions) == len(probs) == len(weighted_sums) == n_samples -@settings(deadline=10000) +@settings(deadline=None) @given(st.just(100), st.just(3), st.sampled_from(literal_update_methods)) def test_cmab_cc_update(n_samples, n_features, update_method): actions = np.random.choice(["a1", "a2"], size=n_samples).tolist() diff --git a/tests/test_offline_policy_estimator.py b/tests/test_offline_policy_estimator.py new file mode 100644 index 0000000..d345413 --- /dev/null +++ b/tests/test_offline_policy_estimator.py @@ -0,0 +1,162 @@ +from typing import Tuple +from unittest import mock + +import numpy as np +import pytest +from hypothesis import assume, given +from hypothesis import strategies as st +from hypothesis.extra.numpy import arrays + +from pybandits import offline_policy_estimator +from pybandits.offline_policy_estimator import BaseOfflinePolicyEstimator +from pybandits.utils import get_non_abstract_classes + + +@st.composite +def invalid_inputs(draw, n_samples: int = 10, n_actions: int = 2): + reward = None + propensity_score = None + estimated_policy = None + expected_reward = None + expected_importance_weight = None + bad_argument = draw( + st.sampled_from( + [ + "action", + "reward", + "propensity_score", + "estimated_policy", + "expected_reward", + "expected_importance_weight", + ] + ) + ) + if bad_argument == "action": + action = draw(arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, n_actions - 1))) + else: + action = draw(arrays(dtype=int, shape=(n_samples,), elements=st.integers(0, n_actions - 1))) + assume(np.unique(action).size == n_actions) + if bad_argument == "reward": + reward = draw( + st.one_of( + arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)), + 
arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)), + arrays( + dtype=int, + shape=(n_samples - 1,), + elements=st.integers(0, 1), + ), + arrays( + dtype=int, + shape=(n_samples + 1,), + elements=st.integers(0, 1), + ), + ) + ) + elif bad_argument in ("propensity_score", "expected_importance_weight"): + random_value = draw( + st.one_of( + arrays(dtype=float, shape=(n_samples, 2), elements=st.floats(0, 1)), + arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 0)), + arrays(dtype=int, shape=(n_samples,), elements=st.integers(0, 1)), + arrays( + dtype=float, + shape=(n_samples - 1,), + elements=st.floats(0, 1), + ), + arrays( + dtype=float, + shape=(n_samples + 1,), + elements=st.floats(0, 1), + ), + ) + ) + + if bad_argument == "propensity_score": + propensity_score = random_value + elif bad_argument == "expected_importance_weight": + expected_importance_weight = random_value + elif bad_argument == "estimated_policy": + estimated_policy = draw( + st.one_of( + arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)), + arrays(dtype=float, shape=(n_samples, 2), elements=st.floats(0, 0)), + arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)), + arrays( + dtype=float, + shape=(n_samples - 1, 1), + elements=st.floats(0, 1), + ), + arrays( + dtype=float, + shape=(n_samples + 1, 1), + elements=st.floats(0, 1), + ), + ) + ) + elif bad_argument == "expected_reward": + expected_reward = draw( + st.one_of( + arrays(dtype=float, shape=(n_samples,), elements=st.floats(0, 1)), + arrays(dtype=int, shape=(n_samples, 2), elements=st.integers(0, 1)), + arrays( + dtype=float, + shape=(n_samples - 1, 1), + elements=st.floats(0, 1), + ), + arrays( + dtype=float, + shape=(n_samples + 1, 1), + elements=st.floats(0, 1), + ), + ) + ) + else: + raise ValueError(f"Invalid bad_argument: {bad_argument}") + return action, reward, propensity_score, estimated_policy, expected_reward, expected_importance_weight + + 
+@mock.patch.multiple(BaseOfflinePolicyEstimator, __abstractmethods__=set()) +@given(invalid_inputs()) +def test_shape_mismatches( + inputs: Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray], +): + action, reward, propensity_score, estimated_policy, expected_reward, expected_importance_weight = inputs + estimator = BaseOfflinePolicyEstimator() + kwargs = {} + if reward is not None: + kwargs["reward"] = reward + if propensity_score is not None: + kwargs["propensity_score"] = propensity_score + if estimated_policy is not None: + kwargs["estimated_policy"] = estimated_policy + if expected_reward is not None: + kwargs["expected_reward"] = expected_reward + if expected_importance_weight is not None: + kwargs["expected_importance_weight"] = expected_importance_weight + with pytest.raises(ValueError): + estimator._check_inputs(action=action, **kwargs) + + +@given( + arrays(dtype=int, shape=(10,), elements=st.integers(0, 1)), + arrays(dtype=int, shape=(10,), elements=st.integers(0, 1)), + arrays(dtype=float, shape=(10,), elements=st.floats(0.01, 1)), + arrays(dtype=float, shape=(10, 2), elements=st.floats(0.01, 1)), + arrays(dtype=float, shape=(10, 2), elements=st.floats(0, 1)), + arrays(dtype=float, shape=(10,), elements=st.floats(0.01, 1)), +) +def test_default_estimators( + action, reward, propensity_score, estimated_policy, expected_reward, expected_importance_weight +): + if np.unique(action).size > 1: + estimators = [class_() for class_ in get_non_abstract_classes(offline_policy_estimator)] + for estimator in estimators: + estimator.estimate_policy_value_with_confidence_interval( + action=action, + reward=reward, + propensity_score=propensity_score, + estimated_policy=estimated_policy, + expected_reward=expected_reward, + expected_importance_weight=expected_importance_weight, + ) diff --git a/tests/test_offline_policy_evaluator.py b/tests/test_offline_policy_evaluator.py new file mode 100644 index 0000000..6067529 --- /dev/null +++ 
b/tests/test_offline_policy_evaluator.py @@ -0,0 +1,300 @@ +from tempfile import TemporaryDirectory +from typing import Dict, List, Optional, Union, get_args, get_type_hints + +import numpy as np +import pandas as pd +import pytest +from hypothesis import given, settings +from hypothesis import strategies as st +from matplotlib.pyplot import close +from pydantic import PositiveInt +from pytest_mock import MockerFixture +from sklearn.base import TransformerMixin +from sklearn.preprocessing import MinMaxScaler + +from pybandits.cmab import CmabBernoulli, CmabBernoulliCC +from pybandits.offline_policy_estimator import BaseOfflinePolicyEstimator +from pybandits.offline_policy_evaluator import OfflinePolicyEvaluator +from pybandits.smab import ( + SmabBernoulli, + SmabBernoulliCC, + SmabBernoulliMO, + SmabBernoulliMOCC, +) + + +@pytest.fixture(scope="module") +def logged_data(n_samples=10, n_actions=2, n_batches=3, n_rewards=2, n_groups=2, n_features=3): + unique_actions = [f"a{i}" for i in range(n_actions)] + action_ids = np.random.choice(unique_actions, n_samples * n_batches) + batches = [i for i in range(n_batches) for _ in range(n_samples)] + rewards = [np.random.randint(2, size=(n_samples * n_batches)) for _ in range(n_rewards)] + action_true_rewards = {(a, r): np.random.rand() for a in unique_actions for r in range(n_rewards)} + true_rewards = [ + np.array([action_true_rewards[(a, r)] for a in action_ids]).reshape(n_samples * n_batches) + for r in range(n_rewards) + ] + groups = np.random.randint(n_groups, size=n_samples * n_batches) + action_costs = {action: np.random.rand() for action in unique_actions} + costs = np.array([action_costs[a] for a in action_ids]) + context = np.random.rand(n_samples * n_batches, n_features) + action_propensity_score = {action: np.random.rand() for action in unique_actions} + propensity_score = np.array([action_propensity_score[a] for a in action_ids]) + return pd.DataFrame( + { + "batch": batches, + "action_id": action_ids, + 
"cost": costs, + "group": groups, + **{f"reward_{r}": rewards[r] for r in range(n_rewards)}, + **{f"true_reward_{r}": true_rewards[r] for r in range(n_rewards)}, + **{f"context_{i}": context[:, i] for i in range(n_features)}, + "propensity_score": propensity_score, + } + ) + + +# validate failure for empty logged_data +def test_empty_logged_data( + split_prop=0.5, + n_trials=10, + verbose=False, + batch_feature="batch", + action_feature="action_id", + reward_feature="reward", + propensity_score_model_type="empirical", + expected_reward_model_type="logreg", + importance_weights_model_type="logreg", +): + with pytest.raises(AttributeError): + OfflinePolicyEvaluator( + logged_data=pd.DataFrame(), + split_prop=split_prop, + propensity_score_model_type=propensity_score_model_type, + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + n_trials=n_trials, + ope_metrics=None, + batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + verbose=verbose, + ) + + +@pytest.mark.usefixtures("logged_data") +@given( + split_prop=st.sampled_from([0.0, 1.0]), + n_trials=st.just(10), + ope_metrics=st.just(None), + verbose=st.just(False), + batch_feature=st.just("batch"), + action_feature=st.just("action_id"), + reward_feature=st.just("reward_0"), + propensity_score_model_type=st.just("empirical"), + expected_reward_model_type=st.just("logreg"), + importance_weights_model_type=st.just("logreg"), +) +# validate failure for extreme split_prop values +def test_initialization_extreme_split_prop( + logged_data: MockerFixture, + split_prop: float, + n_trials: PositiveInt, + ope_metrics: Optional[List[BaseOfflinePolicyEstimator]], + verbose: bool, + batch_feature: str, + action_feature: str, + reward_feature: str, + propensity_score_model_type: str, + expected_reward_model_type: str, + importance_weights_model_type: str, +): + with pytest.raises(ValueError): + OfflinePolicyEvaluator( + 
logged_data=logged_data, + split_prop=split_prop, + propensity_score_model_type=propensity_score_model_type, + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + n_trials=n_trials, + ope_metrics=ope_metrics, + batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + true_reward_feature=reward_feature, + verbose=verbose, + ) + + +# validate failure for invalid initialization parameters +def test_initialization_mismatches( + logged_data: MockerFixture, + split_prop=0.5, + n_trials=10, + ope_metrics=None, + verbose=False, + batch_feature="batch", + action_feature="action_id", + reward_feature="reward_0", + propensity_score_model_type="empirical", + expected_reward_model_type="logreg", + importance_weights_model_type="logreg", +): + # more true_reward_features than rewards + with pytest.raises(ValueError): + OfflinePolicyEvaluator( + logged_data=logged_data, + split_prop=split_prop, + propensity_score_model_type=propensity_score_model_type, + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + n_trials=n_trials, + ope_metrics=ope_metrics, + batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + true_reward_feature=[reward_feature, reward_feature], + verbose=verbose, + ) + # missing propensity_score_feature + with pytest.raises(ValueError): + OfflinePolicyEvaluator( + logged_data=logged_data, + split_prop=split_prop, + propensity_score_model_type="propensity_score", + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + n_trials=n_trials, + ope_metrics=ope_metrics, + batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + visualize=False, + ) + # missing context + with pytest.raises(AttributeError): + OfflinePolicyEvaluator( + 
logged_data=logged_data, + split_prop=split_prop, + propensity_score_model_type=propensity_score_model_type, + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + n_trials=n_trials, + ope_metrics=ope_metrics, + batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + verbose=False, + contextual_features=["non_existent"], + ) + + +@pytest.mark.usefixtures("logged_data") +@settings(deadline=None) +@given( + split_prop=st.just(0.5), + n_trials=st.just(10), + fast_fit=st.booleans(), + scaler=st.sampled_from([None, MinMaxScaler()]), + verbose=st.booleans(), + visualize=st.booleans(), + propensity_score_model_type=st.sampled_from( + get_args(get_type_hints(OfflinePolicyEvaluator)["propensity_score_model_type"]) + ), + expected_reward_model_type=st.sampled_from( + get_args(get_type_hints(OfflinePolicyEvaluator)["expected_reward_model_type"]) + ), + importance_weights_model_type=st.sampled_from( + get_args(get_type_hints(OfflinePolicyEvaluator)["importance_weights_model_type"]) + ), + batch_feature=st.just("batch"), + action_feature=st.just("action_id"), + reward_feature=st.sampled_from(["reward_0", ["reward_0", "reward_1"]]), + context=st.booleans(), + group_feature=st.sampled_from(["group", None]), + cost_feature=st.sampled_from(["cost", None]), + propensity_score_feature=st.just("propensity_score"), + n_mc_experiments=st.just(2), + update=st.booleans(), +) +# test various OfflinePolicyEvaluator configurations to validate that everything works +def test_running_configuration( + logged_data: MockerFixture, + split_prop: float, + n_trials: PositiveInt, + fast_fit: bool, + scaler: Optional[Union[TransformerMixin, Dict[str, TransformerMixin]]], + verbose: bool, + visualize: bool, + propensity_score_model_type: str, + expected_reward_model_type: str, + importance_weights_model_type: str, + batch_feature: str, + action_feature: str, + reward_feature: Union[str, 
List[int]], + context: bool, + group_feature: Optional[str], + cost_feature: Optional[str], + propensity_score_feature: Optional[str], + n_mc_experiments: int, + update: bool, +): + if context and type(reward_feature) is List: + pass # CmabMO and CmabMOCC are not supported yet + true_reward_feature = ( + f"true_{reward_feature}" if isinstance(reward_feature, str) else [f"true_{r}" for r in reward_feature] + ) + contextual_features = [col for col in logged_data.columns if col.startswith("context")] if context else None + unique_actions = logged_data["action_id"].unique() + if cost_feature: + action_ids_cost = { + action_id: logged_data["cost"][logged_data["action_id"] == action_id].iloc[0] + for action_id in unique_actions + } + if context: + if cost_feature: + if type(reward_feature) is list: + return # CmabMOCC is not supported yet + else: + mab = CmabBernoulliCC.cold_start(action_ids_cost=action_ids_cost, n_features=len(contextual_features)) + else: + if type(reward_feature) is list: + return # CmabMO is not supported yet + else: + mab = CmabBernoulli.cold_start(action_ids=set(unique_actions), n_features=len(contextual_features)) + else: + if cost_feature: + if type(reward_feature) is list: + mab = SmabBernoulliMOCC.cold_start(action_ids_cost=action_ids_cost, n_objectives=len(reward_feature)) + else: + mab = SmabBernoulliCC.cold_start(action_ids_cost=action_ids_cost) + else: + if type(reward_feature) is list: + mab = SmabBernoulliMO.cold_start(action_ids=set(unique_actions), n_objectives=len(reward_feature)) + else: + mab = SmabBernoulli.cold_start(action_ids=set(unique_actions)) + evaluator = OfflinePolicyEvaluator( + logged_data=logged_data, + split_prop=split_prop, + n_trials=n_trials, + fast_fit=fast_fit, + scaler=scaler, + ope_estimators=None, + verbose=verbose, + propensity_score_model_type=propensity_score_model_type, + expected_reward_model_type=expected_reward_model_type, + importance_weights_model_type=importance_weights_model_type, + 
batch_feature=batch_feature, + action_feature=action_feature, + reward_feature=reward_feature, + true_reward_feature=true_reward_feature, + contextual_features=contextual_features, + group_feature=group_feature, + cost_feature=cost_feature, + propensity_score_feature=propensity_score_feature, + ) + execution_func = evaluator.update_and_evaluate if update else evaluator.evaluate + with TemporaryDirectory() as tmp_dir: + execution_func(mab=mab, visualize=visualize, n_mc_experiments=n_mc_experiments, save_path=tmp_dir) + if visualize: + close("all") # close all figures to avoid memory leak