From 973b00d3ddcebb97d66e32fdb1e8d2434b708cf9 Mon Sep 17 00:00:00 2001 From: teddygroves <groves.teddy@gmail.com> Date: Tue, 23 May 2023 13:14:07 +0200 Subject: [PATCH 1/5] delete unused global variables in data preparation file --- .../examples/baseball/baseball/data_preparation.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/bibat/examples/baseball/baseball/data_preparation.py b/bibat/examples/baseball/baseball/data_preparation.py index 5b26c45..d427d57 100644 --- a/bibat/examples/baseball/baseball/data_preparation.py +++ b/bibat/examples/baseball/baseball/data_preparation.py @@ -9,23 +9,14 @@ import pandas as pd import pandera as pa -from baseball.util import CoordDict from pandera.typing import DataFrame, Series from pydantic.dataclasses import dataclass +from baseball.util import CoordDict + NAME_FILE = "name.txt" COORDS_FILE = "coords.json" -MEASUREMENTS_FILE = "measurements.csv" -NEW_COLNAMES = {"yButIThoughtIdAddSomeLetters": "y"} -DROPNA_COLS = ["y"] N_CV_FOLDS = 10 -DIMS = { - "b": ["covariate"], - "y": ["observation"], - "yrep": ["observation"], - "llik": ["observation"], -} - HERE = os.path.dirname(__file__) DATA_DIR = os.path.join(HERE, "..", "data") RAW_DIR = os.path.join(DATA_DIR, "raw") From 81ba05be4ea95cb2aabd06c6b9a0cc6a08624950 Mon Sep 17 00:00:00 2001 From: teddygroves <groves.teddy@gmail.com> Date: Tue, 23 May 2023 13:14:59 +0200 Subject: [PATCH 2/5] fix template conditional in makefile --- bibat/{{cookiecutter.repo_name}}/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bibat/{{cookiecutter.repo_name}}/Makefile b/bibat/{{cookiecutter.repo_name}}/Makefile index 9e73154..318dd37 100644 --- a/bibat/{{cookiecutter.repo_name}}/Makefile +++ b/bibat/{{cookiecutter.repo_name}}/Makefile @@ -35,7 +35,7 @@ $(ENV_MARKER): $(ACTIVATE_VENV) $(REQUIREMENTS_FILE) $(CMDSTAN) analysis: $(ENV_MARKER) . $(ACTIVATE_VENV) && (\ - {% if cookiecutter.create_tests_directory %}python -m pytest || exit 1; \{% endif %} + {% if cookiecutter.create_tests_directory == 'y' %}python -m pytest || exit 1; \{% endif %} python $(SRC)/prepare_data.py || exit 1; \ python $(SRC)/sample.py || exit 1; \ jupyter execute $(SRC)/investigate.ipynb || exit 1; \ From 915f2405d21501d7019c15ae82ee4da61176754c Mon Sep 17 00:00:00 2001 From: teddygroves <groves.teddy@gmail.com> Date: Tue, 23 May 2023 13:16:06 +0200 Subject: [PATCH 3/5] Update vignette --- bibat/examples/baseball/docs/report.html | 685 ++++++++++++----------- bibat/examples/baseball/docs/report.qmd | 60 +- docs/_static/report.html | 685 ++++++++++++----------- 3 files changed, 766 insertions(+), 664 deletions(-) diff --git a/bibat/examples/baseball/docs/report.html b/bibat/examples/baseball/docs/report.html index 5a2efc9..61c38e3 100644 --- a/bibat/examples/baseball/docs/report.html +++ b/bibat/examples/baseball/docs/report.html @@ -3095,7 +3095,7 @@ )) {"Symbol"in self&&"iterator"in Symbol&&"function"==typeof Array.prototype[Symbol.iterator]?CreateMethodProperty(Array.prototype,"values",Array.prototype[Symbol.iterator]):CreateMethodProperty(Array.prototype,"values",function r(){var r=ToObject(this);return new ArrayIterator(r,"value")});}if (!("Symbol"in self&&"iterator"in self.Symbol&&!!Array.prototype[self.Symbol.iterator] )) {CreateMethodProperty(Array.prototype,Symbol.iterator,Array.prototype.values);}var StringIterator=function(){var t=function(e){if(!(this instanceof t))return new t(e);e=String(e),Iterator.call(this,e),Object.defineProperty(this,"__length__",{value:e.length,configurable:!1,enumerable:!1,writable:!1})};return Object.setPrototypeOf&&Object.setPrototypeOf(t,Iterator),t.prototype=Object.create(Iterator.prototype,{constructor:{value:t,configurable:!0,enumerable:!1,writable:!0},_next:{value:function(){if(this.__list__)return this.__nextIndex__<this.__length__?this.__nextIndex__++:void this._unBind()},configurable:!0,enumerable:!1,writable:!0},_resolve:{value:function(t){var e,r=this.__list__[t];return this.__nextIndex__===this.__length__?r:(e=r.charCodeAt(0),e>=55296&&e<=56319?r+this.__list__[this.__nextIndex__++]:r)},configurable:!0,enumerable:!1,writable:!0},toString:{value:function(){return"[object String Iterator]"},configurable:!0,enumerable:!1,writable:!0}}),t}();if (!("Symbol"in self&&"iterator"in self.Symbol&&!!String.prototype[self.Symbol.iterator] )) {CreateMethodProperty(String.prototype,Symbol.iterator,function(){var r=RequireObjectCoercible(this),t=ToString(r);return new StringIterator(t)});}})('object' === typeof window && window || 'object' === typeof self && self || 'object' === typeof global && global || {});</script> - + <script> (function () { var script = document.createElement("script"); @@ -3114,7 +3114,7 @@ <div id="quarto-margin-sidebar" class="sidebar margin-sidebar"> <nav id="TOC" role="doc-toc" class="toc-active"> <h2 id="toc-title">Table of contents</h2> - + <ul> <li><a href="#setup" id="toc-setup" class="nav-link active" data-scroll-target="#setup">Setup</a></li> <li><a href="#getting-raw-data" id="toc-getting-raw-data" class="nav-link" data-scroll-target="#getting-raw-data">Getting raw data</a></li> @@ -3125,6 +3125,7 @@ <h2 id="toc-title">Table of contents</h2> <li><a href="#investigating-the-inferences" id="toc-investigating-the-inferences" class="nav-link" data-scroll-target="#investigating-the-inferences">Investigating the inferences</a></li> <li><a href="#choosing-priors-using-push-forward-calibration" id="toc-choosing-priors-using-push-forward-calibration" class="nav-link" data-scroll-target="#choosing-priors-using-push-forward-calibration">Choosing priors using push-forward calibration</a></li> <li><a href="#extending-the-analysis-to-the-baseballdatabank-data" id="toc-extending-the-analysis-to-the-baseballdatabank-data" class="nav-link" data-scroll-target="#extending-the-analysis-to-the-baseballdatabank-data">Extending the analysis to the baseballdatabank data</a></li> + <li><a href="#documenting-the-analysis" id="toc-documenting-the-analysis" class="nav-link" data-scroll-target="#documenting-the-analysis">Documenting the analysis</a></li> </ul> </nav> </div> @@ -3146,11 +3147,11 @@ <h1 class="title">Modelling at-bats in baseball using the generalised Pareto dis <p>Teddy Groves </p> </div> </div> - - - + + + </div> - + </header> @@ -3278,7 +3279,7 @@ <h1>Getting raw data</h1> </section> <section id="preparing-the-data" class="level1 page-columns page-full"> <h1>Preparing the data</h1> -<p>The first step in preparing data is to decide what prepared data looks like for the purposes of our analysis. Bibat provides dataclasses called <code>PreparedData</code> and <code>MeasurementsDF</code> to help get started with this, which I found in the file <code>baseball/prepared_data.py</code>.</p> +<p>The first step in preparing data is to decide what prepared data looks like for the purposes of our analysis. Bibat provides dataclasses called <code>PreparedData</code> and <code>MeasurementsDF</code> to help get started with this, which I found in the file <code>baseball/data_preparation.py</code>.</p> <div class="page-columns page-full"><p>As it happens, prepared data looks very similar in this analysis and the example. All I had to do was change the <code>MeasurementsDF</code> definition a little<a href="#fn1" class="footnote-ref" id="fnref1" role="doc-noteref"><sup>1</sup></a>:</p><div class="no-row-height column-margin column-container"><li id="fn1"><p><sup>1</sup> note that this class uses <a href="https://pandera.readthedocs.io">pandera</a>, a handy library for defining what a pandas dataframe should look like</p></li></div></div> <div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> MeasurementsDF(pa.SchemaModel):</span> <span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="co">"""A PreparedData should have a measurements dataframe like this.</span></span> @@ -3291,94 +3292,117 @@ <h1>Preparing the data</h1> <span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> n_attempt: Series[<span class="bu">int</span>] <span class="op">=</span> pa.Field(ge <span class="op">=</span> <span class="dv">1</span>)</span> <span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> n_success: Series[<span class="bu">int</span>] <span class="op">=</span> pa.Field(ge <span class="op">=</span> <span class="dv">0</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>The next step is to write functions that return <code>PreparedData</code> objects. In this case I wrote a couple of data preparation functions: <code>prepare_data_2006</code> and <code>prepare_data_bdb</code>:</p> -<div class="sourceCode" id="cb9" data-startfrom="111"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 110;"><span id="cb9-111"><a href="#cb9-111" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_2006(measurements_raw: pd.DataFrame) <span class="op">-></span> PreparedData:</span> -<span id="cb9-112"><a href="#cb9-112" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the 2006 data."""</span></span> -<span id="cb9-113"><a href="#cb9-113" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> measurements_raw.rename(</span> -<span id="cb9-114"><a href="#cb9-114" aria-hidden="true" tabindex="-1"></a> columns<span class="op">=</span>{<span class="st">"K"</span>: <span class="st">"n_attempt"</span>, <span class="st">"y"</span>: <span class="st">"n_success"</span>}</span> -<span id="cb9-115"><a href="#cb9-115" aria-hidden="true" tabindex="-1"></a> ).assign(</span> -<span id="cb9-116"><a href="#cb9-116" aria-hidden="true" tabindex="-1"></a> season<span class="op">=</span><span class="st">"2006"</span>,</span> -<span id="cb9-117"><a href="#cb9-117" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: [<span class="ss">f"2006-player-</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">"</span> <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="bu">len</span>(df))],</span> -<span id="cb9-118"><a href="#cb9-118" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-119"><a href="#cb9-119" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> -<span id="cb9-120"><a href="#cb9-120" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"2006"</span>,</span> -<span id="cb9-121"><a href="#cb9-121" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> -<span id="cb9-122"><a href="#cb9-122" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> -<span id="cb9-123"><a href="#cb9-123" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> -<span id="cb9-124"><a href="#cb9-124" aria-hidden="true" tabindex="-1"></a> },</span> -<span id="cb9-125"><a href="#cb9-125" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> -<span id="cb9-126"><a href="#cb9-126" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-127"><a href="#cb9-127" aria-hidden="true" tabindex="-1"></a></span> +<div class="sourceCode" id="cb9" data-startfrom="101"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 100;"><span id="cb9-101"><a href="#cb9-101" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-102"><a href="#cb9-102" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_2006(measurements_raw: pd.DataFrame) <span class="op">-></span> PreparedData:</span> +<span id="cb9-103"><a href="#cb9-103" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the 2006 data."""</span></span> +<span id="cb9-104"><a href="#cb9-104" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> measurements_raw.rename(</span> +<span id="cb9-105"><a href="#cb9-105" aria-hidden="true" tabindex="-1"></a> columns<span class="op">=</span>{<span class="st">"K"</span>: <span class="st">"n_attempt"</span>, <span class="st">"y"</span>: <span class="st">"n_success"</span>}</span> +<span id="cb9-106"><a href="#cb9-106" aria-hidden="true" tabindex="-1"></a> ).assign(</span> +<span id="cb9-107"><a href="#cb9-107" aria-hidden="true" tabindex="-1"></a> season<span class="op">=</span><span class="st">"2006"</span>,</span> +<span id="cb9-108"><a href="#cb9-108" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: [<span class="ss">f"2006-player-</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">"</span> <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="bu">len</span>(df))],</span> +<span id="cb9-109"><a href="#cb9-109" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-110"><a href="#cb9-110" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> +<span id="cb9-111"><a href="#cb9-111" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"2006"</span>,</span> +<span id="cb9-112"><a href="#cb9-112" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> +<span id="cb9-113"><a href="#cb9-113" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> +<span id="cb9-114"><a href="#cb9-114" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> +<span id="cb9-115"><a href="#cb9-115" aria-hidden="true" tabindex="-1"></a> },</span> +<span id="cb9-116"><a href="#cb9-116" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> +<span id="cb9-117"><a href="#cb9-117" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-118"><a href="#cb9-118" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-119"><a href="#cb9-119" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-120"><a href="#cb9-120" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_bdb(</span> +<span id="cb9-121"><a href="#cb9-121" aria-hidden="true" tabindex="-1"></a> measurements_main: pd.DataFrame,</span> +<span id="cb9-122"><a href="#cb9-122" aria-hidden="true" tabindex="-1"></a> measurements_post: pd.DataFrame,</span> +<span id="cb9-123"><a href="#cb9-123" aria-hidden="true" tabindex="-1"></a> appearances: pd.DataFrame,</span> +<span id="cb9-124"><a href="#cb9-124" aria-hidden="true" tabindex="-1"></a>) <span class="op">-></span> PreparedData:</span> +<span id="cb9-125"><a href="#cb9-125" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the baseballdatabank data.</span></span> +<span id="cb9-126"><a href="#cb9-126" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-127"><a href="#cb9-127" aria-hidden="true" tabindex="-1"></a><span class="co"> There are a few substantive data choices here.</span></span> <span id="cb9-128"><a href="#cb9-128" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-129"><a href="#cb9-129" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_bdb(</span> -<span id="cb9-130"><a href="#cb9-130" aria-hidden="true" tabindex="-1"></a> measurements_main: pd.DataFrame,</span> -<span id="cb9-131"><a href="#cb9-131" aria-hidden="true" tabindex="-1"></a> measurements_post: pd.DataFrame,</span> -<span id="cb9-132"><a href="#cb9-132" aria-hidden="true" tabindex="-1"></a> appearances: pd.DataFrame,</span> -<span id="cb9-133"><a href="#cb9-133" aria-hidden="true" tabindex="-1"></a>) <span class="op">-></span> PreparedData:</span> -<span id="cb9-134"><a href="#cb9-134" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the baseballdatabank data.</span></span> -<span id="cb9-135"><a href="#cb9-135" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-136"><a href="#cb9-136" aria-hidden="true" tabindex="-1"></a><span class="co"> There are a few substantive data choices here.</span></span> -<span id="cb9-137"><a href="#cb9-137" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-138"><a href="#cb9-138" aria-hidden="true" tabindex="-1"></a><span class="co"> First, the function excludes players who have a '1' in their position as</span></span> -<span id="cb9-139"><a href="#cb9-139" aria-hidden="true" tabindex="-1"></a><span class="co"> these are likely pitchers, as well as players with fewer than 20 at bats.</span></span> -<span id="cb9-140"><a href="#cb9-140" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-141"><a href="#cb9-141" aria-hidden="true" tabindex="-1"></a><span class="co"> Second, the function defines a successes and attempts according to the</span></span> -<span id="cb9-142"><a href="#cb9-142" aria-hidden="true" tabindex="-1"></a><span class="co"> 'on-base percentage' metric, so a success is a time when a player got a hit,</span></span> -<span id="cb9-143"><a href="#cb9-143" aria-hidden="true" tabindex="-1"></a><span class="co"> a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a</span></span> -<span id="cb9-144"><a href="#cb9-144" aria-hidden="true" tabindex="-1"></a><span class="co"> base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have</span></span> -<span id="cb9-145"><a href="#cb9-145" aria-hidden="true" tabindex="-1"></a><span class="co"> alternatively been calculated as just hits divided by at-bats, but my</span></span> -<span id="cb9-146"><a href="#cb9-146" aria-hidden="true" tabindex="-1"></a><span class="co"> understanding is that this method underrates players who are good at getting</span></span> -<span id="cb9-147"><a href="#cb9-147" aria-hidden="true" tabindex="-1"></a><span class="co"> walks.</span></span> -<span id="cb9-148"><a href="#cb9-148" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-149"><a href="#cb9-149" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> -<span id="cb9-150"><a href="#cb9-150" aria-hidden="true" tabindex="-1"></a> pitchers <span class="op">=</span> appearances.loc[</span> -<span id="cb9-151"><a href="#cb9-151" aria-hidden="true" tabindex="-1"></a> <span class="kw">lambda</span> df: df[<span class="st">"G_p"</span>] <span class="op">==</span> df[<span class="st">"G_all"</span>], <span class="st">"playerID"</span></span> -<span id="cb9-152"><a href="#cb9-152" aria-hidden="true" tabindex="-1"></a> ].unique()</span> -<span id="cb9-153"><a href="#cb9-153" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-154"><a href="#cb9-154" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> filter_batters(df: pd.DataFrame):</span> -<span id="cb9-155"><a href="#cb9-155" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> (</span> -<span id="cb9-156"><a href="#cb9-156" aria-hidden="true" tabindex="-1"></a> (df[<span class="st">"AB"</span>] <span class="op">>=</span> <span class="dv">20</span>)</span> -<span id="cb9-157"><a href="#cb9-157" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (df[<span class="st">"season"</span>].ge(<span class="dv">2017</span>))</span> -<span id="cb9-158"><a href="#cb9-158" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (<span class="op">~</span>df[<span class="st">"player"</span>].isin(pitchers))</span> -<span id="cb9-159"><a href="#cb9-159" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-160"><a href="#cb9-160" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-161"><a href="#cb9-161" aria-hidden="true" tabindex="-1"></a> measurements_main, measurements_post <span class="op">=</span> (</span> -<span id="cb9-162"><a href="#cb9-162" aria-hidden="true" tabindex="-1"></a> m.rename(columns<span class="op">=</span>{<span class="st">"yearID"</span>: <span class="st">"season"</span>, <span class="st">"playerID"</span>: <span class="st">"player"</span>})</span> -<span id="cb9-163"><a href="#cb9-163" aria-hidden="true" tabindex="-1"></a> .assign(</span> -<span id="cb9-164"><a href="#cb9-164" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: df[<span class="st">"player"</span>].<span class="bu">str</span>.cat(</span> -<span id="cb9-165"><a href="#cb9-165" aria-hidden="true" tabindex="-1"></a> df[<span class="st">"season"</span>].astype(<span class="bu">str</span>)</span> -<span id="cb9-166"><a href="#cb9-166" aria-hidden="true" tabindex="-1"></a> ),</span> -<span id="cb9-167"><a href="#cb9-167" aria-hidden="true" tabindex="-1"></a> n_attempt<span class="op">=</span><span class="kw">lambda</span> df: df[[<span class="st">"AB"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>, <span class="st">"SF"</span>]]</span> -<span id="cb9-168"><a href="#cb9-168" aria-hidden="true" tabindex="-1"></a> .fillna(<span class="dv">0</span>)</span> -<span id="cb9-169"><a href="#cb9-169" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>)</span> -<span id="cb9-170"><a href="#cb9-170" aria-hidden="true" tabindex="-1"></a> .astype(<span class="bu">int</span>),</span> -<span id="cb9-171"><a href="#cb9-171" aria-hidden="true" tabindex="-1"></a> n_success<span class="op">=</span><span class="kw">lambda</span> df: (</span> -<span id="cb9-172"><a href="#cb9-172" aria-hidden="true" tabindex="-1"></a> df[[<span class="st">"H"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>]].fillna(<span class="dv">0</span>).<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>).astype(<span class="bu">int</span>)</span> -<span id="cb9-173"><a href="#cb9-173" aria-hidden="true" tabindex="-1"></a> ),</span> -<span id="cb9-174"><a href="#cb9-174" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-175"><a href="#cb9-175" aria-hidden="true" tabindex="-1"></a> .loc[</span> -<span id="cb9-176"><a href="#cb9-176" aria-hidden="true" tabindex="-1"></a> filter_batters,</span> -<span id="cb9-177"><a href="#cb9-177" aria-hidden="true" tabindex="-1"></a> [<span class="st">"player_season"</span>, <span class="st">"season"</span>, <span class="st">"n_attempt"</span>, <span class="st">"n_success"</span>],</span> -<span id="cb9-178"><a href="#cb9-178" aria-hidden="true" tabindex="-1"></a> ]</span> -<span id="cb9-179"><a href="#cb9-179" aria-hidden="true" tabindex="-1"></a> .copy()</span> -<span id="cb9-180"><a href="#cb9-180" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> m <span class="kw">in</span> [measurements_main, measurements_post]</span> -<span id="cb9-181"><a href="#cb9-181" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-182"><a href="#cb9-182" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> (</span> -<span id="cb9-183"><a href="#cb9-183" aria-hidden="true" tabindex="-1"></a> pd.concat([measurements_main, measurements_post])</span> -<span id="cb9-184"><a href="#cb9-184" aria-hidden="true" tabindex="-1"></a> .groupby([<span class="st">"player_season"</span>, <span class="st">"season"</span>])</span> -<span id="cb9-185"><a href="#cb9-185" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>()</span> -<span id="cb9-186"><a href="#cb9-186" aria-hidden="true" tabindex="-1"></a> .reset_index()</span> -<span id="cb9-187"><a href="#cb9-187" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-188"><a href="#cb9-188" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> -<span id="cb9-189"><a href="#cb9-189" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"bdb"</span>,</span> -<span id="cb9-190"><a href="#cb9-190" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> -<span id="cb9-191"><a href="#cb9-191" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> -<span id="cb9-192"><a href="#cb9-192" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> -<span id="cb9-193"><a href="#cb9-193" aria-hidden="true" tabindex="-1"></a> },</span> -<span id="cb9-194"><a href="#cb9-194" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> -<span id="cb9-195"><a href="#cb9-195" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<span id="cb9-129"><a href="#cb9-129" aria-hidden="true" tabindex="-1"></a><span class="co"> First, the function excludes players who have a '1' in their position as</span></span> +<span id="cb9-130"><a href="#cb9-130" aria-hidden="true" tabindex="-1"></a><span class="co"> these are likely pitchers, as well as players with fewer than 20 at bats.</span></span> +<span id="cb9-131"><a href="#cb9-131" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-132"><a href="#cb9-132" aria-hidden="true" tabindex="-1"></a><span class="co"> Second, the function defines a successes and attempts according to the</span></span> +<span id="cb9-133"><a href="#cb9-133" aria-hidden="true" tabindex="-1"></a><span class="co"> 'on-base percentage' metric, so a success is a time when a player got a hit,</span></span> +<span id="cb9-134"><a href="#cb9-134" aria-hidden="true" tabindex="-1"></a><span class="co"> a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a</span></span> +<span id="cb9-135"><a href="#cb9-135" aria-hidden="true" tabindex="-1"></a><span class="co"> base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have</span></span> +<span id="cb9-136"><a href="#cb9-136" aria-hidden="true" tabindex="-1"></a><span class="co"> alternatively been calculated as just hits divided by at-bats, but my</span></span> +<span id="cb9-137"><a href="#cb9-137" aria-hidden="true" tabindex="-1"></a><span class="co"> understanding is that this method underrates players who are good at getting</span></span> +<span id="cb9-138"><a href="#cb9-138" aria-hidden="true" tabindex="-1"></a><span class="co"> walks.</span></span> +<span id="cb9-139"><a href="#cb9-139" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-140"><a href="#cb9-140" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> +<span id="cb9-141"><a href="#cb9-141" aria-hidden="true" tabindex="-1"></a> pitchers <span class="op">=</span> appearances.loc[</span> +<span id="cb9-142"><a href="#cb9-142" aria-hidden="true" tabindex="-1"></a> <span class="kw">lambda</span> df: df[<span class="st">"G_p"</span>] <span class="op">==</span> df[<span class="st">"G_all"</span>], <span class="st">"playerID"</span></span> +<span id="cb9-143"><a href="#cb9-143" aria-hidden="true" tabindex="-1"></a> ].unique()</span> +<span id="cb9-144"><a href="#cb9-144" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-145"><a href="#cb9-145" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> filter_batters(df: pd.DataFrame):</span> +<span id="cb9-146"><a href="#cb9-146" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> (</span> +<span id="cb9-147"><a href="#cb9-147" aria-hidden="true" tabindex="-1"></a> (df[<span class="st">"AB"</span>] <span class="op">>=</span> <span class="dv">20</span>)</span> +<span id="cb9-148"><a href="#cb9-148" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (df[<span class="st">"season"</span>].ge(<span class="dv">2017</span>))</span> +<span id="cb9-149"><a href="#cb9-149" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (<span class="op">~</span>df[<span class="st">"player"</span>].isin(pitchers))</span> +<span id="cb9-150"><a href="#cb9-150" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-151"><a href="#cb9-151" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-152"><a href="#cb9-152" aria-hidden="true" tabindex="-1"></a> measurements_main, measurements_post <span class="op">=</span> (</span> +<span id="cb9-153"><a href="#cb9-153" aria-hidden="true" tabindex="-1"></a> m.rename(columns<span class="op">=</span>{<span class="st">"yearID"</span>: <span class="st">"season"</span>, <span class="st">"playerID"</span>: <span class="st">"player"</span>})</span> +<span id="cb9-154"><a href="#cb9-154" aria-hidden="true" tabindex="-1"></a> .assign(</span> +<span id="cb9-155"><a href="#cb9-155" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: df[<span class="st">"player"</span>].<span class="bu">str</span>.cat(</span> +<span id="cb9-156"><a href="#cb9-156" aria-hidden="true" tabindex="-1"></a> df[<span class="st">"season"</span>].astype(<span class="bu">str</span>)</span> +<span id="cb9-157"><a href="#cb9-157" aria-hidden="true" tabindex="-1"></a> ),</span> +<span id="cb9-158"><a href="#cb9-158" aria-hidden="true" tabindex="-1"></a> n_attempt<span class="op">=</span><span class="kw">lambda</span> df: df[[<span class="st">"AB"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>, <span class="st">"SF"</span>]]</span> +<span id="cb9-159"><a href="#cb9-159" aria-hidden="true" tabindex="-1"></a> .fillna(<span class="dv">0</span>)</span> +<span id="cb9-160"><a href="#cb9-160" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>)</span> +<span id="cb9-161"><a href="#cb9-161" aria-hidden="true" tabindex="-1"></a> .astype(<span class="bu">int</span>),</span> +<span id="cb9-162"><a href="#cb9-162" aria-hidden="true" tabindex="-1"></a> n_success<span class="op">=</span><span class="kw">lambda</span> df: (</span> +<span id="cb9-163"><a href="#cb9-163" aria-hidden="true" tabindex="-1"></a> df[[<span class="st">"H"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>]].fillna(<span class="dv">0</span>).<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>).astype(<span class="bu">int</span>)</span> +<span id="cb9-164"><a href="#cb9-164" aria-hidden="true" tabindex="-1"></a> ),</span> +<span id="cb9-165"><a href="#cb9-165" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-166"><a href="#cb9-166" aria-hidden="true" tabindex="-1"></a> .loc[</span> +<span id="cb9-167"><a href="#cb9-167" aria-hidden="true" tabindex="-1"></a> filter_batters,</span> +<span id="cb9-168"><a href="#cb9-168" aria-hidden="true" tabindex="-1"></a> [<span class="st">"player_season"</span>, <span class="st">"season"</span>, <span class="st">"n_attempt"</span>, <span class="st">"n_success"</span>],</span> +<span id="cb9-169"><a href="#cb9-169" aria-hidden="true" tabindex="-1"></a> ]</span> +<span id="cb9-170"><a href="#cb9-170" aria-hidden="true" tabindex="-1"></a> .copy()</span> +<span id="cb9-171"><a href="#cb9-171" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> m <span class="kw">in</span> [measurements_main, measurements_post]</span> +<span id="cb9-172"><a href="#cb9-172" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-173"><a href="#cb9-173" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> (</span> +<span id="cb9-174"><a href="#cb9-174" aria-hidden="true" tabindex="-1"></a> pd.concat([measurements_main, measurements_post])</span> +<span id="cb9-175"><a href="#cb9-175" aria-hidden="true" tabindex="-1"></a> .groupby([<span class="st">"player_season"</span>, <span class="st">"season"</span>])</span> +<span id="cb9-176"><a href="#cb9-176" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>()</span> +<span id="cb9-177"><a href="#cb9-177" aria-hidden="true" tabindex="-1"></a> .reset_index()</span> +<span id="cb9-178"><a href="#cb9-178" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-179"><a href="#cb9-179" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> +<span id="cb9-180"><a href="#cb9-180" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"bdb"</span>,</span> +<span id="cb9-181"><a href="#cb9-181" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> +<span id="cb9-182"><a href="#cb9-182" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> +<span id="cb9-183"><a href="#cb9-183" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> +<span id="cb9-184"><a href="#cb9-184" aria-hidden="true" tabindex="-1"></a> },</span> +<span id="cb9-185"><a href="#cb9-185" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> +<span id="cb9-186"><a href="#cb9-186" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>I had to update the <code>prepare_data</code> function to take into account the two raw data sources:</p> +<div class="sourceCode" id="cb10" data-startfrom="33"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 32;"><span id="cb10-33"><a href="#cb10-33" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb10-34"><a href="#cb10-34" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data():</span> +<span id="cb10-35"><a href="#cb10-35" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Run main function."""</span></span> +<span id="cb10-36"><a href="#cb10-36" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="st">"Reading raw data..."</span>)</span> +<span id="cb10-37"><a href="#cb10-37" aria-hidden="true" tabindex="-1"></a> raw_data <span class="op">=</span> {</span> +<span id="cb10-38"><a href="#cb10-38" aria-hidden="true" tabindex="-1"></a> k: [pd.read_csv(<span class="bu">file</span>, index_col<span class="op">=</span><span class="va">None</span>) <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> v]</span> +<span id="cb10-39"><a href="#cb10-39" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> k, v <span class="kw">in</span> RAW_DATA_FILES.items()</span> +<span id="cb10-40"><a href="#cb10-40" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb10-41"><a href="#cb10-41" aria-hidden="true" tabindex="-1"></a> data_preparation_functions_to_run <span class="op">=</span> {</span> +<span id="cb10-42"><a href="#cb10-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"2006"</span>: prepare_data_2006,</span> +<span id="cb10-43"><a href="#cb10-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"bdb"</span>: prepare_data_bdb,</span> +<span id="cb10-44"><a href="#cb10-44" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb10-45"><a href="#cb10-45" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="st">"Preparing data..."</span>)</span> +<span id="cb10-46"><a href="#cb10-46" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> name, dpf <span class="kw">in</span> data_preparation_functions_to_run.items():</span> +<span id="cb10-47"><a href="#cb10-47" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="ss">f"Running data preparation function </span><span class="sc">{</span>dpf<span class="sc">.</span><span class="va">__name__</span><span class="sc">}</span><span class="ss">..."</span>)</span> +<span id="cb10-48"><a href="#cb10-48" aria-hidden="true" tabindex="-1"></a> prepared_data <span class="op">=</span> dpf(<span class="op">*</span>raw_data[name])</span> +<span id="cb10-49"><a href="#cb10-49" aria-hidden="true" tabindex="-1"></a> output_dir <span class="op">=</span> os.path.join(PREPARED_DIR, prepared_data.name)</span> +<span id="cb10-50"><a href="#cb10-50" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="ss">f"</span><span class="ch">\t</span><span class="ss">writing files to </span><span class="sc">{</span>output_dir<span class="sc">}</span><span class="ss">"</span>)</span> +<span id="cb10-51"><a href="#cb10-51" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="kw">not</span> os.path.exists(PREPARED_DIR):</span> +<span id="cb10-52"><a href="#cb10-52" aria-hidden="true" tabindex="-1"></a> os.mkdir(PREPARED_DIR)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>To finish off I deleted the unused global variables <code>MEASUREMENTS_FILE</code>, <code>NEW_COLNAMES</code>, <code>DROPNA_COLS</code> and <code>DIMS</code>, then checked if the function <code>load_prepared_data</code> needed any changes: I was pretty sure it didn’t.</p> <div class="page-columns page-full"><p>To check that all this worked, I ran the data preparation script manually:<a href="#fn2" class="footnote-ref" id="fnref2" role="doc-noteref"><sup>2</sup></a></p><div class="no-row-height column-margin column-container"><li id="fn2"><p><sup>2</sup> I could also have just run <code>make analysis</code> again</p></li></div></div> -<div class="sourceCode" id="cb10"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="op">></span> source <span class="ex">.venv/bin/activate</span></span> -<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">baseball</span><span class="kw">)</span> <span class="op">></span> python <span class="ex">baseball/prepare_data.py</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb11"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="op">></span> source <span class="ex">.venv/bin/activate</span></span> +<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">baseball</span><span class="kw">)</span> <span class="op">></span> python <span class="ex">baseball/prepare_data.py</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Now the folder <code>data/prepared/bdb</code> contained a file <code>data/prepared/bdb/measurements.csv</code> that looked like this:</p> <pre class="{csv}"><code>,player_season,season,n_attempt,n_success 0,abreujo02,2018,553,180 @@ -3406,213 +3430,191 @@ <h1>Specifying statistical models</h1> \end{align*}\]</span></p> <p>Again I would choose priors for the hyperparameters that put most of the alphas between 0.1 and 0.4.</p> <p>To implement these models using Stan I first added the following function to the file <code>custom_functions.stan</code>. This was simply copied from <a href="https://mc-stan.org/users/documentation/case-studies/gpareto_functions.html#conclusion-on-the-data-analysis">the relevant part of the geomagnetic storms case study</a>.</p> -<div class="sourceCode" id="cb12"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="dt">real</span> gpareto_lpdf(<span class="dt">vector</span> y, <span class="dt">real</span> ymin, <span class="dt">real</span> k, <span class="dt">real</span> sigma) {</span> -<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> <span class="co">// generalised Pareto log pdf </span></span> -<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> N = rows(y);</span> -<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> inv_k = inv(k);</span> -<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (k<<span class="dv">0</span> && max(y-ymin)/sigma > -inv_k)</span> -<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"k<0 and max(y-ymin)/sigma > -1/k; found k, sigma ="</span>, k, <span class="st">", "</span>, sigma);</span> -<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (sigma<=<span class="dv">0</span>)</span> -<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"sigma<=0; found sigma ="</span>, sigma);</span> -<span id="cb12-9"><a href="#cb12-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (fabs(k) > <span class="fl">1e-15</span>)</span> -<span id="cb12-10"><a href="#cb12-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -(<span class="dv">1</span>+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);</span> -<span id="cb12-11"><a href="#cb12-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> -<span id="cb12-12"><a href="#cb12-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -sum(y-ymin)/sigma -N*log(sigma); <span class="co">// limit k->0</span></span> -<span id="cb12-13"><a href="#cb12-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb13"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="dt">real</span> gpareto_lpdf(<span class="dt">vector</span> y, <span class="dt">real</span> ymin, <span class="dt">real</span> k, <span class="dt">real</span> sigma) {</span> +<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="co">// generalised Pareto log pdf </span></span> +<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> N = rows(y);</span> +<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> inv_k = inv(k);</span> +<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (k<<span class="dv">0</span> && max(y-ymin)/sigma > -inv_k)</span> +<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"k<0 and max(y-ymin)/sigma > -1/k; found k, sigma ="</span>, k, <span class="st">", "</span>, sigma);</span> +<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (sigma<=<span class="dv">0</span>)</span> +<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"sigma<=0; found sigma ="</span>, sigma);</span> +<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (fabs(k) > <span class="fl">1e-15</span>)</span> +<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -(<span class="dv">1</span>+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);</span> +<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> +<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -sum(y-ymin)/sigma -N*log(sigma); <span class="co">// limit k->0</span></span> +<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Next I wrote a file <code>gpareto.stan</code>:</p> -<div class="sourceCode" id="cb13"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="kw">functions</span> {</span> -<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">#include custom_functions.stan</span></span> -<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> -<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> -<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> -<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> -<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> min_alpha; <span class="co">// noone worse than this would be in the dataset</span></span> -<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> max_alpha;</span> -<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_sigma;</span> -<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_k;</span> -<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> -<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> -<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="fl">0.001</span>> sigma; <span class="co">// scale parameter of generalised pareto distribution</span></span> -<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=-sigma/(max_alpha-min_alpha)> k; <span class="co">// shape parameter of generalised pareto distribution</span></span> -<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span><<span class="kw">lower</span>=min_alpha,<span class="kw">upper</span>=max_alpha>[N] alpha; <span class="co">// success log-odds</span></span> -<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> -<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a> sigma ~ normal(prior_sigma[<span class="dv">1</span>], prior_sigma[<span class="dv">2</span>]);</span> -<span id="cb13-21"><a href="#cb13-21" aria-hidden="true" tabindex="-1"></a> k ~ normal(prior_k[<span class="dv">1</span>], prior_k[<span class="dv">2</span>]);</span> -<span id="cb13-22"><a href="#cb13-22" aria-hidden="true" tabindex="-1"></a> alpha ~ gpareto(min_alpha, k, sigma);</span> -<span id="cb13-23"><a href="#cb13-23" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> -<span id="cb13-24"><a href="#cb13-24" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, alpha);</span> -<span id="cb13-25"><a href="#cb13-25" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb13-26"><a href="#cb13-26" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-27"><a href="#cb13-27" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> -<span id="cb13-28"><a href="#cb13-28" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> -<span id="cb13-29"><a href="#cb13-29" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> -<span id="cb13-30"><a href="#cb13-30" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> -<span id="cb13-31"><a href="#cb13-31" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> -<span id="cb13-32"><a href="#cb13-32" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> -<span id="cb13-33"><a href="#cb13-33" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb13-34"><a href="#cb13-34" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Finally I wrote a file <code>normal.stan</code>:</p> -<div class="sourceCode" id="cb14"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> -<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> -<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> -<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> -<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_mu;</span> -<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_tau;</span> -<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_b_K;</span> -<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> -<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="kw">transformed data</span> {</span> -<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K = log(to_vector(K));</span> -<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);</span> +<div class="sourceCode" id="cb14"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="kw">functions</span> {</span> +<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="co">#include custom_functions.stan</span></span> +<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> +<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> +<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> +<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> +<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> min_alpha; <span class="co">// noone worse than this would be in the dataset</span></span> +<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> max_alpha;</span> +<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_sigma;</span> +<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_k;</span> +<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> <span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a>}</span> <span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> -<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> mu; <span class="co">// population mean of success log-odds</span></span> -<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="dv">0</span>> tau; <span class="co">// population sd of success log-odds</span></span> -<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> b_K;</span> -<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha_std; <span class="co">// success log-odds (standardized)</span></span> -<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> -<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a> b_K ~ normal(prior_b_K[<span class="dv">1</span>], prior_b_K[<span class="dv">2</span>]);</span> -<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a> mu ~ normal(prior_mu[<span class="dv">1</span>], prior_mu[<span class="dv">2</span>]);</span> -<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a> tau ~ normal(prior_tau[<span class="dv">1</span>], prior_tau[<span class="dv">2</span>]);</span> -<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a> alpha_std ~ normal(<span class="dv">0</span>, <span class="dv">1</span>);</span> -<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> -<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);</span> -<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> -<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha = mu + b_K * log_K_std + tau * alpha_std;</span> -<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> -<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> -<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> -<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> -<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> -<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb14-37"><a href="#cb14-37" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="fl">0.001</span>> sigma; <span class="co">// scale parameter of generalised pareto distribution</span></span> +<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=-sigma/(max_alpha-min_alpha)> k; <span class="co">// shape parameter of generalised pareto distribution</span></span> +<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span><<span class="kw">lower</span>=min_alpha,<span class="kw">upper</span>=max_alpha>[N] alpha; <span class="co">// success log-odds</span></span> +<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> +<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a> sigma ~ normal(prior_sigma[<span class="dv">1</span>], prior_sigma[<span class="dv">2</span>]);</span> +<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a> k ~ normal(prior_k[<span class="dv">1</span>], prior_k[<span class="dv">2</span>]);</span> +<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a> alpha ~ gpareto(min_alpha, k, sigma);</span> +<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> +<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, alpha);</span> +<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> +<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> +<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> +<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> +<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> +<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> +<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Finally I wrote a file <code>normal.stan</code>:</p> +<div class="sourceCode" id="cb15"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> +<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> +<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> +<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> +<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_mu;</span> +<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_tau;</span> +<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_b_K;</span> +<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> +<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="kw">transformed data</span> {</span> +<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K = log(to_vector(K));</span> +<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);</span> +<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> +<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> mu; <span class="co">// population mean of success log-odds</span></span> +<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="dv">0</span>> tau; <span class="co">// population sd of success log-odds</span></span> +<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> b_K;</span> +<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha_std; <span class="co">// success log-odds (standardized)</span></span> +<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> +<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a> b_K ~ normal(prior_b_K[<span class="dv">1</span>], prior_b_K[<span class="dv">2</span>]);</span> +<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a> mu ~ normal(prior_mu[<span class="dv">1</span>], prior_mu[<span class="dv">2</span>]);</span> +<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a> tau ~ normal(prior_tau[<span class="dv">1</span>], prior_tau[<span class="dv">2</span>]);</span> +<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a> alpha_std ~ normal(<span class="dv">0</span>, <span class="dv">1</span>);</span> +<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> +<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);</span> +<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb15-28"><a href="#cb15-28" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-29"><a href="#cb15-29" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> +<span id="cb15-30"><a href="#cb15-30" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha = mu + b_K * log_K_std + tau * alpha_std;</span> +<span id="cb15-31"><a href="#cb15-31" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> +<span id="cb15-32"><a href="#cb15-32" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> +<span id="cb15-33"><a href="#cb15-33" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> +<span id="cb15-34"><a href="#cb15-34" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> +<span id="cb15-35"><a href="#cb15-35" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> +<span id="cb15-36"><a href="#cb15-36" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb15-37"><a href="#cb15-37" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> -<section id="generating-stan-inputs" class="level1"> +<section id="generating-stan-inputs" class="level1 page-columns page-full"> <h1>Generating Stan inputs</h1> <p>Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file <code>baseball/stan_input_functions.py</code>, each of which takes in a <code>PreparedData</code> and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.</p> -<p>I started by defining some Stan input functions that pass arbitary prepared data on to each of the models:</p> -<div class="sourceCode" id="cb15" data-startfrom="11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 10;"><span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> -<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> -<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> -<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> -<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> -<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_mu"</span>: [logit(<span class="fl">0.25</span>), <span class="fl">0.2</span>],</span> -<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_tau"</span>: [<span class="fl">0.2</span>, <span class="fl">0.1</span>],</span> -<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_b_K"</span>: [<span class="dv">0</span>, <span class="fl">0.03</span>],</span> -<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> -<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> -<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> -<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> -<span id="cb15-28"><a href="#cb15-28" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> -<span id="cb15-29"><a href="#cb15-29" aria-hidden="true" tabindex="-1"></a> <span class="st">"min_alpha"</span>: logit(<span class="fl">0.07</span>),</span> -<span id="cb15-30"><a href="#cb15-30" aria-hidden="true" tabindex="-1"></a> <span class="st">"max_alpha"</span>: logit(<span class="fl">0.5</span>),</span> -<span id="cb15-31"><a href="#cb15-31" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_sigma"</span>: [<span class="fl">1.5</span>, <span class="fl">0.4</span>],</span> -<span id="cb15-32"><a href="#cb15-32" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_k"</span>: [<span class="op">-</span><span class="fl">0.5</span>, <span class="dv">1</span>],</span> -<span id="cb15-33"><a href="#cb15-33" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb15-34"><a href="#cb15-34" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="page-columns page-full"><p>I started by defining some Stan input functions that pass arbitary prepared data on to each of the models:<a href="#fn3" class="footnote-ref" id="fnref3" role="doc-noteref"><sup>3</sup></a></p><div class="no-row-height column-margin column-container"><li id="fn3"><p><sup>3</sup> Note that this code uses the scipy function <code>logit</code>, which it imported like this: <code>from scipy.special import logit</code></p></li></div></div> +<div class="sourceCode" id="cb16" data-startfrom="11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 10;"><span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> +<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> +<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> +<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> +<span id="cb16-16"><a href="#cb16-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> +<span id="cb16-17"><a href="#cb16-17" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_mu"</span>: [logit(<span class="fl">0.25</span>), <span class="fl">0.2</span>],</span> +<span id="cb16-18"><a href="#cb16-18" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_tau"</span>: [<span class="fl">0.2</span>, <span class="fl">0.1</span>],</span> +<span id="cb16-19"><a href="#cb16-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_b_K"</span>: [<span class="dv">0</span>, <span class="fl">0.03</span>],</span> +<span id="cb16-20"><a href="#cb16-20" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb16-21"><a href="#cb16-21" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb16-22"><a href="#cb16-22" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb16-23"><a href="#cb16-23" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb16-24"><a href="#cb16-24" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> +<span id="cb16-25"><a href="#cb16-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> +<span id="cb16-26"><a href="#cb16-26" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> +<span id="cb16-27"><a href="#cb16-27" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> +<span id="cb16-28"><a href="#cb16-28" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> +<span id="cb16-29"><a href="#cb16-29" aria-hidden="true" tabindex="-1"></a> <span class="st">"min_alpha"</span>: logit(<span class="fl">0.07</span>),</span> +<span id="cb16-30"><a href="#cb16-30" aria-hidden="true" tabindex="-1"></a> <span class="st">"max_alpha"</span>: logit(<span class="fl">0.5</span>),</span> +<span id="cb16-31"><a href="#cb16-31" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_sigma"</span>: [<span class="fl">1.5</span>, <span class="fl">0.4</span>],</span> +<span id="cb16-32"><a href="#cb16-32" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_k"</span>: [<span class="op">-</span><span class="fl">0.5</span>, <span class="dv">1</span>],</span> +<span id="cb16-33"><a href="#cb16-33" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb16-34"><a href="#cb16-34" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>But why stop there? It can also be useful to generate Stan input data from a model with a model, by running it in simulation model with hardcoded parameter values. Here are some functions that do this for both of our models:</p> -<div class="sourceCode" id="cb16" data-startfrom="36"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 35;"><span id="cb16-36"><a href="#cb16-36" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb16-37"><a href="#cb16-37" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the normal model."""</span></span> -<span id="cb16-38"><a href="#cb16-38" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> -<span id="cb16-39"><a href="#cb16-39" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> -<span id="cb16-40"><a href="#cb16-40" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {</span> -<span id="cb16-41"><a href="#cb16-41" aria-hidden="true" tabindex="-1"></a> <span class="st">"mu"</span>: logit(<span class="fl">0.25</span>),</span> -<span id="cb16-42"><a href="#cb16-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"tau"</span>: <span class="fl">0.18</span>, <span class="co"># 2sds is 0.19 to 0.32 batting average</span></span> -<span id="cb16-43"><a href="#cb16-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"b_K"</span>: <span class="fl">0.04</span>, <span class="co"># slight effect of more attempts</span></span> -<span id="cb16-44"><a href="#cb16-44" aria-hidden="true" tabindex="-1"></a> <span class="st">"alpha_std"</span>: rng.random.normal(loc<span class="op">=</span><span class="dv">0</span>, scale<span class="op">=</span><span class="dv">1</span>, size<span class="op">=</span>N),</span> -<span id="cb16-45"><a href="#cb16-45" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb16-46"><a href="#cb16-46" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> -<span id="cb16-47"><a href="#cb16-47" aria-hidden="true" tabindex="-1"></a> log_K_std <span class="op">=</span> (np.log(K) <span class="op">-</span> np.log(K).mean()) <span class="op">/</span> np.log(K).std()</span> -<span id="cb16-48"><a href="#cb16-48" aria-hidden="true" tabindex="-1"></a> alpha <span class="op">=</span> (</span> -<span id="cb16-49"><a href="#cb16-49" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"mu"</span>]</span> -<span id="cb16-50"><a href="#cb16-50" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"b_K"</span>] <span class="op">*</span> log_K_std</span> -<span id="cb16-51"><a href="#cb16-51" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"tau"</span>] <span class="op">*</span> true_param_values[<span class="st">"alpha_std"</span>]</span> -<span id="cb16-52"><a href="#cb16-52" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb16-53"><a href="#cb16-53" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.random.binomial(K, expit(alpha))</span> -<span id="cb16-54"><a href="#cb16-54" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y} <span class="op">|</span> true_param_values</span> -<span id="cb16-55"><a href="#cb16-55" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-56"><a href="#cb16-56" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-57"><a href="#cb16-57" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb16-58"><a href="#cb16-58" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the gpareto model."""</span></span> -<span id="cb16-59"><a href="#cb16-59" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> -<span id="cb16-60"><a href="#cb16-60" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> -<span id="cb16-61"><a href="#cb16-61" aria-hidden="true" tabindex="-1"></a> min_alpha <span class="op">=</span> <span class="fl">0.1</span></span> -<span id="cb16-62"><a href="#cb16-62" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> -<span id="cb16-63"><a href="#cb16-63" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {<span class="st">"sigma"</span>: <span class="op">-</span><span class="fl">1.098</span>, <span class="st">"k"</span>: <span class="fl">0.18</span>}</span> -<span id="cb16-64"><a href="#cb16-64" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"alpha"</span>] <span class="op">=</span> gpareto_rvs(</span> -<span id="cb16-65"><a href="#cb16-65" aria-hidden="true" tabindex="-1"></a> rng,</span> -<span id="cb16-66"><a href="#cb16-66" aria-hidden="true" tabindex="-1"></a> N,</span> -<span id="cb16-67"><a href="#cb16-67" aria-hidden="true" tabindex="-1"></a> min_alpha,</span> -<span id="cb16-68"><a href="#cb16-68" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"k"</span>],</span> -<span id="cb16-69"><a href="#cb16-69" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"sigma"</span>],</span> -<span id="cb16-70"><a href="#cb16-70" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb16-71"><a href="#cb16-71" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.binomial(K, expit(true_param_values[<span class="st">"alpha"</span>]))</span> -<span id="cb16-72"><a href="#cb16-72" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y, <span class="st">"min_alpha"</span>: min_alpha} <span class="op">|</span> true_param_values</span> -<span id="cb16-73"><a href="#cb16-73" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-74"><a href="#cb16-74" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-75"><a href="#cb16-75" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gpareto_rvs(</span> -<span id="cb16-76"><a href="#cb16-76" aria-hidden="true" tabindex="-1"></a> rng: np.random.Generator, size: <span class="bu">int</span>, mu: <span class="bu">float</span>, k: <span class="bu">float</span>, sigma: <span class="bu">float</span></span> -<span id="cb16-77"><a href="#cb16-77" aria-hidden="true" tabindex="-1"></a>):</span> -<span id="cb16-78"><a href="#cb16-78" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate random numbers from a generalised pareto distribution.</span></span> -<span id="cb16-79"><a href="#cb16-79" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-80"><a href="#cb16-80" aria-hidden="true" tabindex="-1"></a><span class="co"> See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for</span></span> -<span id="cb16-81"><a href="#cb16-81" aria-hidden="true" tabindex="-1"></a><span class="co"> source.</span></span> -<span id="cb16-82"><a href="#cb16-82" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-83"><a href="#cb16-83" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> -<span id="cb16-84"><a href="#cb16-84" aria-hidden="true" tabindex="-1"></a> U <span class="op">=</span> rng.uniform(size)</span> -<span id="cb16-85"><a href="#cb16-85" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> k <span class="op">==</span> <span class="dv">0</span>:</span> -<span id="cb16-86"><a href="#cb16-86" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">-</span> sigma <span class="op">*</span> np.log(U)</span> -<span id="cb16-87"><a href="#cb16-87" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span>:</span> -<span id="cb16-88"><a href="#cb16-88" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">+</span> (sigma <span class="op">*</span> (U<span class="op">**-</span>k) <span class="op">-</span> <span class="dv">1</span>) <span class="op">/</span> sigma</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb17" data-startfrom="36"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 35;"><span id="cb17-36"><a href="#cb17-36" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb17-37"><a href="#cb17-37" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the normal model."""</span></span> +<span id="cb17-38"><a href="#cb17-38" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> +<span id="cb17-39"><a href="#cb17-39" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> +<span id="cb17-40"><a href="#cb17-40" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {</span> +<span id="cb17-41"><a href="#cb17-41" aria-hidden="true" tabindex="-1"></a> <span class="st">"mu"</span>: logit(<span class="fl">0.25</span>),</span> +<span id="cb17-42"><a href="#cb17-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"tau"</span>: <span class="fl">0.18</span>, <span class="co"># 2sds is 0.19 to 0.32 batting average</span></span> +<span id="cb17-43"><a href="#cb17-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"b_K"</span>: <span class="fl">0.04</span>, <span class="co"># slight effect of more attempts</span></span> +<span id="cb17-44"><a href="#cb17-44" aria-hidden="true" tabindex="-1"></a> <span class="st">"alpha_std"</span>: rng.random.normal(loc<span class="op">=</span><span class="dv">0</span>, scale<span class="op">=</span><span class="dv">1</span>, size<span class="op">=</span>N),</span> +<span id="cb17-45"><a href="#cb17-45" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb17-46"><a href="#cb17-46" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> +<span id="cb17-47"><a href="#cb17-47" aria-hidden="true" tabindex="-1"></a> log_K_std <span class="op">=</span> (np.log(K) <span class="op">-</span> np.log(K).mean()) <span class="op">/</span> np.log(K).std()</span> +<span id="cb17-48"><a href="#cb17-48" aria-hidden="true" tabindex="-1"></a> alpha <span class="op">=</span> (</span> +<span id="cb17-49"><a href="#cb17-49" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"mu"</span>]</span> +<span id="cb17-50"><a href="#cb17-50" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"b_K"</span>] <span class="op">*</span> log_K_std</span> +<span id="cb17-51"><a href="#cb17-51" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"tau"</span>] <span class="op">*</span> true_param_values[<span class="st">"alpha_std"</span>]</span> +<span id="cb17-52"><a href="#cb17-52" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb17-53"><a href="#cb17-53" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.random.binomial(K, expit(alpha))</span> +<span id="cb17-54"><a href="#cb17-54" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y} <span class="op">|</span> true_param_values</span> +<span id="cb17-55"><a href="#cb17-55" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-56"><a href="#cb17-56" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-57"><a href="#cb17-57" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb17-58"><a href="#cb17-58" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the gpareto model."""</span></span> +<span id="cb17-59"><a href="#cb17-59" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> +<span id="cb17-60"><a href="#cb17-60" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> +<span id="cb17-61"><a href="#cb17-61" aria-hidden="true" tabindex="-1"></a> min_alpha <span class="op">=</span> <span class="fl">0.1</span></span> +<span id="cb17-62"><a href="#cb17-62" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> +<span id="cb17-63"><a href="#cb17-63" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {<span class="st">"sigma"</span>: <span class="op">-</span><span class="fl">1.098</span>, <span class="st">"k"</span>: <span class="fl">0.18</span>}</span> +<span id="cb17-64"><a href="#cb17-64" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"alpha"</span>] <span class="op">=</span> gpareto_rvs(</span> +<span id="cb17-65"><a href="#cb17-65" aria-hidden="true" tabindex="-1"></a> rng,</span> +<span id="cb17-66"><a href="#cb17-66" aria-hidden="true" tabindex="-1"></a> N,</span> +<span id="cb17-67"><a href="#cb17-67" aria-hidden="true" tabindex="-1"></a> min_alpha,</span> +<span id="cb17-68"><a href="#cb17-68" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"k"</span>],</span> +<span id="cb17-69"><a href="#cb17-69" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"sigma"</span>],</span> +<span id="cb17-70"><a href="#cb17-70" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb17-71"><a href="#cb17-71" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.binomial(K, expit(true_param_values[<span class="st">"alpha"</span>]))</span> +<span id="cb17-72"><a href="#cb17-72" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y, <span class="st">"min_alpha"</span>: min_alpha} <span class="op">|</span> true_param_values</span> +<span id="cb17-73"><a href="#cb17-73" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-74"><a href="#cb17-74" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-75"><a href="#cb17-75" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gpareto_rvs(</span> +<span id="cb17-76"><a href="#cb17-76" aria-hidden="true" tabindex="-1"></a> rng: np.random.Generator, size: <span class="bu">int</span>, mu: <span class="bu">float</span>, k: <span class="bu">float</span>, sigma: <span class="bu">float</span></span> +<span id="cb17-77"><a href="#cb17-77" aria-hidden="true" tabindex="-1"></a>):</span> +<span id="cb17-78"><a href="#cb17-78" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate random numbers from a generalised pareto distribution.</span></span> +<span id="cb17-79"><a href="#cb17-79" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-80"><a href="#cb17-80" aria-hidden="true" tabindex="-1"></a><span class="co"> See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for</span></span> +<span id="cb17-81"><a href="#cb17-81" aria-hidden="true" tabindex="-1"></a><span class="co"> source.</span></span> +<span id="cb17-82"><a href="#cb17-82" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-83"><a href="#cb17-83" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> +<span id="cb17-84"><a href="#cb17-84" aria-hidden="true" tabindex="-1"></a> U <span class="op">=</span> rng.uniform(size)</span> +<span id="cb17-85"><a href="#cb17-85" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> k <span class="op">==</span> <span class="dv">0</span>:</span> +<span id="cb17-86"><a href="#cb17-86" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">-</span> sigma <span class="op">*</span> np.log(U)</span> +<span id="cb17-87"><a href="#cb17-87" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span>:</span> +<span id="cb17-88"><a href="#cb17-88" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">+</span> (sigma <span class="op">*</span> (U<span class="op">**-</span>k) <span class="op">-</span> <span class="dv">1</span>) <span class="op">/</span> sigma</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> <section id="specifying-inferences" class="level1"> <h1>Specifying inferences</h1> <p>Now all the building blocks for making statistical inferences - raw data, data preparation rules, statistical models and recipes for turning prepared data into model inputs - were in place. The last step before actually running Stan was to write down how put the blocks together. Bibat has another concept for this, called ‘inferences’.</p> <p>An inference in bibat is a folder containing a special file called <code>config.toml</code>. This file sets out what inferences you want to make: which statistical model, which prepared data function, which Stan input function, which parameters have which dimensions, which sampling modes to use and how to configure the sampler. The folder will later be filled up with the results of performing the specified inferences.</p> <p>I started by deleting the example inferences and creating two fresh folders, leaving me with an <code>inferences</code> folder looking like this:</p> -<div class="sourceCode" id="cb17"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> -<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> -<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── config.toml</span> -<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> -<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> config.toml</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb18"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> +<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> +<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── config.toml</span> +<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> +<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> config.toml</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Here is the file <code>inferences/gpareto2006/config.toml</code>:</p> -<div class="sourceCode" id="cb18"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gpareto2006"</span></span> -<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> -<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> -<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> -<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> -<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> -<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> -<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> -<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> -<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> -<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> -<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb18-17"><a href="#cb18-17" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-18"><a href="#cb18-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> -<span id="cb18-19"><a href="#cb18-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> -<span id="cb18-20"><a href="#cb18-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">1000</span></span> -<span id="cb18-21"><a href="#cb18-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Here is the file <code>inferences/normal2006/config.toml</code>:</p> -<div class="sourceCode" id="cb19"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"normal2006"</span></span> -<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"normal.stan"</span></span> +<div class="sourceCode" id="cb19"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gpareto2006"</span></span> +<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> <span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> -<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_normal"</span></span> +<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> <span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> <span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a></span> <span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> @@ -3624,21 +3626,50 @@ <h1>Specifying inferences</h1> <span id="cb19-13"><a href="#cb19-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> <span id="cb19-14"><a href="#cb19-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> <span id="cb19-15"><a href="#cb19-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Note that: * The Stan file, prepared data folder and stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value. * Both inferences are set to run in “prior” and “posterior” modes - the other pre-defined mode is “kfold”, but you can also write your own! * You can enter arbitrary arguments to cmdstanpy’s <code>CmdStanModel.sample</code> method in the <code>[sample_kwargs]</code> table. * You can enter mode-specific overrides in <code>[sample_kwargs.<MODE>]</code>. This can be handy if you want to run more or fewer iterations for a certain mode.</p> +<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb19-17"><a href="#cb19-17" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb19-18"><a href="#cb19-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> +<span id="cb19-19"><a href="#cb19-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> +<span id="cb19-20"><a href="#cb19-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">1000</span></span> +<span id="cb19-21"><a href="#cb19-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Here is the file <code>inferences/normal2006/config.toml</code>:</p> +<div class="sourceCode" id="cb20"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"normal2006"</span></span> +<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"normal.stan"</span></span> +<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> +<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_normal"</span></span> +<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> +<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> +<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> +<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> +<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> +<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> +<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> +<span id="cb20-15"><a href="#cb20-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb20-16"><a href="#cb20-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Note that:</p> +<ul> +<li>The Stan file, prepared data folder and stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value.</li> +<li>Both inferences are set to run in “prior” and “posterior” modes - the other pre-defined mode is “kfold”, but you can also write your own!</li> +<li>You can enter arbitrary arguments to cmdstanpy’s <code>CmdStanModel.sample</code> method in the <code>[sample_kwargs]</code> table.</li> +<li>You can enter mode-specific overrides in <code>[sample_kwargs.<MODE>]</code>. This can be handy if you want to run more or fewer iterations for a certain mode.</li> +</ul> <p>Now when I ran <code>make analysis</code> again, I saw messages indicating that Stan had run, and found that the <code>inferences</code> subfolders had been populated:</p> -<div class="sourceCode" id="cb20"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> -<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> -<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> ├── config.toml</span> -<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── idata.json</span> -<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> -<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a> <span class="ex">├──</span> config.toml</span> -<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> idata.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb21"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> +<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> +<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> ├── config.toml</span> +<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── idata.json</span> +<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> +<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a> <span class="ex">├──</span> config.toml</span> +<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> idata.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> <section id="investigating-the-inferences" class="level1"> <h1>Investigating the inferences</h1> -<p>Now that the inferences are ready it’s time to check them out. Bibat provides a jupyter notebook at <code>baseball/investigate.ipynb</code> for exactly this purpose.</p> -<p>A lot of code from the example analysis’s notebook was reusable, so I largely followed its structure, with a few tweaks.</p> +<p>Now that the inferences are ready it’s time to check them out. Bibat provides a Jupyter notebook at <code>baseball/investigate.ipynb</code> for exactly this purpose. The notebook’s main job is to create plots and save them in the <code>plots</code> directory when it is executed with the command <code>jupyter execute investigate.ipynb</code>, which is the final step in the chain of commands that is triggered by <code>make analysis</code>.</p> +<p>A notebook is arguably a nicer home for code that creates plots than a plain python script because it allows for literate documentation and an iterative workflow. A notebook makes it easy to, for example, add some code to change the scale of a plot, execute the code and see the new results, then update the relevant documentation all in the same place.</p> +<p>The code from the example analysis’s notebook for loading <code>InferenceData</code> was reusable with a few tweaks to avoid missing file errors, so I kept it. On the other hand, I wanted to make some different plots from the ones in the example analysis, including some that required loading prepared data. To check out everything I did, see <a href="https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/baseball/investigate.ipynb">here</a>.</p> </section> <section id="choosing-priors-using-push-forward-calibration" class="level1"> <h1>Choosing priors using push-forward calibration</h1> @@ -3648,31 +3679,35 @@ <h1>Choosing priors using push-forward calibration</h1> <section id="extending-the-analysis-to-the-baseballdatabank-data" class="level1"> <h1>Extending the analysis to the baseballdatabank data</h1> <p>To model the more recent data, all I had to do was create some new inference folders with appropriate <code>prepared_data_dir</code> fields in their <code>config.toml</code> files. For example, here is the <code>config.toml</code> file for the <code>gparetobdb</code> inference:</p> -<div class="sourceCode" id="cb21"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gparetobdb"</span></span> -<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> -<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"bdb"</span></span> -<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> -<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> -<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> -<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> -<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> -<span id="cb21-11"><a href="#cb21-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> -<span id="cb21-12"><a href="#cb21-12" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-13"><a href="#cb21-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> -<span id="cb21-14"><a href="#cb21-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> -<span id="cb21-15"><a href="#cb21-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-16"><a href="#cb21-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-17"><a href="#cb21-17" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-18"><a href="#cb21-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> -<span id="cb21-19"><a href="#cb21-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> -<span id="cb21-20"><a href="#cb21-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-21"><a href="#cb21-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb22"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gparetobdb"</span></span> +<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> +<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"bdb"</span></span> +<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> +<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> +<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> +<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> +<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> +<span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> +<span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-13"><a href="#cb22-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> +<span id="cb22-14"><a href="#cb22-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> +<span id="cb22-15"><a href="#cb22-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-16"><a href="#cb22-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-17"><a href="#cb22-17" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-18"><a href="#cb22-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> +<span id="cb22-19"><a href="#cb22-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> +<span id="cb22-20"><a href="#cb22-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-21"><a href="#cb22-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>After running <code>make analysis</code> one more time, I went back to the notebook <code>baseball/investigate.ipynb</code> and made plots of both models’ posterior 1%-99% success rate intervals for both datasets:</p> <p><img src="" class="img-fluid"></p> <p>I think this is very interesting. Both models’ prior distributions had similar regularisation levels, and they more or less agree about the abilities of the players with the most at-bats, both in terms of locations and the widths of the plausible intervals for the true success rate. However, the models ended up with dramatically different certainty levels about the abilities of players with few at-bats. This pattern was true both for the small 2006 dataset and the much larger baseballdatabank dataset.</p> </section> +<section id="documenting-the-analysis" class="level1"> +<h1>Documenting the analysis</h1> +<p>The final step was to document my analysis. To do this I edited the file <code>docs/report.qmd</code>, then ran <code>quarto render docs/report.qmd</code>, which produced the very html document that you are probably reading now! You can find the complete <code>report.qmd</code> file <a href="https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/docs/report.qmd">here</a>.</p> +</section> </main> @@ -3696,7 +3731,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> toggleBodyColorMode(bsSheetEl); } } - toggleBodyColorPrimary(); + toggleBodyColorPrimary(); const icon = ""; const anchorJS = new window.AnchorJS(); anchorJS.options = { @@ -3706,7 +3741,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> anchorJS.add('.anchored'); const isCodeAnnotation = (el) => { for (const clz of el.classList) { - if (clz.startsWith('code-annotation-')) { + if (clz.startsWith('code-annotation-')) { return true; } } @@ -3737,11 +3772,11 @@ <h1>Extending the analysis to the baseballdatabank data</h1> button.setAttribute("data-bs-toggle", "tooltip"); button.setAttribute("data-bs-placement", "left"); button.setAttribute("data-bs-title", "Copied!"); - tooltip = new bootstrap.Tooltip(button, - { trigger: "manual", + tooltip = new bootstrap.Tooltip(button, + { trigger: "manual", customClass: "code-copy-button-tooltip", offset: [0, -8]}); - tooltip.show(); + tooltip.show(); } setTimeout(function() { if (tooltip) { @@ -3771,7 +3806,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> theme: 'quarto', placement: 'bottom-start' }; - window.tippy(el, config); + window.tippy(el, config); } const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]'); for (var i=0; i<noterefs.length; i++) { @@ -3816,7 +3851,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> height = bottom - top; } if (top !== null && height !== null && parent !== null) { - // cook up a div (if necessary) and position it + // cook up a div (if necessary) and position it let div = window.document.getElementById("code-annotation-line-highlight"); if (div === null) { div = window.document.createElement("div"); @@ -3914,4 +3949,4 @@ <h1>Extending the analysis to the baseballdatabank data</h1> -</body></html> \ No newline at end of file +</body></html> diff --git a/bibat/examples/baseball/docs/report.qmd b/bibat/examples/baseball/docs/report.qmd index 42cacc4..0d02239 100644 --- a/bibat/examples/baseball/docs/report.qmd +++ b/bibat/examples/baseball/docs/report.qmd @@ -89,7 +89,7 @@ Choose an open source license from these options: (MIT, BSD-3-Clause, No license How would you like to document your project? (Quarto, Sphinx, No docs) [Quarto]: Would you like to create a tests directory? [y]: n Would you like to create a .github directory? [y]: n -> +> ``` After I answered the wizard's questions bibat creted a new folder called @@ -186,7 +186,7 @@ Finally I removed the example analysis's raw data: The first step in preparing data is to decide what prepared data looks like for the purposes of our analysis. Bibat provides dataclasses called `PreparedData` and `MeasurementsDF` to help get started with this, which I found in the file -`baseball/prepared_data.py`. +`baseball/data_preparation.py`. As it happens, prepared data looks very similar in this analysis and the example. All I had to do was change the `MeasurementsDF` definition a @@ -210,16 +210,26 @@ The next step is to write functions that return `PreparedData` objects. In this case I wrote a couple of data preparation functions: `prepare_data_2006` and `prepare_data_bdb`: -```{.python include="../baseball/data_preparation.py" start-line=111} +```{.python include="../baseball/data_preparation.py" start-line=101} ``` +I had to update the `prepare_data` function to take into account the +two raw data sources: + +```{.python include="../baseball/data_preparation.py" start-line=33 end-line=52} +``` + +To finish off I deleted the unused global variables `MEASUREMENTS_FILE`, +`NEW_COLNAMES`, `DROPNA_COLS` and `DIMS`, then checked if the function +`load_prepared_data` needed any changes: I was pretty sure it didn't. + To check that all this worked, I ran the data preparation script manually:^[I could also have just run `make analysis` again] ```{.zsh} > source .venv/bin/activate (baseball) > python baseball/prepare_data.py -``` +``` Now the folder `data/prepared/bdb` contained a file `data/prepared/bdb/measurements.csv` that looked like this: @@ -277,7 +287,7 @@ study](https://mc-stan.org/users/documentation/case-studies/gpareto_functions.ht ```{.stan} real gpareto_lpdf(vector y, real ymin, real k, real sigma) { - // generalised Pareto log pdf + // generalised Pareto log pdf int N = rows(y); real inv_k = inv(k); if (k<0 && max(y-ymin)/sigma > -inv_k) @@ -311,7 +321,8 @@ can write as many Stan input functions as you like and choose which one to run for any given inference. I started by defining some Stan input functions that pass arbitary prepared -data on to each of the models: +data on to each of the models:^[Note that this code uses the scipy function +`logit`, which it imported like this: `from scipy.special import logit`] ```{.python include="../baseball/stan_input_functions.py" start-line=11 end-line=34} ``` @@ -360,13 +371,14 @@ Here is the file `inferences/normal2006/config.toml`: ``` Note that: -* The Stan file, prepared data folder and stan input function are referred to by + +- The Stan file, prepared data folder and stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value. -* Both inferences are set to run in "prior" and "posterior" modes - the other +- Both inferences are set to run in "prior" and "posterior" modes - the other pre-defined mode is "kfold", but you can also write your own! -* You can enter arbitrary arguments to cmdstanpy's `CmdStanModel.sample` method +- You can enter arbitrary arguments to cmdstanpy's `CmdStanModel.sample` method in the `[sample_kwargs]` table. -* You can enter mode-specific overrides in `[sample_kwargs.<MODE>]`. This can be +- You can enter mode-specific overrides in `[sample_kwargs.<MODE>]`. This can be handy if you want to run more or fewer iterations for a certain mode. Now when I ran `make analysis` again, I saw messages indicating that Stan had @@ -385,10 +397,24 @@ inferences # Investigating the inferences Now that the inferences are ready it's time to check them out. Bibat provides a -jupyter notebook at `baseball/investigate.ipynb` for exactly this purpose. - -A lot of code from the example analysis's notebook was reusable, so I largely -followed its structure, with a few tweaks. +Jupyter notebook at `baseball/investigate.ipynb` for exactly this purpose. The +notebook's main job is to create plots and save them in the `plots` directory +when it is executed with the command `jupyter execute investigate.ipynb`, which +is the final step in the chain of commands that is triggered by `make +analysis`. + +A notebook is arguably a nicer home for code that creates plots than a plain +python script because it allows for literate documentation and an iterative +workflow. A notebook makes it easy to, for example, add some code to change the +scale of a plot, execute the code and see the new results, then update the +relevant documentation all in the same place. + +The code from the example analysis's notebook for loading `InferenceData` was +reusable with a few tweaks to avoid missing file errors, so I kept it. On the +other hand, I wanted to make some different plots from the ones in the example +analysis, including some that required loading prepared data. To check out +everything I did, see +[here](https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/baseball/investigate.ipynb). # Choosing priors using push-forward calibration @@ -427,4 +453,10 @@ dramatically different certainty levels about the abilities of players with few at-bats. This pattern was true both for the small 2006 dataset and the much larger baseballdatabank dataset. +# Documenting the analysis +The final step was to document my analysis. To do this I edited the file +`docs/report.qmd`, then ran `quarto render docs/report.qmd`, which produced the +very html document that you are probably reading now! You can find the complete +`report.qmd` file +[here](https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/docs/report.qmd). diff --git a/docs/_static/report.html b/docs/_static/report.html index 5a2efc9..61c38e3 100644 --- a/docs/_static/report.html +++ b/docs/_static/report.html @@ -3095,7 +3095,7 @@ )) {"Symbol"in self&&"iterator"in Symbol&&"function"==typeof Array.prototype[Symbol.iterator]?CreateMethodProperty(Array.prototype,"values",Array.prototype[Symbol.iterator]):CreateMethodProperty(Array.prototype,"values",function r(){var r=ToObject(this);return new ArrayIterator(r,"value")});}if (!("Symbol"in self&&"iterator"in self.Symbol&&!!Array.prototype[self.Symbol.iterator] )) {CreateMethodProperty(Array.prototype,Symbol.iterator,Array.prototype.values);}var StringIterator=function(){var t=function(e){if(!(this instanceof t))return new t(e);e=String(e),Iterator.call(this,e),Object.defineProperty(this,"__length__",{value:e.length,configurable:!1,enumerable:!1,writable:!1})};return Object.setPrototypeOf&&Object.setPrototypeOf(t,Iterator),t.prototype=Object.create(Iterator.prototype,{constructor:{value:t,configurable:!0,enumerable:!1,writable:!0},_next:{value:function(){if(this.__list__)return this.__nextIndex__<this.__length__?this.__nextIndex__++:void this._unBind()},configurable:!0,enumerable:!1,writable:!0},_resolve:{value:function(t){var e,r=this.__list__[t];return this.__nextIndex__===this.__length__?r:(e=r.charCodeAt(0),e>=55296&&e<=56319?r+this.__list__[this.__nextIndex__++]:r)},configurable:!0,enumerable:!1,writable:!0},toString:{value:function(){return"[object String Iterator]"},configurable:!0,enumerable:!1,writable:!0}}),t}();if (!("Symbol"in self&&"iterator"in self.Symbol&&!!String.prototype[self.Symbol.iterator] )) {CreateMethodProperty(String.prototype,Symbol.iterator,function(){var r=RequireObjectCoercible(this),t=ToString(r);return new StringIterator(t)});}})('object' === typeof window && window || 'object' === typeof self && self || 'object' === typeof global && global || {});</script> - + <script> (function () { var script = document.createElement("script"); @@ -3114,7 +3114,7 @@ <div id="quarto-margin-sidebar" class="sidebar margin-sidebar"> <nav id="TOC" role="doc-toc" class="toc-active"> <h2 id="toc-title">Table of contents</h2> - + <ul> <li><a href="#setup" id="toc-setup" class="nav-link active" data-scroll-target="#setup">Setup</a></li> <li><a href="#getting-raw-data" id="toc-getting-raw-data" class="nav-link" data-scroll-target="#getting-raw-data">Getting raw data</a></li> @@ -3125,6 +3125,7 @@ <h2 id="toc-title">Table of contents</h2> <li><a href="#investigating-the-inferences" id="toc-investigating-the-inferences" class="nav-link" data-scroll-target="#investigating-the-inferences">Investigating the inferences</a></li> <li><a href="#choosing-priors-using-push-forward-calibration" id="toc-choosing-priors-using-push-forward-calibration" class="nav-link" data-scroll-target="#choosing-priors-using-push-forward-calibration">Choosing priors using push-forward calibration</a></li> <li><a href="#extending-the-analysis-to-the-baseballdatabank-data" id="toc-extending-the-analysis-to-the-baseballdatabank-data" class="nav-link" data-scroll-target="#extending-the-analysis-to-the-baseballdatabank-data">Extending the analysis to the baseballdatabank data</a></li> + <li><a href="#documenting-the-analysis" id="toc-documenting-the-analysis" class="nav-link" data-scroll-target="#documenting-the-analysis">Documenting the analysis</a></li> </ul> </nav> </div> @@ -3146,11 +3147,11 @@ <h1 class="title">Modelling at-bats in baseball using the generalised Pareto dis <p>Teddy Groves </p> </div> </div> - - - + + + </div> - + </header> @@ -3278,7 +3279,7 @@ <h1>Getting raw data</h1> </section> <section id="preparing-the-data" class="level1 page-columns page-full"> <h1>Preparing the data</h1> -<p>The first step in preparing data is to decide what prepared data looks like for the purposes of our analysis. Bibat provides dataclasses called <code>PreparedData</code> and <code>MeasurementsDF</code> to help get started with this, which I found in the file <code>baseball/prepared_data.py</code>.</p> +<p>The first step in preparing data is to decide what prepared data looks like for the purposes of our analysis. Bibat provides dataclasses called <code>PreparedData</code> and <code>MeasurementsDF</code> to help get started with this, which I found in the file <code>baseball/data_preparation.py</code>.</p> <div class="page-columns page-full"><p>As it happens, prepared data looks very similar in this analysis and the example. All I had to do was change the <code>MeasurementsDF</code> definition a little<a href="#fn1" class="footnote-ref" id="fnref1" role="doc-noteref"><sup>1</sup></a>:</p><div class="no-row-height column-margin column-container"><li id="fn1"><p><sup>1</sup> note that this class uses <a href="https://pandera.readthedocs.io">pandera</a>, a handy library for defining what a pandas dataframe should look like</p></li></div></div> <div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="kw">class</span> MeasurementsDF(pa.SchemaModel):</span> <span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="co">"""A PreparedData should have a measurements dataframe like this.</span></span> @@ -3291,94 +3292,117 @@ <h1>Preparing the data</h1> <span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> n_attempt: Series[<span class="bu">int</span>] <span class="op">=</span> pa.Field(ge <span class="op">=</span> <span class="dv">1</span>)</span> <span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> n_success: Series[<span class="bu">int</span>] <span class="op">=</span> pa.Field(ge <span class="op">=</span> <span class="dv">0</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>The next step is to write functions that return <code>PreparedData</code> objects. In this case I wrote a couple of data preparation functions: <code>prepare_data_2006</code> and <code>prepare_data_bdb</code>:</p> -<div class="sourceCode" id="cb9" data-startfrom="111"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 110;"><span id="cb9-111"><a href="#cb9-111" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_2006(measurements_raw: pd.DataFrame) <span class="op">-></span> PreparedData:</span> -<span id="cb9-112"><a href="#cb9-112" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the 2006 data."""</span></span> -<span id="cb9-113"><a href="#cb9-113" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> measurements_raw.rename(</span> -<span id="cb9-114"><a href="#cb9-114" aria-hidden="true" tabindex="-1"></a> columns<span class="op">=</span>{<span class="st">"K"</span>: <span class="st">"n_attempt"</span>, <span class="st">"y"</span>: <span class="st">"n_success"</span>}</span> -<span id="cb9-115"><a href="#cb9-115" aria-hidden="true" tabindex="-1"></a> ).assign(</span> -<span id="cb9-116"><a href="#cb9-116" aria-hidden="true" tabindex="-1"></a> season<span class="op">=</span><span class="st">"2006"</span>,</span> -<span id="cb9-117"><a href="#cb9-117" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: [<span class="ss">f"2006-player-</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">"</span> <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="bu">len</span>(df))],</span> -<span id="cb9-118"><a href="#cb9-118" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-119"><a href="#cb9-119" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> -<span id="cb9-120"><a href="#cb9-120" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"2006"</span>,</span> -<span id="cb9-121"><a href="#cb9-121" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> -<span id="cb9-122"><a href="#cb9-122" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> -<span id="cb9-123"><a href="#cb9-123" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> -<span id="cb9-124"><a href="#cb9-124" aria-hidden="true" tabindex="-1"></a> },</span> -<span id="cb9-125"><a href="#cb9-125" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> -<span id="cb9-126"><a href="#cb9-126" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-127"><a href="#cb9-127" aria-hidden="true" tabindex="-1"></a></span> +<div class="sourceCode" id="cb9" data-startfrom="101"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 100;"><span id="cb9-101"><a href="#cb9-101" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-102"><a href="#cb9-102" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_2006(measurements_raw: pd.DataFrame) <span class="op">-></span> PreparedData:</span> +<span id="cb9-103"><a href="#cb9-103" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the 2006 data."""</span></span> +<span id="cb9-104"><a href="#cb9-104" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> measurements_raw.rename(</span> +<span id="cb9-105"><a href="#cb9-105" aria-hidden="true" tabindex="-1"></a> columns<span class="op">=</span>{<span class="st">"K"</span>: <span class="st">"n_attempt"</span>, <span class="st">"y"</span>: <span class="st">"n_success"</span>}</span> +<span id="cb9-106"><a href="#cb9-106" aria-hidden="true" tabindex="-1"></a> ).assign(</span> +<span id="cb9-107"><a href="#cb9-107" aria-hidden="true" tabindex="-1"></a> season<span class="op">=</span><span class="st">"2006"</span>,</span> +<span id="cb9-108"><a href="#cb9-108" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: [<span class="ss">f"2006-player-</span><span class="sc">{</span>i<span class="op">+</span><span class="dv">1</span><span class="sc">}</span><span class="ss">"</span> <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="bu">len</span>(df))],</span> +<span id="cb9-109"><a href="#cb9-109" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-110"><a href="#cb9-110" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> +<span id="cb9-111"><a href="#cb9-111" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"2006"</span>,</span> +<span id="cb9-112"><a href="#cb9-112" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> +<span id="cb9-113"><a href="#cb9-113" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> +<span id="cb9-114"><a href="#cb9-114" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> +<span id="cb9-115"><a href="#cb9-115" aria-hidden="true" tabindex="-1"></a> },</span> +<span id="cb9-116"><a href="#cb9-116" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> +<span id="cb9-117"><a href="#cb9-117" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-118"><a href="#cb9-118" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-119"><a href="#cb9-119" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-120"><a href="#cb9-120" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_bdb(</span> +<span id="cb9-121"><a href="#cb9-121" aria-hidden="true" tabindex="-1"></a> measurements_main: pd.DataFrame,</span> +<span id="cb9-122"><a href="#cb9-122" aria-hidden="true" tabindex="-1"></a> measurements_post: pd.DataFrame,</span> +<span id="cb9-123"><a href="#cb9-123" aria-hidden="true" tabindex="-1"></a> appearances: pd.DataFrame,</span> +<span id="cb9-124"><a href="#cb9-124" aria-hidden="true" tabindex="-1"></a>) <span class="op">-></span> PreparedData:</span> +<span id="cb9-125"><a href="#cb9-125" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the baseballdatabank data.</span></span> +<span id="cb9-126"><a href="#cb9-126" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-127"><a href="#cb9-127" aria-hidden="true" tabindex="-1"></a><span class="co"> There are a few substantive data choices here.</span></span> <span id="cb9-128"><a href="#cb9-128" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-129"><a href="#cb9-129" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data_bdb(</span> -<span id="cb9-130"><a href="#cb9-130" aria-hidden="true" tabindex="-1"></a> measurements_main: pd.DataFrame,</span> -<span id="cb9-131"><a href="#cb9-131" aria-hidden="true" tabindex="-1"></a> measurements_post: pd.DataFrame,</span> -<span id="cb9-132"><a href="#cb9-132" aria-hidden="true" tabindex="-1"></a> appearances: pd.DataFrame,</span> -<span id="cb9-133"><a href="#cb9-133" aria-hidden="true" tabindex="-1"></a>) <span class="op">-></span> PreparedData:</span> -<span id="cb9-134"><a href="#cb9-134" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Prepare the baseballdatabank data.</span></span> -<span id="cb9-135"><a href="#cb9-135" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-136"><a href="#cb9-136" aria-hidden="true" tabindex="-1"></a><span class="co"> There are a few substantive data choices here.</span></span> -<span id="cb9-137"><a href="#cb9-137" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-138"><a href="#cb9-138" aria-hidden="true" tabindex="-1"></a><span class="co"> First, the function excludes players who have a '1' in their position as</span></span> -<span id="cb9-139"><a href="#cb9-139" aria-hidden="true" tabindex="-1"></a><span class="co"> these are likely pitchers, as well as players with fewer than 20 at bats.</span></span> -<span id="cb9-140"><a href="#cb9-140" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-141"><a href="#cb9-141" aria-hidden="true" tabindex="-1"></a><span class="co"> Second, the function defines a successes and attempts according to the</span></span> -<span id="cb9-142"><a href="#cb9-142" aria-hidden="true" tabindex="-1"></a><span class="co"> 'on-base percentage' metric, so a success is a time when a player got a hit,</span></span> -<span id="cb9-143"><a href="#cb9-143" aria-hidden="true" tabindex="-1"></a><span class="co"> a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a</span></span> -<span id="cb9-144"><a href="#cb9-144" aria-hidden="true" tabindex="-1"></a><span class="co"> base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have</span></span> -<span id="cb9-145"><a href="#cb9-145" aria-hidden="true" tabindex="-1"></a><span class="co"> alternatively been calculated as just hits divided by at-bats, but my</span></span> -<span id="cb9-146"><a href="#cb9-146" aria-hidden="true" tabindex="-1"></a><span class="co"> understanding is that this method underrates players who are good at getting</span></span> -<span id="cb9-147"><a href="#cb9-147" aria-hidden="true" tabindex="-1"></a><span class="co"> walks.</span></span> -<span id="cb9-148"><a href="#cb9-148" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-149"><a href="#cb9-149" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> -<span id="cb9-150"><a href="#cb9-150" aria-hidden="true" tabindex="-1"></a> pitchers <span class="op">=</span> appearances.loc[</span> -<span id="cb9-151"><a href="#cb9-151" aria-hidden="true" tabindex="-1"></a> <span class="kw">lambda</span> df: df[<span class="st">"G_p"</span>] <span class="op">==</span> df[<span class="st">"G_all"</span>], <span class="st">"playerID"</span></span> -<span id="cb9-152"><a href="#cb9-152" aria-hidden="true" tabindex="-1"></a> ].unique()</span> -<span id="cb9-153"><a href="#cb9-153" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-154"><a href="#cb9-154" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> filter_batters(df: pd.DataFrame):</span> -<span id="cb9-155"><a href="#cb9-155" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> (</span> -<span id="cb9-156"><a href="#cb9-156" aria-hidden="true" tabindex="-1"></a> (df[<span class="st">"AB"</span>] <span class="op">>=</span> <span class="dv">20</span>)</span> -<span id="cb9-157"><a href="#cb9-157" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (df[<span class="st">"season"</span>].ge(<span class="dv">2017</span>))</span> -<span id="cb9-158"><a href="#cb9-158" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (<span class="op">~</span>df[<span class="st">"player"</span>].isin(pitchers))</span> -<span id="cb9-159"><a href="#cb9-159" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-160"><a href="#cb9-160" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb9-161"><a href="#cb9-161" aria-hidden="true" tabindex="-1"></a> measurements_main, measurements_post <span class="op">=</span> (</span> -<span id="cb9-162"><a href="#cb9-162" aria-hidden="true" tabindex="-1"></a> m.rename(columns<span class="op">=</span>{<span class="st">"yearID"</span>: <span class="st">"season"</span>, <span class="st">"playerID"</span>: <span class="st">"player"</span>})</span> -<span id="cb9-163"><a href="#cb9-163" aria-hidden="true" tabindex="-1"></a> .assign(</span> -<span id="cb9-164"><a href="#cb9-164" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: df[<span class="st">"player"</span>].<span class="bu">str</span>.cat(</span> -<span id="cb9-165"><a href="#cb9-165" aria-hidden="true" tabindex="-1"></a> df[<span class="st">"season"</span>].astype(<span class="bu">str</span>)</span> -<span id="cb9-166"><a href="#cb9-166" aria-hidden="true" tabindex="-1"></a> ),</span> -<span id="cb9-167"><a href="#cb9-167" aria-hidden="true" tabindex="-1"></a> n_attempt<span class="op">=</span><span class="kw">lambda</span> df: df[[<span class="st">"AB"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>, <span class="st">"SF"</span>]]</span> -<span id="cb9-168"><a href="#cb9-168" aria-hidden="true" tabindex="-1"></a> .fillna(<span class="dv">0</span>)</span> -<span id="cb9-169"><a href="#cb9-169" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>)</span> -<span id="cb9-170"><a href="#cb9-170" aria-hidden="true" tabindex="-1"></a> .astype(<span class="bu">int</span>),</span> -<span id="cb9-171"><a href="#cb9-171" aria-hidden="true" tabindex="-1"></a> n_success<span class="op">=</span><span class="kw">lambda</span> df: (</span> -<span id="cb9-172"><a href="#cb9-172" aria-hidden="true" tabindex="-1"></a> df[[<span class="st">"H"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>]].fillna(<span class="dv">0</span>).<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>).astype(<span class="bu">int</span>)</span> -<span id="cb9-173"><a href="#cb9-173" aria-hidden="true" tabindex="-1"></a> ),</span> -<span id="cb9-174"><a href="#cb9-174" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-175"><a href="#cb9-175" aria-hidden="true" tabindex="-1"></a> .loc[</span> -<span id="cb9-176"><a href="#cb9-176" aria-hidden="true" tabindex="-1"></a> filter_batters,</span> -<span id="cb9-177"><a href="#cb9-177" aria-hidden="true" tabindex="-1"></a> [<span class="st">"player_season"</span>, <span class="st">"season"</span>, <span class="st">"n_attempt"</span>, <span class="st">"n_success"</span>],</span> -<span id="cb9-178"><a href="#cb9-178" aria-hidden="true" tabindex="-1"></a> ]</span> -<span id="cb9-179"><a href="#cb9-179" aria-hidden="true" tabindex="-1"></a> .copy()</span> -<span id="cb9-180"><a href="#cb9-180" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> m <span class="kw">in</span> [measurements_main, measurements_post]</span> -<span id="cb9-181"><a href="#cb9-181" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-182"><a href="#cb9-182" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> (</span> -<span id="cb9-183"><a href="#cb9-183" aria-hidden="true" tabindex="-1"></a> pd.concat([measurements_main, measurements_post])</span> -<span id="cb9-184"><a href="#cb9-184" aria-hidden="true" tabindex="-1"></a> .groupby([<span class="st">"player_season"</span>, <span class="st">"season"</span>])</span> -<span id="cb9-185"><a href="#cb9-185" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>()</span> -<span id="cb9-186"><a href="#cb9-186" aria-hidden="true" tabindex="-1"></a> .reset_index()</span> -<span id="cb9-187"><a href="#cb9-187" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb9-188"><a href="#cb9-188" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> -<span id="cb9-189"><a href="#cb9-189" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"bdb"</span>,</span> -<span id="cb9-190"><a href="#cb9-190" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> -<span id="cb9-191"><a href="#cb9-191" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> -<span id="cb9-192"><a href="#cb9-192" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> -<span id="cb9-193"><a href="#cb9-193" aria-hidden="true" tabindex="-1"></a> },</span> -<span id="cb9-194"><a href="#cb9-194" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> -<span id="cb9-195"><a href="#cb9-195" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<span id="cb9-129"><a href="#cb9-129" aria-hidden="true" tabindex="-1"></a><span class="co"> First, the function excludes players who have a '1' in their position as</span></span> +<span id="cb9-130"><a href="#cb9-130" aria-hidden="true" tabindex="-1"></a><span class="co"> these are likely pitchers, as well as players with fewer than 20 at bats.</span></span> +<span id="cb9-131"><a href="#cb9-131" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-132"><a href="#cb9-132" aria-hidden="true" tabindex="-1"></a><span class="co"> Second, the function defines a successes and attempts according to the</span></span> +<span id="cb9-133"><a href="#cb9-133" aria-hidden="true" tabindex="-1"></a><span class="co"> 'on-base percentage' metric, so a success is a time when a player got a hit,</span></span> +<span id="cb9-134"><a href="#cb9-134" aria-hidden="true" tabindex="-1"></a><span class="co"> a base on ball/walk or a hit-by-pitch and an attempt is an at-bat or a</span></span> +<span id="cb9-135"><a href="#cb9-135" aria-hidden="true" tabindex="-1"></a><span class="co"> base-on-ball/walk or a hit-by-pitch or a sacrifice fly. This could have</span></span> +<span id="cb9-136"><a href="#cb9-136" aria-hidden="true" tabindex="-1"></a><span class="co"> alternatively been calculated as just hits divided by at-bats, but my</span></span> +<span id="cb9-137"><a href="#cb9-137" aria-hidden="true" tabindex="-1"></a><span class="co"> understanding is that this method underrates players who are good at getting</span></span> +<span id="cb9-138"><a href="#cb9-138" aria-hidden="true" tabindex="-1"></a><span class="co"> walks.</span></span> +<span id="cb9-139"><a href="#cb9-139" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-140"><a href="#cb9-140" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> +<span id="cb9-141"><a href="#cb9-141" aria-hidden="true" tabindex="-1"></a> pitchers <span class="op">=</span> appearances.loc[</span> +<span id="cb9-142"><a href="#cb9-142" aria-hidden="true" tabindex="-1"></a> <span class="kw">lambda</span> df: df[<span class="st">"G_p"</span>] <span class="op">==</span> df[<span class="st">"G_all"</span>], <span class="st">"playerID"</span></span> +<span id="cb9-143"><a href="#cb9-143" aria-hidden="true" tabindex="-1"></a> ].unique()</span> +<span id="cb9-144"><a href="#cb9-144" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-145"><a href="#cb9-145" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> filter_batters(df: pd.DataFrame):</span> +<span id="cb9-146"><a href="#cb9-146" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> (</span> +<span id="cb9-147"><a href="#cb9-147" aria-hidden="true" tabindex="-1"></a> (df[<span class="st">"AB"</span>] <span class="op">>=</span> <span class="dv">20</span>)</span> +<span id="cb9-148"><a href="#cb9-148" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (df[<span class="st">"season"</span>].ge(<span class="dv">2017</span>))</span> +<span id="cb9-149"><a href="#cb9-149" aria-hidden="true" tabindex="-1"></a> <span class="op">&</span> (<span class="op">~</span>df[<span class="st">"player"</span>].isin(pitchers))</span> +<span id="cb9-150"><a href="#cb9-150" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-151"><a href="#cb9-151" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb9-152"><a href="#cb9-152" aria-hidden="true" tabindex="-1"></a> measurements_main, measurements_post <span class="op">=</span> (</span> +<span id="cb9-153"><a href="#cb9-153" aria-hidden="true" tabindex="-1"></a> m.rename(columns<span class="op">=</span>{<span class="st">"yearID"</span>: <span class="st">"season"</span>, <span class="st">"playerID"</span>: <span class="st">"player"</span>})</span> +<span id="cb9-154"><a href="#cb9-154" aria-hidden="true" tabindex="-1"></a> .assign(</span> +<span id="cb9-155"><a href="#cb9-155" aria-hidden="true" tabindex="-1"></a> player_season<span class="op">=</span><span class="kw">lambda</span> df: df[<span class="st">"player"</span>].<span class="bu">str</span>.cat(</span> +<span id="cb9-156"><a href="#cb9-156" aria-hidden="true" tabindex="-1"></a> df[<span class="st">"season"</span>].astype(<span class="bu">str</span>)</span> +<span id="cb9-157"><a href="#cb9-157" aria-hidden="true" tabindex="-1"></a> ),</span> +<span id="cb9-158"><a href="#cb9-158" aria-hidden="true" tabindex="-1"></a> n_attempt<span class="op">=</span><span class="kw">lambda</span> df: df[[<span class="st">"AB"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>, <span class="st">"SF"</span>]]</span> +<span id="cb9-159"><a href="#cb9-159" aria-hidden="true" tabindex="-1"></a> .fillna(<span class="dv">0</span>)</span> +<span id="cb9-160"><a href="#cb9-160" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>)</span> +<span id="cb9-161"><a href="#cb9-161" aria-hidden="true" tabindex="-1"></a> .astype(<span class="bu">int</span>),</span> +<span id="cb9-162"><a href="#cb9-162" aria-hidden="true" tabindex="-1"></a> n_success<span class="op">=</span><span class="kw">lambda</span> df: (</span> +<span id="cb9-163"><a href="#cb9-163" aria-hidden="true" tabindex="-1"></a> df[[<span class="st">"H"</span>, <span class="st">"BB"</span>, <span class="st">"HBP"</span>]].fillna(<span class="dv">0</span>).<span class="bu">sum</span>(axis<span class="op">=</span><span class="dv">1</span>).astype(<span class="bu">int</span>)</span> +<span id="cb9-164"><a href="#cb9-164" aria-hidden="true" tabindex="-1"></a> ),</span> +<span id="cb9-165"><a href="#cb9-165" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-166"><a href="#cb9-166" aria-hidden="true" tabindex="-1"></a> .loc[</span> +<span id="cb9-167"><a href="#cb9-167" aria-hidden="true" tabindex="-1"></a> filter_batters,</span> +<span id="cb9-168"><a href="#cb9-168" aria-hidden="true" tabindex="-1"></a> [<span class="st">"player_season"</span>, <span class="st">"season"</span>, <span class="st">"n_attempt"</span>, <span class="st">"n_success"</span>],</span> +<span id="cb9-169"><a href="#cb9-169" aria-hidden="true" tabindex="-1"></a> ]</span> +<span id="cb9-170"><a href="#cb9-170" aria-hidden="true" tabindex="-1"></a> .copy()</span> +<span id="cb9-171"><a href="#cb9-171" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> m <span class="kw">in</span> [measurements_main, measurements_post]</span> +<span id="cb9-172"><a href="#cb9-172" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-173"><a href="#cb9-173" aria-hidden="true" tabindex="-1"></a> measurements <span class="op">=</span> (</span> +<span id="cb9-174"><a href="#cb9-174" aria-hidden="true" tabindex="-1"></a> pd.concat([measurements_main, measurements_post])</span> +<span id="cb9-175"><a href="#cb9-175" aria-hidden="true" tabindex="-1"></a> .groupby([<span class="st">"player_season"</span>, <span class="st">"season"</span>])</span> +<span id="cb9-176"><a href="#cb9-176" aria-hidden="true" tabindex="-1"></a> .<span class="bu">sum</span>()</span> +<span id="cb9-177"><a href="#cb9-177" aria-hidden="true" tabindex="-1"></a> .reset_index()</span> +<span id="cb9-178"><a href="#cb9-178" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb9-179"><a href="#cb9-179" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> PreparedData(</span> +<span id="cb9-180"><a href="#cb9-180" aria-hidden="true" tabindex="-1"></a> name<span class="op">=</span><span class="st">"bdb"</span>,</span> +<span id="cb9-181"><a href="#cb9-181" aria-hidden="true" tabindex="-1"></a> coords<span class="op">=</span>{</span> +<span id="cb9-182"><a href="#cb9-182" aria-hidden="true" tabindex="-1"></a> <span class="st">"player_season"</span>: measurements[<span class="st">"player_season"</span>].tolist(),</span> +<span id="cb9-183"><a href="#cb9-183" aria-hidden="true" tabindex="-1"></a> <span class="st">"season"</span>: measurements[<span class="st">"season"</span>].tolist(),</span> +<span id="cb9-184"><a href="#cb9-184" aria-hidden="true" tabindex="-1"></a> },</span> +<span id="cb9-185"><a href="#cb9-185" aria-hidden="true" tabindex="-1"></a> measurements<span class="op">=</span>measurements,</span> +<span id="cb9-186"><a href="#cb9-186" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>I had to update the <code>prepare_data</code> function to take into account the two raw data sources:</p> +<div class="sourceCode" id="cb10" data-startfrom="33"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 32;"><span id="cb10-33"><a href="#cb10-33" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb10-34"><a href="#cb10-34" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> prepare_data():</span> +<span id="cb10-35"><a href="#cb10-35" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Run main function."""</span></span> +<span id="cb10-36"><a href="#cb10-36" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="st">"Reading raw data..."</span>)</span> +<span id="cb10-37"><a href="#cb10-37" aria-hidden="true" tabindex="-1"></a> raw_data <span class="op">=</span> {</span> +<span id="cb10-38"><a href="#cb10-38" aria-hidden="true" tabindex="-1"></a> k: [pd.read_csv(<span class="bu">file</span>, index_col<span class="op">=</span><span class="va">None</span>) <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> v]</span> +<span id="cb10-39"><a href="#cb10-39" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> k, v <span class="kw">in</span> RAW_DATA_FILES.items()</span> +<span id="cb10-40"><a href="#cb10-40" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb10-41"><a href="#cb10-41" aria-hidden="true" tabindex="-1"></a> data_preparation_functions_to_run <span class="op">=</span> {</span> +<span id="cb10-42"><a href="#cb10-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"2006"</span>: prepare_data_2006,</span> +<span id="cb10-43"><a href="#cb10-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"bdb"</span>: prepare_data_bdb,</span> +<span id="cb10-44"><a href="#cb10-44" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb10-45"><a href="#cb10-45" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="st">"Preparing data..."</span>)</span> +<span id="cb10-46"><a href="#cb10-46" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> name, dpf <span class="kw">in</span> data_preparation_functions_to_run.items():</span> +<span id="cb10-47"><a href="#cb10-47" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="ss">f"Running data preparation function </span><span class="sc">{</span>dpf<span class="sc">.</span><span class="va">__name__</span><span class="sc">}</span><span class="ss">..."</span>)</span> +<span id="cb10-48"><a href="#cb10-48" aria-hidden="true" tabindex="-1"></a> prepared_data <span class="op">=</span> dpf(<span class="op">*</span>raw_data[name])</span> +<span id="cb10-49"><a href="#cb10-49" aria-hidden="true" tabindex="-1"></a> output_dir <span class="op">=</span> os.path.join(PREPARED_DIR, prepared_data.name)</span> +<span id="cb10-50"><a href="#cb10-50" aria-hidden="true" tabindex="-1"></a> <span class="bu">print</span>(<span class="ss">f"</span><span class="ch">\t</span><span class="ss">writing files to </span><span class="sc">{</span>output_dir<span class="sc">}</span><span class="ss">"</span>)</span> +<span id="cb10-51"><a href="#cb10-51" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="kw">not</span> os.path.exists(PREPARED_DIR):</span> +<span id="cb10-52"><a href="#cb10-52" aria-hidden="true" tabindex="-1"></a> os.mkdir(PREPARED_DIR)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>To finish off I deleted the unused global variables <code>MEASUREMENTS_FILE</code>, <code>NEW_COLNAMES</code>, <code>DROPNA_COLS</code> and <code>DIMS</code>, then checked if the function <code>load_prepared_data</code> needed any changes: I was pretty sure it didn’t.</p> <div class="page-columns page-full"><p>To check that all this worked, I ran the data preparation script manually:<a href="#fn2" class="footnote-ref" id="fnref2" role="doc-noteref"><sup>2</sup></a></p><div class="no-row-height column-margin column-container"><li id="fn2"><p><sup>2</sup> I could also have just run <code>make analysis</code> again</p></li></div></div> -<div class="sourceCode" id="cb10"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="op">></span> source <span class="ex">.venv/bin/activate</span></span> -<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">baseball</span><span class="kw">)</span> <span class="op">></span> python <span class="ex">baseball/prepare_data.py</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb11"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="op">></span> source <span class="ex">.venv/bin/activate</span></span> +<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">baseball</span><span class="kw">)</span> <span class="op">></span> python <span class="ex">baseball/prepare_data.py</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Now the folder <code>data/prepared/bdb</code> contained a file <code>data/prepared/bdb/measurements.csv</code> that looked like this:</p> <pre class="{csv}"><code>,player_season,season,n_attempt,n_success 0,abreujo02,2018,553,180 @@ -3406,213 +3430,191 @@ <h1>Specifying statistical models</h1> \end{align*}\]</span></p> <p>Again I would choose priors for the hyperparameters that put most of the alphas between 0.1 and 0.4.</p> <p>To implement these models using Stan I first added the following function to the file <code>custom_functions.stan</code>. This was simply copied from <a href="https://mc-stan.org/users/documentation/case-studies/gpareto_functions.html#conclusion-on-the-data-analysis">the relevant part of the geomagnetic storms case study</a>.</p> -<div class="sourceCode" id="cb12"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="dt">real</span> gpareto_lpdf(<span class="dt">vector</span> y, <span class="dt">real</span> ymin, <span class="dt">real</span> k, <span class="dt">real</span> sigma) {</span> -<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> <span class="co">// generalised Pareto log pdf </span></span> -<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> N = rows(y);</span> -<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> inv_k = inv(k);</span> -<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (k<<span class="dv">0</span> && max(y-ymin)/sigma > -inv_k)</span> -<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"k<0 and max(y-ymin)/sigma > -1/k; found k, sigma ="</span>, k, <span class="st">", "</span>, sigma);</span> -<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (sigma<=<span class="dv">0</span>)</span> -<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"sigma<=0; found sigma ="</span>, sigma);</span> -<span id="cb12-9"><a href="#cb12-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (fabs(k) > <span class="fl">1e-15</span>)</span> -<span id="cb12-10"><a href="#cb12-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -(<span class="dv">1</span>+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);</span> -<span id="cb12-11"><a href="#cb12-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> -<span id="cb12-12"><a href="#cb12-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -sum(y-ymin)/sigma -N*log(sigma); <span class="co">// limit k->0</span></span> -<span id="cb12-13"><a href="#cb12-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb13"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="dt">real</span> gpareto_lpdf(<span class="dt">vector</span> y, <span class="dt">real</span> ymin, <span class="dt">real</span> k, <span class="dt">real</span> sigma) {</span> +<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="co">// generalised Pareto log pdf </span></span> +<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> N = rows(y);</span> +<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> inv_k = inv(k);</span> +<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (k<<span class="dv">0</span> && max(y-ymin)/sigma > -inv_k)</span> +<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"k<0 and max(y-ymin)/sigma > -1/k; found k, sigma ="</span>, k, <span class="st">", "</span>, sigma);</span> +<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (sigma<=<span class="dv">0</span>)</span> +<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">reject</span>(<span class="st">"sigma<=0; found sigma ="</span>, sigma);</span> +<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (fabs(k) > <span class="fl">1e-15</span>)</span> +<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -(<span class="dv">1</span>+inv_k)*sum(log1p((y-ymin) * (k/sigma))) -N*log(sigma);</span> +<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> +<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> -sum(y-ymin)/sigma -N*log(sigma); <span class="co">// limit k->0</span></span> +<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Next I wrote a file <code>gpareto.stan</code>:</p> -<div class="sourceCode" id="cb13"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="kw">functions</span> {</span> -<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">#include custom_functions.stan</span></span> -<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> -<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> -<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> -<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> -<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> min_alpha; <span class="co">// noone worse than this would be in the dataset</span></span> -<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> max_alpha;</span> -<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_sigma;</span> -<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_k;</span> -<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> -<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> -<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="fl">0.001</span>> sigma; <span class="co">// scale parameter of generalised pareto distribution</span></span> -<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=-sigma/(max_alpha-min_alpha)> k; <span class="co">// shape parameter of generalised pareto distribution</span></span> -<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span><<span class="kw">lower</span>=min_alpha,<span class="kw">upper</span>=max_alpha>[N] alpha; <span class="co">// success log-odds</span></span> -<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> -<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a> sigma ~ normal(prior_sigma[<span class="dv">1</span>], prior_sigma[<span class="dv">2</span>]);</span> -<span id="cb13-21"><a href="#cb13-21" aria-hidden="true" tabindex="-1"></a> k ~ normal(prior_k[<span class="dv">1</span>], prior_k[<span class="dv">2</span>]);</span> -<span id="cb13-22"><a href="#cb13-22" aria-hidden="true" tabindex="-1"></a> alpha ~ gpareto(min_alpha, k, sigma);</span> -<span id="cb13-23"><a href="#cb13-23" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> -<span id="cb13-24"><a href="#cb13-24" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, alpha);</span> -<span id="cb13-25"><a href="#cb13-25" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb13-26"><a href="#cb13-26" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb13-27"><a href="#cb13-27" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> -<span id="cb13-28"><a href="#cb13-28" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> -<span id="cb13-29"><a href="#cb13-29" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> -<span id="cb13-30"><a href="#cb13-30" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> -<span id="cb13-31"><a href="#cb13-31" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> -<span id="cb13-32"><a href="#cb13-32" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> -<span id="cb13-33"><a href="#cb13-33" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb13-34"><a href="#cb13-34" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Finally I wrote a file <code>normal.stan</code>:</p> -<div class="sourceCode" id="cb14"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> -<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> -<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> -<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> -<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_mu;</span> -<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_tau;</span> -<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_b_K;</span> -<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> -<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="kw">transformed data</span> {</span> -<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K = log(to_vector(K));</span> -<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);</span> +<div class="sourceCode" id="cb14"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="kw">functions</span> {</span> +<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="co">#include custom_functions.stan</span></span> +<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> +<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> +<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> +<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> +<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> min_alpha; <span class="co">// noone worse than this would be in the dataset</span></span> +<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> max_alpha;</span> +<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_sigma;</span> +<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_k;</span> +<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> <span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a>}</span> <span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> -<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> mu; <span class="co">// population mean of success log-odds</span></span> -<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="dv">0</span>> tau; <span class="co">// population sd of success log-odds</span></span> -<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> b_K;</span> -<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha_std; <span class="co">// success log-odds (standardized)</span></span> -<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> -<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a> b_K ~ normal(prior_b_K[<span class="dv">1</span>], prior_b_K[<span class="dv">2</span>]);</span> -<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a> mu ~ normal(prior_mu[<span class="dv">1</span>], prior_mu[<span class="dv">2</span>]);</span> -<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a> tau ~ normal(prior_tau[<span class="dv">1</span>], prior_tau[<span class="dv">2</span>]);</span> -<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a> alpha_std ~ normal(<span class="dv">0</span>, <span class="dv">1</span>);</span> -<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> -<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);</span> -<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a>}</span> -<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> -<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha = mu + b_K * log_K_std + tau * alpha_std;</span> -<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> -<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> -<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> -<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> -<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> -<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb14-37"><a href="#cb14-37" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="fl">0.001</span>> sigma; <span class="co">// scale parameter of generalised pareto distribution</span></span> +<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=-sigma/(max_alpha-min_alpha)> k; <span class="co">// shape parameter of generalised pareto distribution</span></span> +<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span><<span class="kw">lower</span>=min_alpha,<span class="kw">upper</span>=max_alpha>[N] alpha; <span class="co">// success log-odds</span></span> +<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> +<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a> sigma ~ normal(prior_sigma[<span class="dv">1</span>], prior_sigma[<span class="dv">2</span>]);</span> +<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a> k ~ normal(prior_k[<span class="dv">1</span>], prior_k[<span class="dv">2</span>]);</span> +<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a> alpha ~ gpareto(min_alpha, k, sigma);</span> +<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> +<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, alpha);</span> +<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> +<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> +<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> +<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> +<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> +<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> +<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Finally I wrote a file <code>normal.stan</code>:</p> +<div class="sourceCode" id="cb15"><pre class="sourceCode stan code-with-copy"><code class="sourceCode stan"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="kw">data</span> {</span> +<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> N; <span class="co">// items</span></span> +<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> K; <span class="co">// trials</span></span> +<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[N] <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>> y; <span class="co">// successes</span></span> +<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_mu;</span> +<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_tau;</span> +<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">array</span>[<span class="dv">2</span>] <span class="dt">real</span> prior_b_K;</span> +<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><<span class="kw">lower</span>=<span class="dv">0</span>,<span class="kw">upper</span>=<span class="dv">1</span>> likelihood;</span> +<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="kw">transformed data</span> {</span> +<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K = log(to_vector(K));</span> +<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] log_K_std = (log_K - mean(log_K)) / sd(log_K);</span> +<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a><span class="kw">parameters</span> {</span> +<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> mu; <span class="co">// population mean of success log-odds</span></span> +<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span><<span class="kw">lower</span>=<span class="dv">0</span>> tau; <span class="co">// population sd of success log-odds</span></span> +<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">real</span> b_K;</span> +<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha_std; <span class="co">// success log-odds (standardized)</span></span> +<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a><span class="kw">model</span> {</span> +<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a> b_K ~ normal(prior_b_K[<span class="dv">1</span>], prior_b_K[<span class="dv">2</span>]);</span> +<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a> mu ~ normal(prior_mu[<span class="dv">1</span>], prior_mu[<span class="dv">2</span>]);</span> +<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a> tau ~ normal(prior_tau[<span class="dv">1</span>], prior_tau[<span class="dv">2</span>]);</span> +<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a> alpha_std ~ normal(<span class="dv">0</span>, <span class="dv">1</span>);</span> +<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (likelihood){</span> +<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a> y ~ binomial_logit(K, mu + b_K * log_K_std + tau * alpha_std);</span> +<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb15-28"><a href="#cb15-28" aria-hidden="true" tabindex="-1"></a>}</span> +<span id="cb15-29"><a href="#cb15-29" aria-hidden="true" tabindex="-1"></a><span class="kw">generated quantities</span> {</span> +<span id="cb15-30"><a href="#cb15-30" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] alpha = mu + b_K * log_K_std + tau * alpha_std;</span> +<span id="cb15-31"><a href="#cb15-31" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] yrep;</span> +<span id="cb15-32"><a href="#cb15-32" aria-hidden="true" tabindex="-1"></a> <span class="dt">vector</span>[N] llik;</span> +<span id="cb15-33"><a href="#cb15-33" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> (n <span class="cf">in</span> <span class="dv">1</span>:N){</span> +<span id="cb15-34"><a href="#cb15-34" aria-hidden="true" tabindex="-1"></a> yrep[n] = binomial_rng(K[n], inv_logit(alpha[n]));</span> +<span id="cb15-35"><a href="#cb15-35" aria-hidden="true" tabindex="-1"></a> llik[n] = binomial_logit_lpmf(y[n] | K[n], alpha[n]);</span> +<span id="cb15-36"><a href="#cb15-36" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb15-37"><a href="#cb15-37" aria-hidden="true" tabindex="-1"></a>}</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> -<section id="generating-stan-inputs" class="level1"> +<section id="generating-stan-inputs" class="level1 page-columns page-full"> <h1>Generating Stan inputs</h1> <p>Next I needed to tell the analysis how to turn some prepared data into a dictionary that can be used as input for Stan. Bibat assumes that this task is handled by functions that live in the file <code>baseball/stan_input_functions.py</code>, each of which takes in a <code>PreparedData</code> and returns a Python dictionary. You can write as many Stan input functions as you like and choose which one to run for any given inference.</p> -<p>I started by defining some Stan input functions that pass arbitary prepared data on to each of the models:</p> -<div class="sourceCode" id="cb15" data-startfrom="11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 10;"><span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> -<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> -<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> -<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> -<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> -<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_mu"</span>: [logit(<span class="fl">0.25</span>), <span class="fl">0.2</span>],</span> -<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_tau"</span>: [<span class="fl">0.2</span>, <span class="fl">0.1</span>],</span> -<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_b_K"</span>: [<span class="dv">0</span>, <span class="fl">0.03</span>],</span> -<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> -<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> -<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> -<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> -<span id="cb15-28"><a href="#cb15-28" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> -<span id="cb15-29"><a href="#cb15-29" aria-hidden="true" tabindex="-1"></a> <span class="st">"min_alpha"</span>: logit(<span class="fl">0.07</span>),</span> -<span id="cb15-30"><a href="#cb15-30" aria-hidden="true" tabindex="-1"></a> <span class="st">"max_alpha"</span>: logit(<span class="fl">0.5</span>),</span> -<span id="cb15-31"><a href="#cb15-31" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_sigma"</span>: [<span class="fl">1.5</span>, <span class="fl">0.4</span>],</span> -<span id="cb15-32"><a href="#cb15-32" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_k"</span>: [<span class="op">-</span><span class="fl">0.5</span>, <span class="dv">1</span>],</span> -<span id="cb15-33"><a href="#cb15-33" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb15-34"><a href="#cb15-34" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="page-columns page-full"><p>I started by defining some Stan input functions that pass arbitary prepared data on to each of the models:<a href="#fn3" class="footnote-ref" id="fnref3" role="doc-noteref"><sup>3</sup></a></p><div class="no-row-height column-margin column-container"><li id="fn3"><p><sup>3</sup> Note that this code uses the scipy function <code>logit</code>, which it imported like this: <code>from scipy.special import logit</code></p></li></div></div> +<div class="sourceCode" id="cb16" data-startfrom="11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 10;"><span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> +<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> +<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> +<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> +<span id="cb16-16"><a href="#cb16-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> +<span id="cb16-17"><a href="#cb16-17" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_mu"</span>: [logit(<span class="fl">0.25</span>), <span class="fl">0.2</span>],</span> +<span id="cb16-18"><a href="#cb16-18" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_tau"</span>: [<span class="fl">0.2</span>, <span class="fl">0.1</span>],</span> +<span id="cb16-19"><a href="#cb16-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_b_K"</span>: [<span class="dv">0</span>, <span class="fl">0.03</span>],</span> +<span id="cb16-20"><a href="#cb16-20" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb16-21"><a href="#cb16-21" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb16-22"><a href="#cb16-22" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb16-23"><a href="#cb16-23" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb16-24"><a href="#cb16-24" aria-hidden="true" tabindex="-1"></a> <span class="co">"""General function for creating a Stan input."""</span></span> +<span id="cb16-25"><a href="#cb16-25" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span> +<span id="cb16-26"><a href="#cb16-26" aria-hidden="true" tabindex="-1"></a> <span class="st">"N"</span>: <span class="bu">len</span>(ppd.measurements),</span> +<span id="cb16-27"><a href="#cb16-27" aria-hidden="true" tabindex="-1"></a> <span class="st">"K"</span>: ppd.measurements[<span class="st">"n_attempt"</span>].values,</span> +<span id="cb16-28"><a href="#cb16-28" aria-hidden="true" tabindex="-1"></a> <span class="st">"y"</span>: ppd.measurements[<span class="st">"n_success"</span>].values,</span> +<span id="cb16-29"><a href="#cb16-29" aria-hidden="true" tabindex="-1"></a> <span class="st">"min_alpha"</span>: logit(<span class="fl">0.07</span>),</span> +<span id="cb16-30"><a href="#cb16-30" aria-hidden="true" tabindex="-1"></a> <span class="st">"max_alpha"</span>: logit(<span class="fl">0.5</span>),</span> +<span id="cb16-31"><a href="#cb16-31" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_sigma"</span>: [<span class="fl">1.5</span>, <span class="fl">0.4</span>],</span> +<span id="cb16-32"><a href="#cb16-32" aria-hidden="true" tabindex="-1"></a> <span class="st">"prior_k"</span>: [<span class="op">-</span><span class="fl">0.5</span>, <span class="dv">1</span>],</span> +<span id="cb16-33"><a href="#cb16-33" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb16-34"><a href="#cb16-34" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>But why stop there? It can also be useful to generate Stan input data from a model with a model, by running it in simulation model with hardcoded parameter values. Here are some functions that do this for both of our models:</p> -<div class="sourceCode" id="cb16" data-startfrom="36"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 35;"><span id="cb16-36"><a href="#cb16-36" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb16-37"><a href="#cb16-37" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the normal model."""</span></span> -<span id="cb16-38"><a href="#cb16-38" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> -<span id="cb16-39"><a href="#cb16-39" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> -<span id="cb16-40"><a href="#cb16-40" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {</span> -<span id="cb16-41"><a href="#cb16-41" aria-hidden="true" tabindex="-1"></a> <span class="st">"mu"</span>: logit(<span class="fl">0.25</span>),</span> -<span id="cb16-42"><a href="#cb16-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"tau"</span>: <span class="fl">0.18</span>, <span class="co"># 2sds is 0.19 to 0.32 batting average</span></span> -<span id="cb16-43"><a href="#cb16-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"b_K"</span>: <span class="fl">0.04</span>, <span class="co"># slight effect of more attempts</span></span> -<span id="cb16-44"><a href="#cb16-44" aria-hidden="true" tabindex="-1"></a> <span class="st">"alpha_std"</span>: rng.random.normal(loc<span class="op">=</span><span class="dv">0</span>, scale<span class="op">=</span><span class="dv">1</span>, size<span class="op">=</span>N),</span> -<span id="cb16-45"><a href="#cb16-45" aria-hidden="true" tabindex="-1"></a> }</span> -<span id="cb16-46"><a href="#cb16-46" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> -<span id="cb16-47"><a href="#cb16-47" aria-hidden="true" tabindex="-1"></a> log_K_std <span class="op">=</span> (np.log(K) <span class="op">-</span> np.log(K).mean()) <span class="op">/</span> np.log(K).std()</span> -<span id="cb16-48"><a href="#cb16-48" aria-hidden="true" tabindex="-1"></a> alpha <span class="op">=</span> (</span> -<span id="cb16-49"><a href="#cb16-49" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"mu"</span>]</span> -<span id="cb16-50"><a href="#cb16-50" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"b_K"</span>] <span class="op">*</span> log_K_std</span> -<span id="cb16-51"><a href="#cb16-51" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"tau"</span>] <span class="op">*</span> true_param_values[<span class="st">"alpha_std"</span>]</span> -<span id="cb16-52"><a href="#cb16-52" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb16-53"><a href="#cb16-53" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.random.binomial(K, expit(alpha))</span> -<span id="cb16-54"><a href="#cb16-54" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y} <span class="op">|</span> true_param_values</span> -<span id="cb16-55"><a href="#cb16-55" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-56"><a href="#cb16-56" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-57"><a href="#cb16-57" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> -<span id="cb16-58"><a href="#cb16-58" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the gpareto model."""</span></span> -<span id="cb16-59"><a href="#cb16-59" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> -<span id="cb16-60"><a href="#cb16-60" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> -<span id="cb16-61"><a href="#cb16-61" aria-hidden="true" tabindex="-1"></a> min_alpha <span class="op">=</span> <span class="fl">0.1</span></span> -<span id="cb16-62"><a href="#cb16-62" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> -<span id="cb16-63"><a href="#cb16-63" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {<span class="st">"sigma"</span>: <span class="op">-</span><span class="fl">1.098</span>, <span class="st">"k"</span>: <span class="fl">0.18</span>}</span> -<span id="cb16-64"><a href="#cb16-64" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"alpha"</span>] <span class="op">=</span> gpareto_rvs(</span> -<span id="cb16-65"><a href="#cb16-65" aria-hidden="true" tabindex="-1"></a> rng,</span> -<span id="cb16-66"><a href="#cb16-66" aria-hidden="true" tabindex="-1"></a> N,</span> -<span id="cb16-67"><a href="#cb16-67" aria-hidden="true" tabindex="-1"></a> min_alpha,</span> -<span id="cb16-68"><a href="#cb16-68" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"k"</span>],</span> -<span id="cb16-69"><a href="#cb16-69" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"sigma"</span>],</span> -<span id="cb16-70"><a href="#cb16-70" aria-hidden="true" tabindex="-1"></a> )</span> -<span id="cb16-71"><a href="#cb16-71" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.binomial(K, expit(true_param_values[<span class="st">"alpha"</span>]))</span> -<span id="cb16-72"><a href="#cb16-72" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y, <span class="st">"min_alpha"</span>: min_alpha} <span class="op">|</span> true_param_values</span> -<span id="cb16-73"><a href="#cb16-73" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-74"><a href="#cb16-74" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-75"><a href="#cb16-75" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gpareto_rvs(</span> -<span id="cb16-76"><a href="#cb16-76" aria-hidden="true" tabindex="-1"></a> rng: np.random.Generator, size: <span class="bu">int</span>, mu: <span class="bu">float</span>, k: <span class="bu">float</span>, sigma: <span class="bu">float</span></span> -<span id="cb16-77"><a href="#cb16-77" aria-hidden="true" tabindex="-1"></a>):</span> -<span id="cb16-78"><a href="#cb16-78" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate random numbers from a generalised pareto distribution.</span></span> -<span id="cb16-79"><a href="#cb16-79" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-80"><a href="#cb16-80" aria-hidden="true" tabindex="-1"></a><span class="co"> See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for</span></span> -<span id="cb16-81"><a href="#cb16-81" aria-hidden="true" tabindex="-1"></a><span class="co"> source.</span></span> -<span id="cb16-82"><a href="#cb16-82" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb16-83"><a href="#cb16-83" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> -<span id="cb16-84"><a href="#cb16-84" aria-hidden="true" tabindex="-1"></a> U <span class="op">=</span> rng.uniform(size)</span> -<span id="cb16-85"><a href="#cb16-85" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> k <span class="op">==</span> <span class="dv">0</span>:</span> -<span id="cb16-86"><a href="#cb16-86" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">-</span> sigma <span class="op">*</span> np.log(U)</span> -<span id="cb16-87"><a href="#cb16-87" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span>:</span> -<span id="cb16-88"><a href="#cb16-88" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">+</span> (sigma <span class="op">*</span> (U<span class="op">**-</span>k) <span class="op">-</span> <span class="dv">1</span>) <span class="op">/</span> sigma</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb17" data-startfrom="36"><pre class="sourceCode python code-with-copy"><code class="sourceCode python" style="counter-reset: source-line 35;"><span id="cb17-36"><a href="#cb17-36" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_normal_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb17-37"><a href="#cb17-37" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the normal model."""</span></span> +<span id="cb17-38"><a href="#cb17-38" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> +<span id="cb17-39"><a href="#cb17-39" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> +<span id="cb17-40"><a href="#cb17-40" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {</span> +<span id="cb17-41"><a href="#cb17-41" aria-hidden="true" tabindex="-1"></a> <span class="st">"mu"</span>: logit(<span class="fl">0.25</span>),</span> +<span id="cb17-42"><a href="#cb17-42" aria-hidden="true" tabindex="-1"></a> <span class="st">"tau"</span>: <span class="fl">0.18</span>, <span class="co"># 2sds is 0.19 to 0.32 batting average</span></span> +<span id="cb17-43"><a href="#cb17-43" aria-hidden="true" tabindex="-1"></a> <span class="st">"b_K"</span>: <span class="fl">0.04</span>, <span class="co"># slight effect of more attempts</span></span> +<span id="cb17-44"><a href="#cb17-44" aria-hidden="true" tabindex="-1"></a> <span class="st">"alpha_std"</span>: rng.random.normal(loc<span class="op">=</span><span class="dv">0</span>, scale<span class="op">=</span><span class="dv">1</span>, size<span class="op">=</span>N),</span> +<span id="cb17-45"><a href="#cb17-45" aria-hidden="true" tabindex="-1"></a> }</span> +<span id="cb17-46"><a href="#cb17-46" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> +<span id="cb17-47"><a href="#cb17-47" aria-hidden="true" tabindex="-1"></a> log_K_std <span class="op">=</span> (np.log(K) <span class="op">-</span> np.log(K).mean()) <span class="op">/</span> np.log(K).std()</span> +<span id="cb17-48"><a href="#cb17-48" aria-hidden="true" tabindex="-1"></a> alpha <span class="op">=</span> (</span> +<span id="cb17-49"><a href="#cb17-49" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"mu"</span>]</span> +<span id="cb17-50"><a href="#cb17-50" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"b_K"</span>] <span class="op">*</span> log_K_std</span> +<span id="cb17-51"><a href="#cb17-51" aria-hidden="true" tabindex="-1"></a> <span class="op">+</span> true_param_values[<span class="st">"tau"</span>] <span class="op">*</span> true_param_values[<span class="st">"alpha_std"</span>]</span> +<span id="cb17-52"><a href="#cb17-52" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb17-53"><a href="#cb17-53" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.random.binomial(K, expit(alpha))</span> +<span id="cb17-54"><a href="#cb17-54" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y} <span class="op">|</span> true_param_values</span> +<span id="cb17-55"><a href="#cb17-55" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-56"><a href="#cb17-56" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-57"><a href="#cb17-57" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> get_stan_input_gpareto_fake(ppd: PreparedData) <span class="op">-></span> Dict:</span> +<span id="cb17-58"><a href="#cb17-58" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate fake Stan input consistent with the gpareto model."""</span></span> +<span id="cb17-59"><a href="#cb17-59" aria-hidden="true" tabindex="-1"></a> N <span class="op">=</span> <span class="bu">len</span>(ppd.measurements)</span> +<span id="cb17-60"><a href="#cb17-60" aria-hidden="true" tabindex="-1"></a> K <span class="op">=</span> ppd.measurements[<span class="st">"n_attempt"</span>].values</span> +<span id="cb17-61"><a href="#cb17-61" aria-hidden="true" tabindex="-1"></a> min_alpha <span class="op">=</span> <span class="fl">0.1</span></span> +<span id="cb17-62"><a href="#cb17-62" aria-hidden="true" tabindex="-1"></a> rng <span class="op">=</span> np.random.default_rng(seed<span class="op">=</span><span class="dv">1234</span>)</span> +<span id="cb17-63"><a href="#cb17-63" aria-hidden="true" tabindex="-1"></a> true_param_values <span class="op">=</span> {<span class="st">"sigma"</span>: <span class="op">-</span><span class="fl">1.098</span>, <span class="st">"k"</span>: <span class="fl">0.18</span>}</span> +<span id="cb17-64"><a href="#cb17-64" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"alpha"</span>] <span class="op">=</span> gpareto_rvs(</span> +<span id="cb17-65"><a href="#cb17-65" aria-hidden="true" tabindex="-1"></a> rng,</span> +<span id="cb17-66"><a href="#cb17-66" aria-hidden="true" tabindex="-1"></a> N,</span> +<span id="cb17-67"><a href="#cb17-67" aria-hidden="true" tabindex="-1"></a> min_alpha,</span> +<span id="cb17-68"><a href="#cb17-68" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"k"</span>],</span> +<span id="cb17-69"><a href="#cb17-69" aria-hidden="true" tabindex="-1"></a> true_param_values[<span class="st">"sigma"</span>],</span> +<span id="cb17-70"><a href="#cb17-70" aria-hidden="true" tabindex="-1"></a> )</span> +<span id="cb17-71"><a href="#cb17-71" aria-hidden="true" tabindex="-1"></a> y <span class="op">=</span> rng.binomial(K, expit(true_param_values[<span class="st">"alpha"</span>]))</span> +<span id="cb17-72"><a href="#cb17-72" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {<span class="st">"N"</span>: N, <span class="st">"K"</span>: K, <span class="st">"y"</span>: y, <span class="st">"min_alpha"</span>: min_alpha} <span class="op">|</span> true_param_values</span> +<span id="cb17-73"><a href="#cb17-73" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-74"><a href="#cb17-74" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-75"><a href="#cb17-75" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gpareto_rvs(</span> +<span id="cb17-76"><a href="#cb17-76" aria-hidden="true" tabindex="-1"></a> rng: np.random.Generator, size: <span class="bu">int</span>, mu: <span class="bu">float</span>, k: <span class="bu">float</span>, sigma: <span class="bu">float</span></span> +<span id="cb17-77"><a href="#cb17-77" aria-hidden="true" tabindex="-1"></a>):</span> +<span id="cb17-78"><a href="#cb17-78" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Generate random numbers from a generalised pareto distribution.</span></span> +<span id="cb17-79"><a href="#cb17-79" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-80"><a href="#cb17-80" aria-hidden="true" tabindex="-1"></a><span class="co"> See https://en.wikipedia.org/wiki/Generalized_Pareto_distribution for</span></span> +<span id="cb17-81"><a href="#cb17-81" aria-hidden="true" tabindex="-1"></a><span class="co"> source.</span></span> +<span id="cb17-82"><a href="#cb17-82" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb17-83"><a href="#cb17-83" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span> +<span id="cb17-84"><a href="#cb17-84" aria-hidden="true" tabindex="-1"></a> U <span class="op">=</span> rng.uniform(size)</span> +<span id="cb17-85"><a href="#cb17-85" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> k <span class="op">==</span> <span class="dv">0</span>:</span> +<span id="cb17-86"><a href="#cb17-86" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">-</span> sigma <span class="op">*</span> np.log(U)</span> +<span id="cb17-87"><a href="#cb17-87" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span>:</span> +<span id="cb17-88"><a href="#cb17-88" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> mu <span class="op">+</span> (sigma <span class="op">*</span> (U<span class="op">**-</span>k) <span class="op">-</span> <span class="dv">1</span>) <span class="op">/</span> sigma</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> <section id="specifying-inferences" class="level1"> <h1>Specifying inferences</h1> <p>Now all the building blocks for making statistical inferences - raw data, data preparation rules, statistical models and recipes for turning prepared data into model inputs - were in place. The last step before actually running Stan was to write down how put the blocks together. Bibat has another concept for this, called ‘inferences’.</p> <p>An inference in bibat is a folder containing a special file called <code>config.toml</code>. This file sets out what inferences you want to make: which statistical model, which prepared data function, which Stan input function, which parameters have which dimensions, which sampling modes to use and how to configure the sampler. The folder will later be filled up with the results of performing the specified inferences.</p> <p>I started by deleting the example inferences and creating two fresh folders, leaving me with an <code>inferences</code> folder looking like this:</p> -<div class="sourceCode" id="cb17"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> -<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> -<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── config.toml</span> -<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> -<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> config.toml</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb18"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> +<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> +<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── config.toml</span> +<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> +<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> config.toml</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>Here is the file <code>inferences/gpareto2006/config.toml</code>:</p> -<div class="sourceCode" id="cb18"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gpareto2006"</span></span> -<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> -<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> -<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> -<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> -<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> -<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> -<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> -<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> -<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> -<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> -<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb18-17"><a href="#cb18-17" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb18-18"><a href="#cb18-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> -<span id="cb18-19"><a href="#cb18-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> -<span id="cb18-20"><a href="#cb18-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">1000</span></span> -<span id="cb18-21"><a href="#cb18-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Here is the file <code>inferences/normal2006/config.toml</code>:</p> -<div class="sourceCode" id="cb19"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"normal2006"</span></span> -<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"normal.stan"</span></span> +<div class="sourceCode" id="cb19"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gpareto2006"</span></span> +<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> <span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> -<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_normal"</span></span> +<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> <span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> <span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a></span> <span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> @@ -3624,21 +3626,50 @@ <h1>Specifying inferences</h1> <span id="cb19-13"><a href="#cb19-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> <span id="cb19-14"><a href="#cb19-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> <span id="cb19-15"><a href="#cb19-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> -<p>Note that: * The Stan file, prepared data folder and stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value. * Both inferences are set to run in “prior” and “posterior” modes - the other pre-defined mode is “kfold”, but you can also write your own! * You can enter arbitrary arguments to cmdstanpy’s <code>CmdStanModel.sample</code> method in the <code>[sample_kwargs]</code> table. * You can enter mode-specific overrides in <code>[sample_kwargs.<MODE>]</code>. This can be handy if you want to run more or fewer iterations for a certain mode.</p> +<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb19-17"><a href="#cb19-17" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb19-18"><a href="#cb19-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> +<span id="cb19-19"><a href="#cb19-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> +<span id="cb19-20"><a href="#cb19-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">1000</span></span> +<span id="cb19-21"><a href="#cb19-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Here is the file <code>inferences/normal2006/config.toml</code>:</p> +<div class="sourceCode" id="cb20"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"normal2006"</span></span> +<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"normal.stan"</span></span> +<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"2006"</span></span> +<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_normal"</span></span> +<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> +<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> +<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> +<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> +<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> +<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> +<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> +<span id="cb20-15"><a href="#cb20-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb20-16"><a href="#cb20-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<p>Note that:</p> +<ul> +<li>The Stan file, prepared data folder and stan input function are referred to by strings. The analysis should raise an error if you enter a non-existing value.</li> +<li>Both inferences are set to run in “prior” and “posterior” modes - the other pre-defined mode is “kfold”, but you can also write your own!</li> +<li>You can enter arbitrary arguments to cmdstanpy’s <code>CmdStanModel.sample</code> method in the <code>[sample_kwargs]</code> table.</li> +<li>You can enter mode-specific overrides in <code>[sample_kwargs.<MODE>]</code>. This can be handy if you want to run more or fewer iterations for a certain mode.</li> +</ul> <p>Now when I ran <code>make analysis</code> again, I saw messages indicating that Stan had run, and found that the <code>inferences</code> subfolders had been populated:</p> -<div class="sourceCode" id="cb20"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> -<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> -<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> ├── config.toml</span> -<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── idata.json</span> -<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> -<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a> <span class="ex">├──</span> config.toml</span> -<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> idata.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb21"><pre class="sourceCode zsh code-with-copy"><code class="sourceCode zsh"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="ex">inferences</span></span> +<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="ex">├──</span> gpareto2006</span> +<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> ├── config.toml</span> +<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a><span class="ex">│ </span> └── idata.json</span> +<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="ex">└──</span> normal2006</span> +<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a> <span class="ex">├──</span> config.toml</span> +<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a> <span class="ex">└──</span> idata.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> </section> <section id="investigating-the-inferences" class="level1"> <h1>Investigating the inferences</h1> -<p>Now that the inferences are ready it’s time to check them out. Bibat provides a jupyter notebook at <code>baseball/investigate.ipynb</code> for exactly this purpose.</p> -<p>A lot of code from the example analysis’s notebook was reusable, so I largely followed its structure, with a few tweaks.</p> +<p>Now that the inferences are ready it’s time to check them out. Bibat provides a Jupyter notebook at <code>baseball/investigate.ipynb</code> for exactly this purpose. The notebook’s main job is to create plots and save them in the <code>plots</code> directory when it is executed with the command <code>jupyter execute investigate.ipynb</code>, which is the final step in the chain of commands that is triggered by <code>make analysis</code>.</p> +<p>A notebook is arguably a nicer home for code that creates plots than a plain python script because it allows for literate documentation and an iterative workflow. A notebook makes it easy to, for example, add some code to change the scale of a plot, execute the code and see the new results, then update the relevant documentation all in the same place.</p> +<p>The code from the example analysis’s notebook for loading <code>InferenceData</code> was reusable with a few tweaks to avoid missing file errors, so I kept it. On the other hand, I wanted to make some different plots from the ones in the example analysis, including some that required loading prepared data. To check out everything I did, see <a href="https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/baseball/investigate.ipynb">here</a>.</p> </section> <section id="choosing-priors-using-push-forward-calibration" class="level1"> <h1>Choosing priors using push-forward calibration</h1> @@ -3648,31 +3679,35 @@ <h1>Choosing priors using push-forward calibration</h1> <section id="extending-the-analysis-to-the-baseballdatabank-data" class="level1"> <h1>Extending the analysis to the baseballdatabank data</h1> <p>To model the more recent data, all I had to do was create some new inference folders with appropriate <code>prepared_data_dir</code> fields in their <code>config.toml</code> files. For example, here is the <code>config.toml</code> file for the <code>gparetobdb</code> inference:</p> -<div class="sourceCode" id="cb21"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gparetobdb"</span></span> -<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> -<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"bdb"</span></span> -<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> -<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> -<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> -<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> -<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> -<span id="cb21-11"><a href="#cb21-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> -<span id="cb21-12"><a href="#cb21-12" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-13"><a href="#cb21-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> -<span id="cb21-14"><a href="#cb21-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> -<span id="cb21-15"><a href="#cb21-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-16"><a href="#cb21-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-17"><a href="#cb21-17" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb21-18"><a href="#cb21-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> -<span id="cb21-19"><a href="#cb21-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> -<span id="cb21-20"><a href="#cb21-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> -<span id="cb21-21"><a href="#cb21-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> +<div class="sourceCode" id="cb22"><pre class="sourceCode toml code-with-copy"><code class="sourceCode toml"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="dt">name</span> <span class="op">=</span> <span class="st">"gparetobdb"</span></span> +<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_file</span> <span class="op">=</span> <span class="st">"gpareto.stan"</span></span> +<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a><span class="dt">prepared_data_dir</span> <span class="op">=</span> <span class="st">"bdb"</span></span> +<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a><span class="dt">stan_input_function</span> <span class="op">=</span> <span class="st">"get_stan_input_gpareto"</span></span> +<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a><span class="dt">modes</span> <span class="op">=</span> <span class="op">[</span><span class="st">"prior"</span><span class="op">,</span> <span class="st">"posterior"</span><span class="op">]</span></span> +<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">dims</span><span class="kw">]</span></span> +<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a><span class="dt">alpha</span> <span class="op">=</span> <span class="op">[</span><span class="st">"player"</span><span class="op">]</span></span> +<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">stanc_options</span><span class="kw">]</span></span> +<span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a><span class="dt">warn-pedantic</span> <span class="op">=</span> <span class="cn">true</span></span> +<span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-13"><a href="#cb22-13" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">]</span></span> +<span id="cb22-14"><a href="#cb22-14" aria-hidden="true" tabindex="-1"></a><span class="dt">save_warmup</span> <span class="op">=</span> <span class="cn">false</span></span> +<span id="cb22-15"><a href="#cb22-15" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-16"><a href="#cb22-16" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-17"><a href="#cb22-17" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb22-18"><a href="#cb22-18" aria-hidden="true" tabindex="-1"></a><span class="kw">[</span><span class="dt">sample_kwargs</span><span class="kw">.</span><span class="dt">prior</span><span class="kw">]</span></span> +<span id="cb22-19"><a href="#cb22-19" aria-hidden="true" tabindex="-1"></a><span class="dt">chains</span> <span class="op">=</span> <span class="dv">2</span></span> +<span id="cb22-20"><a href="#cb22-20" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_warmup</span> <span class="op">=</span> <span class="dv">2000</span></span> +<span id="cb22-21"><a href="#cb22-21" aria-hidden="true" tabindex="-1"></a><span class="dt">iter_sampling</span> <span class="op">=</span> <span class="dv">1000</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div> <p>After running <code>make analysis</code> one more time, I went back to the notebook <code>baseball/investigate.ipynb</code> and made plots of both models’ posterior 1%-99% success rate intervals for both datasets:</p> <p><img src="" class="img-fluid"></p> <p>I think this is very interesting. Both models’ prior distributions had similar regularisation levels, and they more or less agree about the abilities of the players with the most at-bats, both in terms of locations and the widths of the plausible intervals for the true success rate. However, the models ended up with dramatically different certainty levels about the abilities of players with few at-bats. This pattern was true both for the small 2006 dataset and the much larger baseballdatabank dataset.</p> </section> +<section id="documenting-the-analysis" class="level1"> +<h1>Documenting the analysis</h1> +<p>The final step was to document my analysis. To do this I edited the file <code>docs/report.qmd</code>, then ran <code>quarto render docs/report.qmd</code>, which produced the very html document that you are probably reading now! You can find the complete <code>report.qmd</code> file <a href="https://github.com/teddygroves/bibat/blob/main/bibat/examples/baseball/docs/report.qmd">here</a>.</p> +</section> </main> @@ -3696,7 +3731,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> toggleBodyColorMode(bsSheetEl); } } - toggleBodyColorPrimary(); + toggleBodyColorPrimary(); const icon = ""; const anchorJS = new window.AnchorJS(); anchorJS.options = { @@ -3706,7 +3741,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> anchorJS.add('.anchored'); const isCodeAnnotation = (el) => { for (const clz of el.classList) { - if (clz.startsWith('code-annotation-')) { + if (clz.startsWith('code-annotation-')) { return true; } } @@ -3737,11 +3772,11 @@ <h1>Extending the analysis to the baseballdatabank data</h1> button.setAttribute("data-bs-toggle", "tooltip"); button.setAttribute("data-bs-placement", "left"); button.setAttribute("data-bs-title", "Copied!"); - tooltip = new bootstrap.Tooltip(button, - { trigger: "manual", + tooltip = new bootstrap.Tooltip(button, + { trigger: "manual", customClass: "code-copy-button-tooltip", offset: [0, -8]}); - tooltip.show(); + tooltip.show(); } setTimeout(function() { if (tooltip) { @@ -3771,7 +3806,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> theme: 'quarto', placement: 'bottom-start' }; - window.tippy(el, config); + window.tippy(el, config); } const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]'); for (var i=0; i<noterefs.length; i++) { @@ -3816,7 +3851,7 @@ <h1>Extending the analysis to the baseballdatabank data</h1> height = bottom - top; } if (top !== null && height !== null && parent !== null) { - // cook up a div (if necessary) and position it + // cook up a div (if necessary) and position it let div = window.document.getElementById("code-annotation-line-highlight"); if (div === null) { div = window.document.createElement("div"); @@ -3914,4 +3949,4 @@ <h1>Extending the analysis to the baseballdatabank data</h1> -</body></html> \ No newline at end of file +</body></html> From 854e6ad8abf97aae7df676500da41e3ea02cd6b0 Mon Sep 17 00:00:00 2001 From: teddygroves <groves.teddy@gmail.com> Date: Tue, 23 May 2023 13:35:18 +0200 Subject: [PATCH 4/5] Fix isort first party package identification --- bibat/examples/baseball/baseball/data_preparation.py | 1 + bibat/examples/baseball/baseball/inference_configuration.py | 3 ++- bibat/examples/baseball/baseball/sample.py | 3 ++- bibat/examples/baseball/baseball/stan_input_functions.py | 3 ++- pyproject.toml | 2 ++ 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bibat/examples/baseball/baseball/data_preparation.py b/bibat/examples/baseball/baseball/data_preparation.py index d427d57..fe3b8d0 100644 --- a/bibat/examples/baseball/baseball/data_preparation.py +++ b/bibat/examples/baseball/baseball/data_preparation.py @@ -16,6 +16,7 @@ NAME_FILE = "name.txt" COORDS_FILE = "coords.json" +MEASUREMENTS_FILE = "measurements.csv" N_CV_FOLDS = 10 HERE = os.path.dirname(__file__) DATA_DIR = os.path.join(HERE, "..", "data") diff --git a/bibat/examples/baseball/baseball/inference_configuration.py b/bibat/examples/baseball/baseball/inference_configuration.py index 9088d73..7380db7 100644 --- a/bibat/examples/baseball/baseball/inference_configuration.py +++ b/bibat/examples/baseball/baseball/inference_configuration.py @@ -4,9 +4,10 @@ from typing import Callable, Dict, List, Optional import toml -from baseball import stan_input_functions from pydantic import BaseModel, Field, root_validator, validator +from baseball import stan_input_functions + AVAILABLE_MODES = ["prior", "posterior", "kfold"] HERE = os.path.dirname(os.path.abspath(__file__)) STAN_DIR = os.path.join(HERE, "stan") diff --git a/bibat/examples/baseball/baseball/sample.py b/bibat/examples/baseball/baseball/sample.py index 0f4cc64..36c07b6 100644 --- a/bibat/examples/baseball/baseball/sample.py +++ b/bibat/examples/baseball/baseball/sample.py @@ -7,6 +7,8 @@ import cmdstanpy import numpy as np import xarray as xr +from sklearn.model_selection import KFold + from baseball.data_preparation import load_prepared_data from baseball.inference_configuration import ( AVAILABLE_MODES, @@ -14,7 +16,6 @@ load_inference_configuration, ) from baseball.util import CoordDict -from sklearn.model_selection import KFold HERE = os.path.dirname(__file__) RUNS_DIR = os.path.join(HERE, "..", "inferences") diff --git a/bibat/examples/baseball/baseball/stan_input_functions.py b/bibat/examples/baseball/baseball/stan_input_functions.py index 74a8b4c..7d4d9c0 100644 --- a/bibat/examples/baseball/baseball/stan_input_functions.py +++ b/bibat/examples/baseball/baseball/stan_input_functions.py @@ -4,9 +4,10 @@ from typing import Dict import numpy as np -from baseball.data_preparation import PreparedData from scipy.special import expit, logit +from baseball.data_preparation import PreparedData + def get_stan_input_normal(ppd: PreparedData) -> Dict: """General function for creating a Stan input.""" diff --git a/pyproject.toml b/pyproject.toml index 9fe6b04..f346a8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,8 @@ extend-exclude = ''' ''' [tool.isort] +auto_identify_namespace_packages = true +known_first_party = ["bibat", "baseball"] multi_line_output = 3 include_trailing_comma = true force_grid_wrap = 0 From 2e454d1c97c737c318ace18f6d60c06ed687a58d Mon Sep 17 00:00:00 2001 From: teddygroves <groves.teddy@gmail.com> Date: Tue, 23 May 2023 14:00:59 +0200 Subject: [PATCH 5/5] bump version --- docs/conf.py | 2 +- pyproject.toml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f2d4bc3..a5e5165 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ author = "Teddy Groves" # The full version, including alpha/beta/rc tags -release = "0.1.4" +release = "0.1.5" root_doc = "index" diff --git a/pyproject.toml b/pyproject.toml index f346a8e..4d4a55c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "bibat" authors = [ {name = "Teddy Groves", email = "tedgro@dtu.dk"}, ] -version = "0.1.4" +version = "0.1.5" classifiers = [ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Science/Research", @@ -75,7 +75,6 @@ extend-exclude = ''' ''' [tool.isort] -auto_identify_namespace_packages = true known_first_party = ["bibat", "baseball"] multi_line_output = 3 include_trailing_comma = true