chore(experiments): Stats cleanup (#27151)
danielbachhuber authored Jan 6, 2025
1 parent 4a07f37 commit 335edf2
Showing 6 changed files with 249 additions and 105 deletions.
58 changes: 42 additions & 16 deletions posthog/hogql_queries/experiments/funnels_statistics_v2.py
@@ -21,28 +21,41 @@ def calculate_probabilities_v2(
for funnel conversion rates.
This function computes the probability that each variant is the best (i.e., has the highest
-conversion rate) compared to all other variants, including the control. It uses samples
-drawn from the posterior Beta distributions of each variant's conversion rate.
+conversion rate) compared to all other variants, including the control. It uses a Beta
+distribution as the "conjugate prior" for binomial (success/failure) data, and starts with
+Beta(1,1) as a minimally informative prior distribution. The "conjugate prior" means that
+the prior and posterior distributions are in the same family, so the posterior is easy
+to compute.
Parameters:
-----------
control : ExperimentVariantFunnelsBaseStats
-Statistics for the control group, including success and failure counts
+Statistics for the control group, containing success_count and failure_count
variants : list[ExperimentVariantFunnelsBaseStats]
List of statistics for test variants to compare against the control
Returns:
--------
list[float]
-A list of probabilities where:
+A list of probabilities that sum to 1, where:
- The first element is the probability that the control variant is the best
- Subsequent elements are the probabilities that each test variant is the best
Notes:
------
-- Uses a Bayesian approach with Beta distributions as the posterior
-- Uses Beta(1,1) as the prior, which is uniform over [0,1]
-- Draws 10,000 samples from each variant's posterior distribution
+- Uses a Bayesian approach with Beta distributions as conjugate prior for binomial data
+- Uses Beta(1,1) as minimally informative prior (uniform over [0,1])
+- Draws SAMPLE_SIZE (10,000) samples from each variant's posterior distribution
+- Calculates win probability as frequency of samples where variant is maximum
+Example:
+--------
+>>> from posthog.schema import ExperimentVariantFunnelsBaseStats
+>>> from posthog.hogql_queries.experiments.funnels_statistics_v2 import calculate_probabilities_v2
+>>> control = ExperimentVariantFunnelsBaseStats(key="control", success_count=100, failure_count=900)
+>>> test = ExperimentVariantFunnelsBaseStats(key="test", success_count=150, failure_count=850)
+>>> calculate_probabilities_v2(control, [test])
+>>> # Returns: [0.001, 0.999] indicating the test variant is very likely to be best
"""
all_variants = [control, *variants]

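The Notes in the docstring above describe a plain Monte Carlo procedure: with a Beta(1,1) prior, each variant's posterior is Beta(1 + success_count, 1 + failure_count); draw samples from every posterior and record how often each variant has the highest sampled conversion rate. The NumPy sketch below is illustrative only, not the code in this file; the win_probabilities helper, the fixed seed, and the hard-coded sample size are assumptions made for the example.

import numpy as np

SAMPLE_SIZE = 10_000  # illustrative; mirrors the 10,000 samples mentioned in the Notes

def win_probabilities(counts, seed=0):
    """Approximate P(variant is best) from (success_count, failure_count) pairs.

    With a Beta(1, 1) prior, each posterior is Beta(1 + successes, 1 + failures).
    """
    rng = np.random.default_rng(seed)
    # One column of posterior conversion-rate samples per variant.
    samples = np.column_stack([rng.beta(1 + s, 1 + f, size=SAMPLE_SIZE) for s, f in counts])
    # Index of the variant with the highest sampled rate, one winner per draw.
    winners = np.argmax(samples, axis=1)
    # Win probability = fraction of draws in which each variant was the maximum.
    return [float(np.mean(winners == i)) for i in range(len(counts))]

# Control converts ~10%, test ~15%; expect something like [0.00x, 0.99x].
print(win_probabilities([(100, 900), (150, 850)]))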
@@ -179,27 +192,40 @@ def calculate_credible_intervals_v2(variants: list[ExperimentVariantFunnelsBaseS
Calculate Bayesian credible intervals for conversion rates of each variant.
This function computes the 95% credible intervals for the true conversion rate
-of each variant, representing the range where we believe the true rate lies
-with 95% probability.
+of each variant using a Beta model. The interval represents the range where we
+believe the true conversion rate lies with 95% probability.
Parameters:
-----------
variants : list[ExperimentVariantFunnelsBaseStats]
-List of all variants including control, containing success and failure counts
+List of all variants (including control), each containing success_count and failure_count
Returns:
--------
dict[str, list[float]]
Dictionary mapping variant keys to [lower, upper] credible intervals, where:
-- lower is the 2.5th percentile of the posterior distribution
-- upper is the 97.5th percentile of the posterior distribution
+- lower is the 2.5th percentile of the Beta posterior distribution
+- upper is the 97.5th percentile of the Beta posterior distribution
+- intervals represent conversion rates between 0 and 1
Notes:
------
-- Uses Beta distribution as the posterior
-- Uses Beta(1,1) as the prior, which is uniform over [0,1]
-- Returns 95% credible intervals
-- Intervals become narrower with larger sample sizes
+- Uses Beta distribution as conjugate prior for binomial data
+- Uses Beta(1,1) as minimally informative prior (uniform over [0,1])
+- Computes 95% credible intervals (2.5th to 97.5th percentiles)
+- Intervals become narrower with more data (larger success_count + failure_count)
+- Returns empty dict if any calculations fail
+Example:
+--------
+>>> from posthog.schema import ExperimentVariantFunnelsBaseStats
+>>> from posthog.hogql_queries.experiments.funnels_statistics_v2 import calculate_credible_intervals_v2
+>>> variants = [
+...     ExperimentVariantFunnelsBaseStats(key="control", success_count=100, failure_count=900),
+...     ExperimentVariantFunnelsBaseStats(key="test", success_count=150, failure_count=850)
+... ]
+>>> calculate_credible_intervals_v2(variants)
+>>> # Returns: {"control": [0.083, 0.120], "test": [0.129, 0.173]}
"""
intervals = {}

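As a companion to the docstring above: under the Beta(1,1) prior the 95% credible interval has a closed form, namely the 2.5th and 97.5th percentiles of Beta(1 + success_count, 1 + failure_count). The SciPy sketch below is an illustration of that calculation, not the function in this file; the credible_interval helper is a name invented for the example, and it reproduces the approximate values shown in the docstring example.

from scipy import stats

def credible_interval(success_count, failure_count):
    """95% credible interval for a conversion rate under a Beta(1, 1) prior."""
    posterior = stats.beta(1 + success_count, 1 + failure_count)
    # Lower/upper bounds are the 2.5th and 97.5th percentiles of the Beta posterior.
    return [float(posterior.ppf(0.025)), float(posterior.ppf(0.975))]

print(credible_interval(100, 900))  # ~[0.083, 0.120], matching the "control" example
print(credible_interval(150, 850))  # ~[0.129, 0.173], matching the "test" example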
75 changes: 41 additions & 34 deletions posthog/hogql_queries/experiments/test/test_funnels_statistics.py
@@ -11,6 +11,7 @@
calculate_credible_intervals,
)
from posthog.test.base import APIBaseTest
+from flaky import flaky


def create_variant(
@@ -45,6 +46,7 @@ def run_test_for_both_implementations(self, test_fn):
calculate_credible_intervals=calculate_credible_intervals_v2,
)

+@flaky(max_runs=3, min_passes=1)
def test_small_sample_two_variants_not_significant(self):
"""Test with small sample size, two variants, no clear winner"""

@@ -58,16 +60,16 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.assertEqual(len(probabilities), 2)
if stats_version == 2:
-self.assertAlmostEqual(probabilities[0], 0.15, delta=0.1)
-self.assertAlmostEqual(probabilities[1], 0.85, delta=0.1)
+self.assertAlmostEqual(probabilities[0], 0.149, delta=0.05)
+self.assertAlmostEqual(probabilities[1], 0.850, delta=0.05)
self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
self.assertEqual(p_value, 1)

# Check credible intervals
-self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05)
-self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05)
-self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05)
-self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05)
+self.assertAlmostEqual(intervals["control"][0], 0.055, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.174, places=2)
+self.assertAlmostEqual(intervals["test"][0], 0.093, places=2)
+self.assertAlmostEqual(intervals["test"][1], 0.233, places=2)
else:
# Original implementation behavior
self.assertTrue(0.1 < probabilities[0] < 0.5)
@@ -76,13 +78,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
self.assertEqual(p_value, 1)

# Original implementation intervals
-self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05)
-self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05)
-self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05)
-self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05)
+self.assertAlmostEqual(intervals["control"][0], 0.055, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.174, places=2)
+self.assertAlmostEqual(intervals["test"][0], 0.093, places=2)
+self.assertAlmostEqual(intervals["test"][1], 0.233, places=2)

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_large_sample_two_variants_significant(self):
"""Test with large sample size, two variants, clear winner"""

@@ -102,10 +105,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
self.assertEqual(p_value, 0)

# Check credible intervals
-self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01)
-self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01)
-self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01)
-self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01)
+self.assertAlmostEqual(intervals["control"][0], 0.095, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.105, places=2)
+self.assertAlmostEqual(intervals["test"][0], 0.145, places=2)
+self.assertAlmostEqual(intervals["test"][1], 0.155, places=2)
else:
# Original implementation behavior
self.assertTrue(probabilities[1] > 0.5) # Test variant winning
@@ -114,13 +117,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
self.assertLess(p_value, 0.05)

# Original implementation intervals
-self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01)
-self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01)
-self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01)
-self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01)
+self.assertAlmostEqual(intervals["control"][0], 0.095, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.105, places=2)
+self.assertAlmostEqual(intervals["test"][0], 0.145, places=2)
+self.assertAlmostEqual(intervals["test"][1], 0.155, places=2)

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_many_variants_not_significant(self):
"""Test with multiple variants, no clear winner"""

@@ -142,14 +146,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

# Check credible intervals overlap
# Check credible intervals for control and all test variants
-self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02)
+self.assertAlmostEqual(intervals["control"][0], 0.0829, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_a"][0], 0.0829, places=2)
+self.assertAlmostEqual(intervals["test_a"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_b"][0], 0.0829, places=2)
+self.assertAlmostEqual(intervals["test_b"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_c"][0], 0.0829, places=2)
+self.assertAlmostEqual(intervals["test_c"][1], 0.12, places=2)
else:
# Original implementation behavior
self.assertTrue(all(0.1 < p < 0.9 for p in probabilities))
@@ -158,17 +162,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

# Check credible intervals overlap
# Check credible intervals for control and all test variants
-self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02)
-self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02)
-self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02)
+self.assertAlmostEqual(intervals["control"][0], 0.081, places=2)
+self.assertAlmostEqual(intervals["control"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_a"][0], 0.081, places=2)
+self.assertAlmostEqual(intervals["test_a"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_b"][0], 0.081, places=2)
+self.assertAlmostEqual(intervals["test_b"][1], 0.12, places=2)
+self.assertAlmostEqual(intervals["test_c"][0], 0.081, places=2)
+self.assertAlmostEqual(intervals["test_c"][1], 0.12, places=2)

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""

@@ -199,6 +204,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_expected_loss_minimal_difference(self):
"""Test expected loss when variants have very similar performance"""

@@ -222,6 +228,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_expected_loss_test_variant_clear_winner(self):
"""Test expected loss when one variant is clearly better"""

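A note on the assertion changes in the test file above: unittest's assertAlmostEqual(a, b, delta=d) passes when abs(a - b) <= d, while assertAlmostEqual(a, b, places=p) passes when round(a - b, p) == 0, which for places=2 is a tolerance of roughly plus or minus 0.005. Switching from delta=0.05 to places=2 therefore tightens the checks considerably, which is also why the expected values were refined and @flaky retries were added for the sampling-based v2 implementation. A standalone illustration of the two tolerance modes (plain unittest, not PostHog code):

import unittest

class ToleranceSemantics(unittest.TestCase):
    def test_delta_vs_places(self):
        # delta: absolute difference must be at most 0.05.
        self.assertAlmostEqual(0.149, 0.18, delta=0.05)  # passes; diff is 0.031
        # places=2: difference must round to 0 at two decimals (about +/-0.005).
        self.assertAlmostEqual(0.149, 0.152, places=2)   # passes; diff is 0.003
        with self.assertRaises(AssertionError):
            self.assertAlmostEqual(0.149, 0.18, places=2)  # fails; 0.031 rounds to 0.03

if __name__ == "__main__":
    unittest.main()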
@@ -11,6 +11,7 @@
calculate_credible_intervals,
)
from posthog.test.base import APIBaseTest
+from flaky import flaky


def create_variant(key: str, mean: float, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats:
@@ -38,6 +39,7 @@ def run_test_for_both_implementations(self, test_fn):
calculate_credible_intervals=calculate_credible_intervals_v2_continuous,
)

+@flaky(max_runs=3, min_passes=1)
def test_small_sample_two_variants_not_significant(self):
"""Test with small sample size, two variants, no clear winner"""

@@ -85,6 +87,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_large_sample_two_variants_significant(self):
"""Test with large sample size, two variants, clear winner"""

@@ -134,6 +137,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_large_sample_two_variants_strongly_significant(self):
"""Test with large sample size, two variants, very clear winner"""

@@ -179,6 +183,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_many_variants_not_significant(self):
"""Test with multiple variants, no clear winner"""

@@ -258,6 +263,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_many_variants_significant(self):
"""Test with multiple variants, one clear winner"""

@@ -327,6 +333,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""

@@ -373,6 +380,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_edge_cases_zero_means(self):
"""Test edge cases like zero means"""

@@ -420,6 +428,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_edge_cases_near_zero_means(self):
"""Test edge cases like near-zero means"""

@@ -475,6 +484,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_expected_loss_minimal_difference(self):
"""Test expected loss when variants have very similar performance"""

@@ -504,6 +514,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca

self.run_test_for_both_implementations(run_test)

+@flaky(max_runs=3, min_passes=1)
def test_expected_loss_test_variant_clear_winner(self):
"""Test expected loss when one variant is clearly better"""

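Finally, the @flaky(max_runs=3, min_passes=1) decorators added throughout both test files come from the flaky package (its reruns are handled by the test runner, e.g. the pytest plugin): a failing test is retried up to three times and counts as passing if any single run passes. That is a pragmatic fit here because the v2 statistics are Monte Carlo estimates, so an otherwise-correct assertion can occasionally fall just outside a tight tolerance. A self-contained example of the same pattern follows; the test itself is invented for illustration.

import random
import unittest

from flaky import flaky

class MonteCarloExample(unittest.TestCase):
    @flaky(max_runs=3, min_passes=1)  # retry up to 3 times; one passing run is enough
    def test_sample_mean_is_near_one_half(self):
        # Sampling-based check in the spirit of the v2 stats tests: the mean of
        # 10,000 uniform draws is usually within ~0.005 of 0.5, but not always.
        draws = [random.random() for _ in range(10_000)]
        self.assertAlmostEqual(sum(draws) / len(draws), 0.5, places=2)

An alternative would be to seed the random number generator for fully deterministic assertions; retrying instead keeps the tests sensitive to genuine regressions while tolerating rare sampling noise.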