diff --git a/posthog/hogql_queries/experiments/funnels_statistics_v2.py b/posthog/hogql_queries/experiments/funnels_statistics_v2.py index 02f18d2f70740..73d23f9924a08 100644 --- a/posthog/hogql_queries/experiments/funnels_statistics_v2.py +++ b/posthog/hogql_queries/experiments/funnels_statistics_v2.py @@ -21,28 +21,41 @@ def calculate_probabilities_v2( for funnel conversion rates. This function computes the probability that each variant is the best (i.e., has the highest - conversion rate) compared to all other variants, including the control. It uses samples - drawn from the posterior Beta distributions of each variant's conversion rate. + conversion rate) compared to all other variants, including the control. It uses a Beta + distribution as the "conjugate prior" for binomial (success/failure) data, and starts with + Beta(1,1) as a minimally informative prior distribution. The "conjugate prior" means that + the prior and posterior distributions are the same family, and the posterior is easy + to compute. Parameters: ----------- control : ExperimentVariantFunnelsBaseStats - Statistics for the control group, including success and failure counts + Statistics for the control group, containing success_count and failure_count variants : list[ExperimentVariantFunnelsBaseStats] List of statistics for test variants to compare against the control Returns: -------- list[float] - A list of probabilities where: + A list of probabilities that sum to 1, where: - The first element is the probability that the control variant is the best - Subsequent elements are the probabilities that each test variant is the best Notes: ------ - - Uses a Bayesian approach with Beta distributions as the posterior - - Uses Beta(1,1) as the prior, which is uniform over [0,1] - - Draws 10,000 samples from each variant's posterior distribution + - Uses a Bayesian approach with Beta distributions as conjugate prior for binomial data + - Uses Beta(1,1) as minimally informative prior (uniform over [0,1]) + - Draws SAMPLE_SIZE (10,000) samples from each variant's posterior distribution + - Calculates win probability as frequency of samples where variant is maximum + + Example: + -------- + >>> from posthog.schema import ExperimentVariantFunnelsBaseStats + >>> from posthog.hogql_queries.experiments.funnels_statistics_v2 import calculate_probabilities_v2 + >>> control = ExperimentVariantFunnelsBaseStats(key="control", success_count=100, failure_count=900) + >>> test = ExperimentVariantFunnelsBaseStats(key="test", success_count=150, failure_count=850) + >>> calculate_probabilities_v2(control, [test]) + >>> # Returns: [0.001, 0.999] indicating the test variant is very likely to be best """ all_variants = [control, *variants] @@ -179,27 +192,40 @@ def calculate_credible_intervals_v2(variants: list[ExperimentVariantFunnelsBaseS Calculate Bayesian credible intervals for conversion rates of each variant. This function computes the 95% credible intervals for the true conversion rate - of each variant, representing the range where we believe the true rate lies - with 95% probability. + of each variant using a Beta model. The interval represents the range where we + believe the true conversion rate lies with 95% probability. 
Parameters: ----------- variants : list[ExperimentVariantFunnelsBaseStats] - List of all variants including control, containing success and failure counts + List of all variants (including control), each containing success_count and failure_count Returns: -------- dict[str, list[float]] Dictionary mapping variant keys to [lower, upper] credible intervals, where: - - lower is the 2.5th percentile of the posterior distribution - - upper is the 97.5th percentile of the posterior distribution + - lower is the 2.5th percentile of the Beta posterior distribution + - upper is the 97.5th percentile of the Beta posterior distribution + - intervals represent conversion rates between 0 and 1 Notes: ------ - - Uses Beta distribution as the posterior - - Uses Beta(1,1) as the prior, which is uniform over [0,1] - - Returns 95% credible intervals - - Intervals become narrower with larger sample sizes + - Uses Beta distribution as conjugate prior for binomial data + - Uses Beta(1,1) as minimally informative prior (uniform over [0,1]) + - Computes 95% credible intervals (2.5th to 97.5th percentiles) + - Intervals become narrower with more data (larger success_count + failure_count) + - Returns empty dict if any calculations fail + + Example: + -------- + >>> from posthog.schema import ExperimentVariantFunnelsBaseStats + >>> from posthog.hogql_queries.experiments.funnels_statistics_v2 import calculate_credible_intervals_v2 + >>> variants = [ + ... ExperimentVariantFunnelsBaseStats(key="control", success_count=100, failure_count=900), + ... ExperimentVariantFunnelsBaseStats(key="test", success_count=150, failure_count=850) + ... ] + >>> calculate_credible_intervals_v2(variants) + >>> # Returns: {"control": [0.083, 0.120], "test": [0.129, 0.173]} """ intervals = {} diff --git a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py index 2206ff92b9305..986d6ed449fbc 100644 --- a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py +++ b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py @@ -11,6 +11,7 @@ calculate_credible_intervals, ) from posthog.test.base import APIBaseTest +from flaky import flaky def create_variant( @@ -45,6 +46,7 @@ def run_test_for_both_implementations(self, test_fn): calculate_credible_intervals=calculate_credible_intervals_v2, ) + @flaky(max_runs=3, min_passes=1) def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" @@ -58,16 +60,16 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertAlmostEqual(probabilities[0], 0.15, delta=0.1) - self.assertAlmostEqual(probabilities[1], 0.85, delta=0.1) + self.assertAlmostEqual(probabilities[0], 0.149, delta=0.05) + self.assertAlmostEqual(probabilities[1], 0.850, delta=0.05) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Check credible intervals - self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) - self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) - self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) - self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + self.assertAlmostEqual(intervals["control"][0], 0.055, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.174, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.093, places=2) + 
self.assertAlmostEqual(intervals["test"][1], 0.233, places=2) else: # Original implementation behavior self.assertTrue(0.1 < probabilities[0] < 0.5) @@ -76,13 +78,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Original implementation intervals - self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) - self.assertAlmostEqual(intervals["control"][1], 0.20, delta=0.05) - self.assertAlmostEqual(intervals["test"][0], 0.08, delta=0.05) - self.assertAlmostEqual(intervals["test"][1], 0.25, delta=0.05) + self.assertAlmostEqual(intervals["control"][0], 0.055, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.174, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.093, places=2) + self.assertAlmostEqual(intervals["test"][1], 0.233, places=2) self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_large_sample_two_variants_significant(self): """Test with large sample size, two variants, clear winner""" @@ -102,10 +105,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Check credible intervals - self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) - self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) - self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) - self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01) + self.assertAlmostEqual(intervals["control"][0], 0.095, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.105, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.145, places=2) + self.assertAlmostEqual(intervals["test"][1], 0.155, places=2) else: # Original implementation behavior self.assertTrue(probabilities[1] > 0.5) # Test variant winning @@ -114,13 +117,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertLess(p_value, 0.05) # Original implementation intervals - self.assertAlmostEqual(intervals["control"][0], 0.095, delta=0.01) - self.assertAlmostEqual(intervals["control"][1], 0.105, delta=0.01) - self.assertAlmostEqual(intervals["test"][0], 0.145, delta=0.01) - self.assertAlmostEqual(intervals["test"][1], 0.155, delta=0.01) + self.assertAlmostEqual(intervals["control"][0], 0.095, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.105, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.145, places=2) + self.assertAlmostEqual(intervals["test"][1], 0.155, places=2) self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_many_variants_not_significant(self): """Test with multiple variants, no clear winner""" @@ -142,14 +146,14 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Check credible intervals overlap # Check credible intervals for control and all test variants - self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["control"][0], 0.0829, places=2) + 
self.assertAlmostEqual(intervals["control"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_a"][0], 0.0829, places=2) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_b"][0], 0.0829, places=2) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_c"][0], 0.0829, places=2) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, places=2) else: # Original implementation behavior self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -158,17 +162,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Check credible intervals overlap # Check credible intervals for control and all test variants - self.assertAlmostEqual(intervals["control"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_a"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_b"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.02) - self.assertAlmostEqual(intervals["test_c"][0], 0.09, delta=0.02) - self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.02) + self.assertAlmostEqual(intervals["control"][0], 0.081, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_a"][0], 0.081, places=2) + self.assertAlmostEqual(intervals["test_a"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_b"][0], 0.081, places=2) + self.assertAlmostEqual(intervals["test_b"][1], 0.12, places=2) + self.assertAlmostEqual(intervals["test_c"][0], 0.081, places=2) + self.assertAlmostEqual(intervals["test_c"][1], 0.12, places=2) self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_insufficient_sample_size(self): """Test with sample size below threshold""" @@ -199,6 +204,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_minimal_difference(self): """Test expected loss when variants have very similar performance""" @@ -222,6 +228,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_test_variant_clear_winner(self): """Test expected loss when one variant is clearly better""" diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index a2b3c0a54fa7e..b8a4caf779f05 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -11,6 +11,7 @@ calculate_credible_intervals, ) from posthog.test.base import APIBaseTest +from flaky import flaky def create_variant(key: str, mean: float, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats: @@ -38,6 +39,7 @@ def run_test_for_both_implementations(self, test_fn): calculate_credible_intervals=calculate_credible_intervals_v2_continuous, ) + @flaky(max_runs=3, min_passes=1) def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" @@ -85,6 +87,7 @@ def 
run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_large_sample_two_variants_significant(self): """Test with large sample size, two variants, clear winner""" @@ -134,6 +137,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_large_sample_two_variants_strongly_significant(self): """Test with large sample size, two variants, very clear winner""" @@ -179,6 +183,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_many_variants_not_significant(self): """Test with multiple variants, no clear winner""" @@ -258,6 +263,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_many_variants_significant(self): """Test with multiple variants, one clear winner""" @@ -327,6 +333,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_insufficient_sample_size(self): """Test with sample size below threshold""" @@ -373,6 +380,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_edge_cases_zero_means(self): """Test edge cases like zero means""" @@ -420,6 +428,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_edge_cases_near_zero_means(self): """Test edge cases like near-zero means""" @@ -475,6 +484,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_minimal_difference(self): """Test expected loss when variants have very similar performance""" @@ -504,6 +514,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_test_variant_clear_winner(self): """Test expected loss when one variant is clearly better""" diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index ee5cf1502492f..8ee2a1a16f9c0 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -11,6 +11,7 @@ calculate_credible_intervals, ) from posthog.test.base import APIBaseTest +from flaky import flaky def create_variant(key: str, count: int, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats: @@ -48,6 +49,7 @@ def run_test_for_both_implementations(self, test_fn): calculate_credible_intervals=calculate_credible_intervals_v2_count, ) + @flaky(max_runs=3, min_passes=1) def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" @@ -82,6 +84,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca 
self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_large_sample_two_variants_significant(self): """Test with large sample size, two variants, clear winner""" @@ -119,6 +122,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_large_sample_two_variants_strongly_significant(self): """Test with large sample size, two variants, very clear winner""" @@ -156,6 +160,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_many_variants_not_significant(self): """Test with multiple variants, no clear winner""" @@ -208,6 +213,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_many_variants_significant(self): """Test with multiple variants, one clear winner""" @@ -268,6 +274,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_real_world_data_1(self): """Test with multiple variants, one clear winner""" @@ -286,24 +293,30 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) self.assertEqual(len(probabilities), 2) - self.assertAlmostEqual(probabilities[1], 0.966, places=2) # test should be winning - self.assertAlmostEqual(probabilities[0], 0.034, places=2) # control should be losing if stats_version == 2: + self.assertAlmostEqual(probabilities[1], 0.966, delta=0.05) + self.assertAlmostEqual(probabilities[0], 0.034, delta=0.05) self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertLess(p_value, 0.01) self.assertGreater(p_value, 0.0) + self.assertAlmostEqual(intervals["control"][0], 0.094, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.116, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.107, places=2) + self.assertAlmostEqual(intervals["test"][1], 0.134, places=2) else: + self.assertAlmostEqual(probabilities[1], 0.966, delta=0.05) + self.assertAlmostEqual(probabilities[0], 0.034, delta=0.05) self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE) - self.assertAlmostEqual(p_value, 0.07, delta=0.01) + self.assertAlmostEqual(p_value, 0.07, places=2) - self.assertAlmostEqual(intervals["control"][0], 0.094, delta=0.01) - self.assertAlmostEqual(intervals["control"][1], 0.116, delta=0.01) - - self.assertAlmostEqual(intervals["test"][0], 0.107, delta=0.01) - self.assertAlmostEqual(intervals["test"][1], 0.129, delta=0.01) + self.assertAlmostEqual(intervals["control"][0], 0.094, places=2) + self.assertAlmostEqual(intervals["control"][1], 0.116, places=2) + self.assertAlmostEqual(intervals["test"][0], 0.107, places=2) + self.assertAlmostEqual(intervals["test"][1], 0.134, places=2) self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_insufficient_sample_size(self): """Test with sample size below threshold""" @@ -341,6 +354,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def 
test_edge_cases(self): """Test edge cases like zero counts""" @@ -374,6 +388,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_minimal_difference(self): """Test expected loss when variants have very similar performance""" @@ -403,6 +418,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + @flaky(max_runs=3, min_passes=1) def test_expected_loss_test_variant_clear_winner(self): """Test expected loss when one variant is clearly better""" diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index c0894302143ec..e03a0bb3fcecf 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -26,31 +26,56 @@ def calculate_probabilities_v2_continuous( ) -> list[float]: """ Calculate the win probabilities for each variant in an experiment using Bayesian analysis - for continuous metrics (e.g., revenue). + for continuous metrics (e.g., revenue) with log-normal distribution assumptions. - This function computes the probability that each variant is the best (i.e., has the highest - mean value) compared to all other variants, including the control. It uses samples - drawn from the posterior distributions of each variant's mean. + This function computes the probability that each variant is the best by comparing its + posterior distribution against all other variants. It uses a Normal-Inverse-Gamma prior + and performs analysis in log-space to handle right-skewed distributions typical of + metrics like revenue. Parameters: ----------- control_variant : ExperimentVariantTrendsBaseStats - Statistics for the control group, including mean value and exposure (number of users) + Statistics for the control group, containing the mean value (in count field) + and exposure (number of users) test_variants : list[ExperimentVariantTrendsBaseStats] List of statistics for test variants to compare against the control Returns: -------- list[float] - A list of probabilities where: - - The first element is the probability that the control variant is the best - - Subsequent elements are the probabilities that each test variant is the best + A list of probabilities where each element represents the probability that the + corresponding variant is the best (has highest mean value) among all variants: + - index 0: probability control variant is best + - index i>0: probability test variant i-1 is best + All probabilities sum to 1.0 Notes: ------ - - Uses a Bayesian approach with a t-distribution as the posterior - - Assumes a Normal-Inverse-Gamma prior - - Log-transforms the data to handle typical revenue distributions + - Uses log-transformation of data to handle right-skewed distributions + - Employs a Normal-Inverse-Gamma prior with parameters: + MU_0=0.0, KAPPA_0=1.0, ALPHA_0=1.0, BETA_0=1.0 + - Assumes constant variance in log-space (LOG_VARIANCE=0.75) + - Draws SAMPLE_SIZE=10000 samples from each posterior for probability estimation + + Example: + -------- + >>> from posthog.schema import ExperimentVariantTrendsBaseStats + >>> from posthog.hogql_queries.experiments.trends_statistics_v2_continuous import calculate_probabilities_v2_continuous + >>> control = ExperimentVariantTrendsBaseStats( + ... 
key="control", + ... count=50, # mean revenue per user + ... exposure=1.0, # exposure relative to control + ... absolute_exposure=500 # number of users + ... ) + >>> test = ExperimentVariantTrendsBaseStats( + ... key="test", + ... count=60, # mean revenue per user + ... exposure=1, # exposure relative to control + ... absolute_exposure=500 # number of users + ... ) + >>> calculate_probabilities_v2_continuous(control, [test]) + >>> # Returns: [0.0004, 0.9996] indicating the test variant is very likely to be best """ if len(test_variants) >= 10: raise ValidationError("Can't calculate experiment results for more than 10 variants", code="too_much_data") @@ -161,12 +186,19 @@ def are_results_significant_v2_continuous( def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, upper_bound=0.975): """ - Calculate Bayesian credible intervals for each variant's mean value. + Calculate Bayesian credible intervals for each variant's mean value using a log-normal model. + + This function computes credible intervals in log-space using a t-distribution posterior + derived from a Normal-Inverse-Gamma prior, then transforms the results back to the original + scale. This approach is particularly suitable for right-skewed metrics like revenue. Parameters: ----------- variants : list[ExperimentVariantTrendsBaseStats] - List of variants containing mean values and exposure data + List of variants where each variant contains: + - count: the mean value of the metric + - absolute_exposure: number of users/observations + - key: identifier for the variant lower_bound : float, optional (default=0.025) Lower percentile for the credible interval (2.5% for 95% CI) upper_bound : float, optional (default=0.975) @@ -175,7 +207,44 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe Returns: -------- dict[str, tuple[float, float]] - Dictionary mapping variant keys to their credible intervals + Dictionary mapping variant keys to their credible intervals where: + - Key: variant identifier + - Value: tuple of (lower_bound, upper_bound) in original scale + Returns empty dict if any calculation errors occur + + Notes: + ------ + - Uses log-transformation to handle right-skewed distributions + - Employs Normal-Inverse-Gamma prior with parameters: + MU_0=0.0, KAPPA_0=1.0, ALPHA_0=1.0, BETA_0=1.0 + - Assumes constant variance in log-space (LOG_VARIANCE=0.75) + - Results are transformed back to original scale and guaranteed non-negative + - Handles potential calculation errors gracefully by returning empty dict + + Example: + -------- + >>> from posthog.schema import ExperimentVariantTrendsBaseStats + >>> from posthog.hogql_queries.experiments.trends_statistics_v2_continuous import calculate_credible_intervals_v2_continuous + >>> variants = [ + ... ExperimentVariantTrendsBaseStats( + ... key="control", + ... count=50.0, # mean revenue per user + ... exposure=1.0, # exposure relative to control + ... absolute_exposure=500 # number of users + ... ), + ... ExperimentVariantTrendsBaseStats( + ... key="test", + ... count=60.0, # mean revenue per user + ... exposure=1, # exposure relative to control + ... absolute_exposure=500 # number of users + ... ) + ... 
] + >>> calculate_credible_intervals_v2_continuous(variants) + >>> # Returns something like: + >>> # { + >>> # 'control': (45.98, 53.53), # 95% confident true mean is between $45.98-$53.53 + >>> # 'test': (55.15, 64.22) # 95% confident true mean is between $55.15-$64.22 + >>> # } """ intervals = {} diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py index 208747a14c1a1..38bf41ac0ca1d 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py @@ -23,35 +23,38 @@ def calculate_probabilities_v2_count( Calculate the win probabilities for each variant in an experiment using Bayesian analysis. This function computes the probability that each variant is the best (i.e., has the highest - conversion rate) compared to all other variants, including the control. It uses samples - drawn from the posterior distributions of each variant's conversion rate. + rate) compared to all other variants, including the control. It uses a Gamma-Poisson model + where samples are drawn from the posterior Gamma distributions of each variant's rate. Parameters: ----------- control_variant : ExperimentVariantTrendsBaseStats - Statistics for the control group, including count (successes) and exposure (total trials) + Statistics for the control group, including count (events) and absolute_exposure test_variants : list[ExperimentVariantTrendsBaseStats] List of statistics for test variants to compare against the control Returns: -------- list[float] - A list of probabilities where: + A list of probabilities that sum to 1, where: - The first element is the probability that the control variant is the best - Subsequent elements are the probabilities that each test variant is the best Notes: ------ - - Uses a Bayesian approach with a Beta distribution as the posterior - - Assumes a minimally informative prior (alpha=1, beta=1) + - Uses a Bayesian approach with a Gamma distribution as the posterior + - Assumes a minimally informative Gamma prior (alpha=1, beta=1) - Draws samples from the posterior to estimate win probabilities + - Suitable for count/rate data following a Poisson distribution Example: -------- - >>> control = ExperimentVariantTrendsBaseStats(key="control", count=100, exposure=1000, absolute_exposure=1000) - >>> test = ExperimentVariantTrendsBaseStats(key="test", count=120, exposure=1000, absolute_exposure=1000) - >>> probabilities = calculate_probabilities_v2(control, [test]) - >>> # Returns: [0.085, 0.915] indicating the test variant is more likely to be the best + >>> from posthog.schema import ExperimentVariantTrendsBaseStats + >>> from posthog.hogql_queries.experiments.trends_statistics_v2_count import calculate_probabilities_v2_count + >>> control = ExperimentVariantTrendsBaseStats(key="control", count=100, exposure=1, absolute_exposure=1000) + >>> test = ExperimentVariantTrendsBaseStats(key="test", count=120, exposure=1, absolute_exposure=1000) + >>> calculate_probabilities_v2_count(control, [test]) + >>> # Returns: [0.088, 0.912] indicating the test variant is more likely to be the best """ if len(test_variants) >= 10: raise ValidationError("Can't calculate experiment results for more than 10 variants", code="too_much_data") @@ -96,28 +99,38 @@ def are_results_significant_v2_count( control_variant: ExperimentVariantTrendsBaseStats, test_variants: list[ExperimentVariantTrendsBaseStats], probabilities: list[Probability], ) -> tuple[ExperimentSignificanceCode, Probability]: """ - Determines if experiment results are statistically
significant using Bayesian analysis. + Determines if experiment results are statistically significant. - This function evaluates the win probabilities of each variant to determine if any variant - is significantly better than the others. The method: - 1. Checks if sample sizes meet minimum threshold requirements - 2. Evaluates win probabilities from the posterior distributions - 3. Calculates expected loss for the winning variant + This function evaluates whether any variant can be confidently declared as best by: + 1. Checking if variants have sufficient exposure (minimum threshold) + 2. Evaluating if the highest win probability exceeds the significance threshold + 3. For the variant with highest rate, calculating expected loss compared to alternatives Parameters: ----------- control_variant : ExperimentVariantTrendsBaseStats - Statistics for the control group, including count and exposure data + Statistics for the control group, including count and absolute_exposure test_variants : list[ExperimentVariantTrendsBaseStats] List of statistics for test variants to compare against control probabilities : list[Probability] - List of win probabilities for each variant, as calculated by calculate_probabilities + Win probabilities for each variant (must sum to 1), as calculated by calculate_probabilities_v2_count Returns: -------- tuple[ExperimentSignificanceCode, Probability] - - ExperimentSignificanceCode indicating the significance status - - Expected loss value for significant results, 1.0 for non-significant results + - ExperimentSignificanceCode indicating result status: + * NOT_ENOUGH_EXPOSURE: if any variant has exposure below threshold + * LOW_WIN_PROBABILITY: if no variant exceeds probability threshold + * HIGH_LOSS: if expected loss is too high for best variant + * SIGNIFICANT: if a variant is confidently best + - Expected loss value (between 0 and 1) for significant results, 1.0 for non-significant results + + Notes: + ------ + - Uses FF_DISTRIBUTION_THRESHOLD for minimum exposure check + - Uses MIN_PROBABILITY_FOR_SIGNIFICANCE (default 0.9) for win probability threshold + - Uses EXPECTED_LOSS_SIGNIFICANCE_LEVEL for maximum acceptable expected loss + - Expected loss represents the expected rate difference between chosen variant and potential better alternatives """ # Check exposure thresholds for variant in test_variants: @@ -151,17 +164,16 @@ def are_results_significant_v2_count( def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bound=0.975): """ - Calculate Bayesian credible intervals for each variant's conversion rate. + Calculate Bayesian credible intervals for each variant's rate using a Gamma-Poisson model. - Credible intervals represent the range where we believe the true conversion rate lies - with a specified probability (default 95%). Unlike frequentist confidence intervals, - these have a direct probabilistic interpretation: "There is a 95% probability that - the true conversion rate lies within this interval." + Credible intervals represent the range where we believe the true rate lies + with a specified probability (default 95%). These intervals have a direct probabilistic + interpretation: "There is a 95% probability that the true rate lies within this interval." 
Parameters: ----------- variants : list[ExperimentVariantTrendsBaseStats] - List of variants containing count (successes) and exposure (total trials) data + List of variants containing count (number of events) and absolute_exposure data lower_bound : float, optional (default=0.025) Lower percentile for the credible interval (2.5% for 95% CI) upper_bound : float, optional (default=0.975) @@ -171,22 +183,25 @@ def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bou -------- dict[str, tuple[float, float]] Dictionary mapping variant keys to their credible intervals - Each interval is a tuple of (lower_bound, upper_bound) + Each interval is a tuple of (lower_bound, upper_bound) representing rates Notes: ------ - - Uses a Gamma distribution as the posterior distribution - - Assumes a minimally informative prior (alpha=1, beta=1) - - Intervals are calculated for visualization purposes, not for significance testing + - Uses a Gamma distribution as the posterior for the rate parameter + - Assumes a minimally informative Gamma prior (alpha=1, beta=1) + - Suitable for count/rate data following a Poisson distribution - Returns empty dict if any calculations fail + - Intervals represent rates (events per exposure) Example: -------- + >>> from posthog.schema import ExperimentVariantTrendsBaseStats + >>> from posthog.hogql_queries.experiments.trends_statistics_v2_count import calculate_credible_intervals_v2_count >>> variants = [ - ... ExperimentVariantTrendsBaseStats(key="control", count=100, exposure=1000, absolute_exposure=1000), - ... ExperimentVariantTrendsBaseStats(key="test", count=150, exposure=1000, absolute_exposure=1000) + ... ExperimentVariantTrendsBaseStats(key="control", count=100, exposure=1, absolute_exposure=1000), + ... ExperimentVariantTrendsBaseStats(key="test", count=150, exposure=1, absolute_exposure=1000) ... ] - >>> intervals = calculate_credible_intervals_v2(variants) + >>> calculate_credible_intervals_v2_count(variants) >>> # Returns: {"control": (0.082, 0.122), "test": (0.128, 0.176)} """ intervals = {}
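
Reviewer note on the examples quoted in the new docstrings: the numbers can be sanity-checked outside the module with a short standalone script. The sketch below is only an approximation of the documented approach, not the module code itself — the helper names (funnel_win_probabilities, funnel_credible_interval, count_rate_credible_interval) are made up for illustration, it assumes numpy and scipy are available, and it covers just the two conjugate cases described in this diff (Beta posteriors for funnel conversion rates, Gamma posteriors for Poisson count rates); the log-space continuous model is not reproduced here.

# Standalone sanity-check sketch. Assumptions: numpy/scipy installed; helper
# names are illustrative and do not exist in the PostHog codebase.
import numpy as np
from scipy import stats

SAMPLE_SIZE = 10_000  # matches the sample count quoted in the new docstrings
rng = np.random.default_rng(42)


def funnel_win_probabilities(successes: list[int], failures: list[int]) -> list[float]:
    # Beta(1,1) prior + binomial data -> Beta(1+s, 1+f) posterior per variant.
    # Win probability = share of joint draws in which a variant has the highest rate.
    samples = np.column_stack(
        [rng.beta(1 + s, 1 + f, SAMPLE_SIZE) for s, f in zip(successes, failures)]
    )
    wins = np.argmax(samples, axis=1)
    return [float(np.mean(wins == i)) for i in range(samples.shape[1])]


def funnel_credible_interval(successes: int, failures: int, level: float = 0.95) -> tuple[float, float]:
    # 95% credible interval = 2.5th to 97.5th percentile of the Beta posterior.
    posterior = stats.beta(1 + successes, 1 + failures)
    lower, upper = (1 - level) / 2, 1 - (1 - level) / 2
    return float(posterior.ppf(lower)), float(posterior.ppf(upper))


def count_rate_credible_interval(count: int, exposure: int, level: float = 0.95) -> tuple[float, float]:
    # Gamma(1,1) prior on the Poisson rate -> Gamma(1+count, 1+exposure) posterior,
    # expressed here via scipy's shape/scale parameterization.
    posterior = stats.gamma(a=1 + count, scale=1 / (1 + exposure))
    lower, upper = (1 - level) / 2, 1 - (1 - level) / 2
    return float(posterior.ppf(lower)), float(posterior.ppf(upper))


if __name__ == "__main__":
    # Funnel docstring example: control 100/900 vs test 150/850.
    print(funnel_win_probabilities([100, 150], [900, 850]))  # roughly [0.001, 0.999]
    print(funnel_credible_interval(100, 900))                # roughly (0.083, 0.119)
    print(funnel_credible_interval(150, 850))                # roughly (0.129, 0.173)
    # Count docstring example: 100 and 150 events over 1000 exposures.
    print(count_rate_credible_interval(100, 1000))           # roughly (0.082, 0.122)
    print(count_rate_credible_interval(150, 1000))           # roughly (0.128, 0.176)

Because the win probabilities are Monte Carlo estimates, repeated runs drift slightly; that is the motivation for the @flaky(max_runs=3, min_passes=1) decorators and the adjusted assertion tolerances in the updated tests.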