From 53000ecdf458872abb2e98dd2cd409e67456a242 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Wed, 11 Dec 2024 05:07:46 -0800 Subject: [PATCH 01/34] Rename in prep for continuous implementation --- .../experiments/experiment_trends_query_runner.py | 14 +++++++------- ...atistics.py => test_trends_statistics_count.py} | 14 +++++++------- ...tistics_v2.py => trends_statistics_v2_count.py} | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) rename posthog/hogql_queries/experiments/test/{test_trends_statistics.py => test_trends_statistics_count.py} (97%) rename posthog/hogql_queries/experiments/{trends_statistics_v2.py => trends_statistics_v2_count.py} (98%) diff --git a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py index 6c47688be3563..824da6f5c70b6 100644 --- a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py +++ b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py @@ -9,10 +9,10 @@ calculate_credible_intervals, calculate_probabilities, ) -from posthog.hogql_queries.experiments.trends_statistics_v2 import ( - are_results_significant_v2, - calculate_credible_intervals_v2, - calculate_probabilities_v2, +from posthog.hogql_queries.experiments.trends_statistics_v2_count import ( + are_results_significant_v2_count, + calculate_credible_intervals_v2_count, + calculate_probabilities_v2_count, ) from posthog.hogql_queries.insights.trends.trends_query_runner import TrendsQueryRunner from posthog.hogql_queries.query_runner import QueryRunner @@ -315,9 +315,9 @@ def run(query_runner: TrendsQueryRunner, result_key: str, is_parallel: bool): # Statistical analysis control_variant, test_variants = self._get_variants_with_base_stats(count_result, exposure_result) if self.stats_version == 2: - probabilities = calculate_probabilities_v2(control_variant, test_variants) - significance_code, p_value = are_results_significant_v2(control_variant, test_variants, probabilities) - credible_intervals = calculate_credible_intervals_v2([control_variant, *test_variants]) + probabilities = calculate_probabilities_v2_count(control_variant, test_variants) + significance_code, p_value = are_results_significant_v2_count(control_variant, test_variants, probabilities) + credible_intervals = calculate_credible_intervals_v2_count([control_variant, *test_variants]) else: probabilities = calculate_probabilities(control_variant, test_variants) significance_code, p_value = are_results_significant(control_variant, test_variants, probabilities) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py similarity index 97% rename from posthog/hogql_queries/experiments/test/test_trends_statistics.py rename to posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index 239939e5e6f15..29577b818d24b 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -1,9 +1,9 @@ from posthog.hogql_queries.experiments import MIN_PROBABILITY_FOR_SIGNIFICANCE from posthog.schema import ExperimentVariantTrendsBaseStats, ExperimentSignificanceCode -from posthog.hogql_queries.experiments.trends_statistics_v2 import ( - calculate_probabilities_v2, - are_results_significant_v2, - calculate_credible_intervals_v2, +from posthog.hogql_queries.experiments.trends_statistics_v2_count import ( + 
calculate_probabilities_v2_count, + are_results_significant_v2_count, + calculate_credible_intervals_v2_count, ) from posthog.hogql_queries.experiments.trends_statistics import ( calculate_probabilities, @@ -41,9 +41,9 @@ def run_test_for_both_implementations(self, test_fn): # Run for v2 implementation test_fn( stats_version=2, - calculate_probabilities=calculate_probabilities_v2, - are_results_significant=are_results_significant_v2, - calculate_credible_intervals=calculate_credible_intervals_v2, + calculate_probabilities=calculate_probabilities_v2_count, + are_results_significant=are_results_significant_v2_count, + calculate_credible_intervals=calculate_credible_intervals_v2_count, ) def test_small_sample_two_variants_not_significant(self): diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py similarity index 98% rename from posthog/hogql_queries/experiments/trends_statistics_v2.py rename to posthog/hogql_queries/experiments/trends_statistics_v2_count.py index 441798ae20ebe..c0248152317fb 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py @@ -13,7 +13,7 @@ SAMPLE_SIZE = 10000 -def calculate_probabilities_v2( +def calculate_probabilities_v2_count( control_variant: ExperimentVariantTrendsBaseStats, test_variants: list[ExperimentVariantTrendsBaseStats] ) -> list[float]: """ @@ -87,7 +87,7 @@ def calculate_probabilities_v2( return probabilities -def are_results_significant_v2( +def are_results_significant_v2_count( control_variant: ExperimentVariantTrendsBaseStats, test_variants: list[ExperimentVariantTrendsBaseStats], probabilities: list[Probability], @@ -142,7 +142,7 @@ def are_results_significant_v2( return ExperimentSignificanceCode.SIGNIFICANT, 0.0 -def calculate_credible_intervals_v2(variants, lower_bound=0.025, upper_bound=0.975): +def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bound=0.975): """ Calculate Bayesian credible intervals for each variant's conversion rate. 
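
For reference, the renamed count functions are consumed in the same three-step sequence the query runner hunk above shows: win probabilities first, then the significance check, then credible intervals. Below is a minimal usage sketch, assuming a PostHog development environment; the variant numbers are illustrative only, and the stats fields follow ExperimentVariantTrendsBaseStats as constructed in the tests.

    # Hypothetical values for illustration; not part of the patch.
    from posthog.schema import ExperimentVariantTrendsBaseStats
    from posthog.hogql_queries.experiments.trends_statistics_v2_count import (
        calculate_probabilities_v2_count,
        are_results_significant_v2_count,
        calculate_credible_intervals_v2_count,
    )

    # count holds the metric count, absolute_exposure the number of exposed users.
    control = ExperimentVariantTrendsBaseStats(key="control", count=100, exposure=1.0, absolute_exposure=1000)
    test = ExperimentVariantTrendsBaseStats(key="test", count=130, exposure=1.0, absolute_exposure=1000)

    # Same call order as experiment_trends_query_runner.py when stats_version == 2.
    probabilities = calculate_probabilities_v2_count(control, [test])
    significance_code, p_value = are_results_significant_v2_count(control, [test], probabilities)
    credible_intervals = calculate_credible_intervals_v2_count([control, test])

The _count suffix frees the trends_statistics_v2 namespace for the continuous implementations introduced in the next patch.
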
From 7a5ffc54d5c075d263a97c29b19c10b6f506caca Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Wed, 11 Dec 2024 05:45:02 -0800 Subject: [PATCH 02/34] First pass at continuous stats methods --- .../test/test_trends_statistics_continuous.py | 533 ++++++++++++++++++ .../trends_statistics_v2_continuous.py | 191 +++++++ 2 files changed, 724 insertions(+) create mode 100644 posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py create mode 100644 posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py new file mode 100644 index 0000000000000..8deaa87983e6e --- /dev/null +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -0,0 +1,533 @@ +from posthog.hogql_queries.experiments import MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.schema import ExperimentVariantTrendsBaseStats, ExperimentSignificanceCode +from posthog.hogql_queries.experiments.trends_statistics_v2_continuous import ( + calculate_probabilities_v2_continuous, + are_results_significant_v2_continuous, + calculate_credible_intervals_v2_continuous, +) +from posthog.hogql_queries.experiments.trends_statistics import ( + calculate_probabilities, + are_results_significant, + calculate_credible_intervals, +) +from posthog.test.base import APIBaseTest + + +def create_variant(key: str, mean: float, exposure: int) -> ExperimentVariantTrendsBaseStats: + # Note: We use the count field to store the mean value for continuous metrics + return ExperimentVariantTrendsBaseStats(key=key, count=mean, exposure=exposure, absolute_exposure=exposure) + + +def create_variant_with_different_exposures( + key: str, + mean: float, + exposure: float, # relative exposure + absolute_exposure: int, # absolute exposure +) -> ExperimentVariantTrendsBaseStats: + return ExperimentVariantTrendsBaseStats(key=key, count=mean, exposure=exposure, absolute_exposure=absolute_exposure) + + +class TestExperimentTrendsStatisticsContinuous(APIBaseTest): + def run_test_for_both_implementations(self, test_fn): + """Run the same test for both implementations""" + # Run for original implementation + test_fn( + stats_version=1, + calculate_probabilities=calculate_probabilities, + are_results_significant=are_results_significant, + calculate_credible_intervals=calculate_credible_intervals, + ) + # Run for v2 implementation + test_fn( + stats_version=2, + calculate_probabilities=calculate_probabilities_v2_continuous, + are_results_significant=are_results_significant_v2_continuous, + calculate_credible_intervals=calculate_credible_intervals_v2_continuous, + ) + + def test_small_sample_two_variants_not_significant(self): + """Test with small sample size, two variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=100) + test = create_variant("test", mean=105.0, exposure=100) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + 0.4 < probabilities[0] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Close to 50/50 + 
self.assertTrue( + 0.4 < probabilities[1] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Close to 50/50 + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # Control: ~$100 mean with wide interval due to small sample + self.assertTrue( + 80 < intervals["control"][0] < 90, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 110 < intervals["control"][1] < 120, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + + # Test: ~$105 mean with wide interval due to small sample + self.assertTrue( + 85 < intervals["test"][0] < 95, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 115 < intervals["test"][1] < 125, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + else: + # Original implementation behavior for small sample + self.assertTrue( + 0.3 < probabilities[0] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertTrue( + 0.3 < probabilities[1] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + self.assertTrue( + intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound is less than mean + self.assertTrue( + intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound is greater than mean + self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + + self.run_test_for_both_implementations(run_test) + + def test_large_sample_two_variants_significant(self): + """Test with large sample size, two variants, clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=10000) + test = create_variant("test", mean=120.0, exposure=10000) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + probabilities[1] > 0.95, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test variant strongly winning + self.assertTrue( + probabilities[0] < 0.05, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control variant strongly losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertEqual(p_value, 0, f"stats_version={stats_version}") + + # Control: $100 mean with narrow interval due to large sample + self.assertTrue( + 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + + # Test: $120 
mean with narrow interval due to large sample + self.assertTrue( + 118 < intervals["test"][0] < 122, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 118 < intervals["test"][1] < 122, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + else: + # Original implementation behavior for large sample + self.assertTrue( + probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test variant winning + self.assertTrue( + probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + self.assertTrue(intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + + self.run_test_for_both_implementations(run_test) + + def test_large_sample_two_variants_strongly_significant(self): + """Test with large sample size, two variants, very clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=10000) + test = create_variant("test", mean=150.0, exposure=10000) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + probabilities[1] > 0.99, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test variant very strongly winning + self.assertTrue( + probabilities[0] < 0.01, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control variant very strongly losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertEqual(p_value, 0, f"stats_version={stats_version}") + + # Control: $100 mean + self.assertTrue( + 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + + # Test: $150 mean, clearly higher than control + self.assertTrue( + 147 < intervals["test"][0] < 153, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bound + self.assertTrue( + 147 < intervals["test"][1] < 153, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bound + else: + # Original implementation behavior for strongly significant case + self.assertTrue( + probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test variant winning + self.assertTrue( + probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, 
f"stats_version={stats_version}") + self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + # For strongly significant differences, the intervals should not overlap when scaled + self.assertTrue( + intervals["control"][1] * 100 < intervals["test"][0] * 150, + f"stats_version={stats_version}, intervals={intervals}", + ) + + self.run_test_for_both_implementations(run_test) + + def test_many_variants_not_significant(self): + """Test with multiple variants, no clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=1000) + test_a = create_variant("test_a", mean=98.0, exposure=1000) + test_b = create_variant("test_b", mean=102.0, exposure=1000) + test_c = create_variant("test_c", mean=101.0, exposure=1000) + + probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) + significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) + intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) + + self.assertEqual(len(probabilities), 4, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities), + f"stats_version={stats_version}, probabilities={probabilities}", + ) + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # All variants around $100 with overlapping intervals + for variant_key in ["control", "test_a", "test_b", "test_c"]: + self.assertTrue( + 90 < intervals[variant_key][0] < 95, f"stats_version={stats_version}, intervals={intervals}" + ) # Lower bounds + self.assertTrue( + 105 < intervals[variant_key][1] < 110, f"stats_version={stats_version}, intervals={intervals}" + ) # Upper bounds + else: + # Original implementation behavior for multiple variants with no clear winner + self.assertTrue( + all(0.1 < p < 0.9 for p in probabilities), + f"stats_version={stats_version}, probabilities={probabilities}", + ) + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + for variant_key in ["control", "test_a", "test_b", "test_c"]: + self.assertTrue( + intervals[variant_key][0] < 1, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + intervals[variant_key][1] > 1, f"stats_version={stats_version}, intervals={intervals}" + ) + + self.run_test_for_both_implementations(run_test) + + def test_many_variants_significant(self): + """Test with multiple variants, one clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=10000) + test_a = create_variant("test_a", mean=105.0, exposure=10000) + test_b = create_variant("test_b", mean=150.0, exposure=10000) + test_c = create_variant("test_c", mean=110.0, exposure=10000) + + probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) + significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) + intervals = 
calculate_credible_intervals([control, test_a, test_b, test_c]) + + self.assertEqual(len(probabilities), 4, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + probabilities[2] > 0.9, f"stats_version={stats_version}, probabilities={probabilities}" + ) # test_b should be winning + self.assertTrue( + probabilities[1] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" + ) # test_a should be losing + self.assertTrue( + probabilities[0] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" + ) # control should be losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertEqual(p_value, 0, f"stats_version={stats_version}") + + # Control at $100 + self.assertTrue( + 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) + + # Test A slightly higher at $105 + self.assertTrue( + 103 < intervals["test_a"][0] < 107, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 103 < intervals["test_a"][1] < 107, f"stats_version={stats_version}, intervals={intervals}" + ) + + # Test B clearly winning at $150 + self.assertTrue( + 147 < intervals["test_b"][0] < 153, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 147 < intervals["test_b"][1] < 153, f"stats_version={stats_version}, intervals={intervals}" + ) + + # Test C slightly higher at $110 + self.assertTrue( + 108 < intervals["test_c"][0] < 112, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 108 < intervals["test_c"][1] < 112, f"stats_version={stats_version}, intervals={intervals}" + ) + else: + # Original implementation behavior for multiple variants with clear winner + self.assertTrue( + probabilities[2] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # test_b should be winning + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + # Test B (150.0) should have non-overlapping intervals with others when scaled + self.assertTrue( + intervals["control"][1] * 100 < intervals["test_b"][0] * 150, + f"stats_version={stats_version}, intervals={intervals}", + ) + self.assertTrue( + intervals["test_a"][1] * 105 < intervals["test_b"][0] * 150, + f"stats_version={stats_version}, intervals={intervals}", + ) + self.assertTrue( + intervals["test_c"][1] * 110 < intervals["test_b"][0] * 150, + f"stats_version={stats_version}, intervals={intervals}", + ) + + self.run_test_for_both_implementations(run_test) + + def test_insufficient_sample_size(self): + """Test with sample size below threshold""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=100.0, exposure=50) + test = create_variant("test", mean=120.0, exposure=50) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + probabilities[0] < 0.5, 
f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control has lower probability + self.assertTrue( + probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test has higher probability + self.assertEqual( + significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1.0, f"stats_version={stats_version}") + + # Both variants should have wide intervals due to small sample size + self.assertTrue( + 70 < intervals["control"][0] < 80, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 120 < intervals["control"][1] < 130, f"stats_version={stats_version}, intervals={intervals}" + ) + + self.assertTrue( + 90 < intervals["test"][0] < 100, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 140 < intervals["test"][1] < 150, f"stats_version={stats_version}, intervals={intervals}" + ) + else: + # Original implementation behavior for insufficient sample size + self.assertTrue( + 0.3 < probabilities[0] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertTrue( + 0.3 < probabilities[1] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertEqual( + significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1.0, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + self.assertTrue(intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + + self.run_test_for_both_implementations(run_test) + + def test_edge_cases(self): + """Test edge cases like zero means""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant("control", mean=0.0, exposure=1000) + test = create_variant("test", mean=0.0, exposure=1000) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue( + abs(probabilities[0] - 0.5) < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Should be close to 50/50 + self.assertTrue( + abs(probabilities[1] - 0.5) < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Should be close to 50/50 + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # Both variants should have very small intervals near zero + self.assertTrue( + 0 <= intervals["control"][0] < 0.1, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 0 <= intervals["control"][1] < 0.1, f"stats_version={stats_version}, intervals={intervals}" + ) + + self.assertTrue( + 0 <= intervals["test"][0] < 0.1, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 0 <= 
intervals["test"][1] < 0.1, f"stats_version={stats_version}, intervals={intervals}" + ) + else: + # Original implementation behavior for zero means + self.assertTrue( + 0.4 < probabilities[0] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertTrue( + 0.4 < probabilities[1] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" + ) + self.assertEqual( + significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" + ) + self.assertEqual(p_value, 1, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + # For zero means, the intervals should still be valid ratios + self.assertTrue(intervals["control"][0] >= 0, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][1] >= 0, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][0] >= 0, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][1] >= 0, f"stats_version={stats_version}, intervals={intervals}") + + self.run_test_for_both_implementations(run_test) + + def test_different_relative_and_absolute_exposure(self): + """Test that credible intervals are calculated using absolute_exposure rather than relative exposure""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control = create_variant_with_different_exposures( + "control", mean=100.0, exposure=1, absolute_exposure=10000 + ) + test = create_variant_with_different_exposures("test", mean=120.0, exposure=1.2, absolute_exposure=12000) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + if stats_version == 2: + self.assertTrue(probabilities[0] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}") + self.assertTrue(0.9 < probabilities[1], f"stats_version={stats_version}, probabilities={probabilities}") + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertEqual(p_value, 0, f"stats_version={stats_version}") + + # Control at $100 mean + self.assertTrue( + 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" + ) + + # Test at $120 mean + self.assertTrue( + 118 < intervals["test"][0] < 122, f"stats_version={stats_version}, intervals={intervals}" + ) + self.assertTrue( + 118 < intervals["test"][1] < 122, f"stats_version={stats_version}, intervals={intervals}" + ) + else: + # Original implementation behavior for different exposures + self.assertTrue( + probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Test variant winning + self.assertTrue( + probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" + ) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") + self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + + # Original implementation returns intervals as ratios/multipliers of the mean + self.assertTrue(intervals["control"][0] < 1, 
f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + + self.run_test_for_both_implementations(run_test) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py new file mode 100644 index 0000000000000..1360e79a12d3f --- /dev/null +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -0,0 +1,191 @@ +from rest_framework.exceptions import ValidationError +from sentry_sdk import capture_exception +from posthog.hogql_queries.experiments import FF_DISTRIBUTION_THRESHOLD, MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.schema import ExperimentSignificanceCode, ExperimentVariantTrendsBaseStats +from scipy.stats import t +import numpy as np + +# Prior parameters (minimal prior knowledge) +MU_0 = 0.0 # Prior mean +KAPPA_0 = 1.0 # Prior strength for mean +ALPHA_0 = 1.0 # Prior shape for variance +BETA_0 = 1.0 # Prior scale for variance + +SAMPLE_SIZE = 10000 + + +def calculate_probabilities_v2_continuous( + control_variant: ExperimentVariantTrendsBaseStats, test_variants: list[ExperimentVariantTrendsBaseStats] +) -> list[float]: + """ + Calculate the win probabilities for each variant in an experiment using Bayesian analysis + for continuous metrics (e.g., revenue). + + This function computes the probability that each variant is the best (i.e., has the highest + mean value) compared to all other variants, including the control. It uses samples + drawn from the posterior distributions of each variant's mean. 
+ + Parameters: + ----------- + control_variant : ExperimentVariantTrendsBaseStats + Statistics for the control group, including mean value and exposure (number of users) + test_variants : list[ExperimentVariantTrendsBaseStats] + List of statistics for test variants to compare against the control + + Returns: + -------- + list[float] + A list of probabilities where: + - The first element is the probability that the control variant is the best + - Subsequent elements are the probabilities that each test variant is the best + + Notes: + ------ + - Uses a Bayesian approach with a t-distribution as the posterior + - Assumes a Normal-Inverse-Gamma prior + - Log-transforms the data to handle typical revenue distributions + """ + if len(test_variants) >= 10: + raise ValidationError("Can't calculate experiment results for more than 10 variants", code="too_much_data") + if len(test_variants) < 1: + raise ValidationError("Can't calculate experiment results for less than 2 variants", code="no_data") + + # Calculate posterior parameters for control + log_control_mean = np.log(control_variant.count) # Using count field to store mean value + log_variance = 0.25 # Assumed variance in log-space + + # Update parameters for control + kappa_n_control = KAPPA_0 + control_variant.exposure + mu_n_control = (KAPPA_0 * MU_0 + control_variant.exposure * log_control_mean) / kappa_n_control + alpha_n_control = ALPHA_0 + control_variant.exposure / 2 + beta_n_control = BETA_0 + 0.5 * control_variant.exposure * log_variance + + # Draw samples from control posterior + control_posterior = t( + df=2 * alpha_n_control, loc=mu_n_control, scale=np.sqrt(beta_n_control / (kappa_n_control * alpha_n_control)) + ) + samples_control = control_posterior.rvs(SAMPLE_SIZE) + + # Draw samples for each test variant + test_samples = [] + for test in test_variants: + log_test_mean = np.log(test.count) # Using count field to store mean value + + kappa_n_test = KAPPA_0 + test.exposure + mu_n_test = (KAPPA_0 * MU_0 + test.exposure * log_test_mean) / kappa_n_test + alpha_n_test = ALPHA_0 + test.exposure / 2 + beta_n_test = BETA_0 + 0.5 * test.exposure * log_variance + + test_posterior = t( + df=2 * alpha_n_test, loc=mu_n_test, scale=np.sqrt(beta_n_test / (kappa_n_test * alpha_n_test)) + ) + test_samples.append(test_posterior.rvs(SAMPLE_SIZE)) + + # Calculate probabilities + probabilities = [] + + # Probability control wins (beats all test variants) + control_wins = np.all([samples_control > test_sample for test_sample in test_samples], axis=0) + probabilities.append(float(np.mean(control_wins))) + + # Probability each test variant wins (beats control and all other test variants) + for i, test_sample in enumerate(test_samples): + other_test_samples = test_samples[:i] + test_samples[i + 1 :] + variant_wins = np.all( + [test_sample > samples_control] + [test_sample > other for other in other_test_samples], axis=0 + ) + probabilities.append(float(np.mean(variant_wins))) + + return probabilities + + +def are_results_significant_v2_continuous( + control_variant: ExperimentVariantTrendsBaseStats, + test_variants: list[ExperimentVariantTrendsBaseStats], + probabilities: list[float], +) -> tuple[ExperimentSignificanceCode, float]: + """ + Determines if experiment results are statistically significant using Bayesian analysis + for continuous metrics. 
+ + Parameters: + ----------- + control_variant : ExperimentVariantTrendsBaseStats + Statistics for the control group + test_variants : list[ExperimentVariantTrendsBaseStats] + List of statistics for test variants to compare against control + probabilities : list[float] + List of win probabilities for each variant + + Returns: + -------- + tuple[ExperimentSignificanceCode, float] + - ExperimentSignificanceCode indicating the significance status + - Probability value + """ + # Check exposure thresholds + for variant in test_variants: + if variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD: + return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1.0 + + if control_variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD: + return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1.0 + + # Find highest probability among all variants + max_probability = max(probabilities) + + # Check if any variant has a high enough probability of being best + if max_probability < MIN_PROBABILITY_FOR_SIGNIFICANCE: + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + + return ExperimentSignificanceCode.SIGNIFICANT, 0.0 + + +def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, upper_bound=0.975): + """ + Calculate Bayesian credible intervals for each variant's mean value. + + Parameters: + ----------- + variants : list[ExperimentVariantTrendsBaseStats] + List of variants containing mean values and exposure data + lower_bound : float, optional (default=0.025) + Lower percentile for the credible interval (2.5% for 95% CI) + upper_bound : float, optional (default=0.975) + Upper percentile for the credible interval (97.5% for 95% CI) + + Returns: + -------- + dict[str, tuple[float, float]] + Dictionary mapping variant keys to their credible intervals + """ + intervals = {} + + for variant in variants: + try: + # Log-transform the mean value + log_mean = np.log(variant.count) # Using count field to store mean value + log_variance = 0.25 + + # Calculate posterior parameters using absolute_exposure + kappa_n = KAPPA_0 + variant.absolute_exposure + mu_n = (KAPPA_0 * MU_0 + variant.absolute_exposure * log_mean) / kappa_n + alpha_n = ALPHA_0 + variant.absolute_exposure / 2 + beta_n = BETA_0 + 0.5 * variant.absolute_exposure * log_variance + + # Create posterior distribution + posterior = t(df=2 * alpha_n, loc=mu_n, scale=np.sqrt(beta_n / (kappa_n * alpha_n))) + + # Calculate credible intervals + credible_interval = posterior.interval(upper_bound - lower_bound) + + # Transform back from log space + intervals[variant.key] = (float(np.exp(credible_interval[0])), float(np.exp(credible_interval[1]))) + except Exception as e: + capture_exception( + Exception(f"Error calculating credible interval for variant {variant.key}"), + {"error": str(e)}, + ) + return {} + + return intervals From 2394003d99acf97ef66f3b951301f4f38b3c51cc Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Wed, 11 Dec 2024 06:10:22 -0800 Subject: [PATCH 03/34] Drop assertion context because it's too noisy --- .../test/test_trends_statistics_continuous.py | 426 ++++++------------ 1 file changed, 132 insertions(+), 294 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 8deaa87983e6e..a8cbba9d14e2d 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -56,56 +56,32 @@ def 
run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue( - 0.4 < probabilities[0] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Close to 50/50 - self.assertTrue( - 0.4 < probabilities[1] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Close to 50/50 - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(0.4 < probabilities[0] < 0.6) # Close to 50/50 + self.assertTrue(0.4 < probabilities[1] < 0.6) # Close to 50/50 + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # Control: ~$100 mean with wide interval due to small sample - self.assertTrue( - 80 < intervals["control"][0] < 90, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 110 < intervals["control"][1] < 120, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(80 < intervals["control"][0] < 90) # Lower bound + self.assertTrue(110 < intervals["control"][1] < 120) # Upper bound # Test: ~$105 mean with wide interval due to small sample - self.assertTrue( - 85 < intervals["test"][0] < 95, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 115 < intervals["test"][1] < 125, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(85 < intervals["test"][0] < 95) # Lower bound + self.assertTrue(115 < intervals["test"][1] < 125) # Upper bound else: # Original implementation behavior for small sample - self.assertTrue( - 0.3 < probabilities[0] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertTrue( - 0.3 < probabilities[1] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(0.3 < probabilities[0] < 0.7) + self.assertTrue(0.3 < probabilities[1] < 0.7) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue( - intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound is less than mean - self.assertTrue( - intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound is greater than mean - self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][0] < 1) # Lower bound is less than mean + self.assertTrue(intervals["control"][1] > 1) # Upper bound is greater than mean + self.assertTrue(intervals["test"][0] < 1) + self.assertTrue(intervals["test"][1] > 1) self.run_test_for_both_implementations(run_test) @@ -120,48 +96,32 @@ def 
run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue( - probabilities[1] > 0.95, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test variant strongly winning - self.assertTrue( - probabilities[0] < 0.05, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control variant strongly losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertEqual(p_value, 0, f"stats_version={stats_version}") + self.assertTrue(probabilities[1] > 0.95) # Test variant strongly winning + self.assertTrue(probabilities[0] < 0.05) # Control variant strongly losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) # Control: $100 mean with narrow interval due to large sample - self.assertTrue( - 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(98 < intervals["control"][0] < 102) # Lower bound + self.assertTrue(98 < intervals["control"][1] < 102) # Upper bound # Test: $120 mean with narrow interval due to large sample - self.assertTrue( - 118 < intervals["test"][0] < 122, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 118 < intervals["test"][1] < 122, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(118 < intervals["test"][0] < 122) # Lower bound + self.assertTrue(118 < intervals["test"][1] < 122) # Upper bound else: # Original implementation behavior for large sample - self.assertTrue( - probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test variant winning - self.assertTrue( - probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control variant losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][0] < 1) + self.assertTrue(intervals["control"][1] > 1) + self.assertTrue(intervals["test"][0] < 1) + self.assertTrue(intervals["test"][1] > 1) self.run_test_for_both_implementations(run_test) @@ -176,49 +136,30 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca 
significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue( - probabilities[1] > 0.99, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test variant very strongly winning - self.assertTrue( - probabilities[0] < 0.01, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control variant very strongly losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertEqual(p_value, 0, f"stats_version={stats_version}") + self.assertTrue(probabilities[1] > 0.99) # Test variant very strongly winning + self.assertTrue(probabilities[0] < 0.01) # Control variant very strongly losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) # Control: $100 mean - self.assertTrue( - 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(98 < intervals["control"][0] < 102) # Lower bound + self.assertTrue(98 < intervals["control"][1] < 102) # Upper bound # Test: $150 mean, clearly higher than control - self.assertTrue( - 147 < intervals["test"][0] < 153, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bound - self.assertTrue( - 147 < intervals["test"][1] < 153, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bound + self.assertTrue(147 < intervals["test"][0] < 153) # Lower bound + self.assertTrue(147 < intervals["test"][1] < 153) # Upper bound else: # Original implementation behavior for strongly significant case - self.assertTrue( - probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test variant winning - self.assertTrue( - probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control variant losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) # Original implementation returns intervals as ratios/multipliers of the mean # For strongly significant differences, the intervals should not overlap when scaled - self.assertTrue( - intervals["control"][1] * 100 < intervals["test"][0] * 150, - f"stats_version={stats_version}, intervals={intervals}", - ) + self.assertTrue(intervals["control"][1] * 100 < intervals["test"][0] * 150) self.run_test_for_both_implementations(run_test) @@ -235,44 +176,26 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) - self.assertEqual(len(probabilities), 4, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 4) if stats_version == 2: - self.assertTrue( - all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p 
in probabilities), - f"stats_version={stats_version}, probabilities={probabilities}", - ) - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(all(p < MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # All variants around $100 with overlapping intervals for variant_key in ["control", "test_a", "test_b", "test_c"]: - self.assertTrue( - 90 < intervals[variant_key][0] < 95, f"stats_version={stats_version}, intervals={intervals}" - ) # Lower bounds - self.assertTrue( - 105 < intervals[variant_key][1] < 110, f"stats_version={stats_version}, intervals={intervals}" - ) # Upper bounds + self.assertTrue(90 < intervals[variant_key][0] < 95) # Lower bounds + self.assertTrue(105 < intervals[variant_key][1] < 110) # Upper bounds else: # Original implementation behavior for multiple variants with no clear winner - self.assertTrue( - all(0.1 < p < 0.9 for p in probabilities), - f"stats_version={stats_version}, probabilities={probabilities}", - ) - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean for variant_key in ["control", "test_a", "test_b", "test_c"]: - self.assertTrue( - intervals[variant_key][0] < 1, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - intervals[variant_key][1] > 1, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(intervals[variant_key][0] < 1) + self.assertTrue(intervals[variant_key][1] > 1) self.run_test_for_both_implementations(run_test) @@ -289,73 +212,40 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) intervals = calculate_credible_intervals([control, test_a, test_b, test_c]) - self.assertEqual(len(probabilities), 4, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 4) if stats_version == 2: - self.assertTrue( - probabilities[2] > 0.9, f"stats_version={stats_version}, probabilities={probabilities}" - ) # test_b should be winning - self.assertTrue( - probabilities[1] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" - ) # test_a should be losing - self.assertTrue( - probabilities[0] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" - ) # control should be losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertEqual(p_value, 0, f"stats_version={stats_version}") + self.assertTrue(probabilities[2] > 0.9) # test_b should be winning + self.assertTrue(probabilities[1] < 0.1) # test_a should be losing + self.assertTrue(probabilities[0] < 0.1) # control should be losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) # Control at $100 - self.assertTrue( - 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) - 
self.assertTrue( - 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(98 < intervals["control"][0] < 102) + self.assertTrue(98 < intervals["control"][1] < 102) # Test A slightly higher at $105 - self.assertTrue( - 103 < intervals["test_a"][0] < 107, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 103 < intervals["test_a"][1] < 107, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(103 < intervals["test_a"][0] < 107) + self.assertTrue(103 < intervals["test_a"][1] < 107) # Test B clearly winning at $150 - self.assertTrue( - 147 < intervals["test_b"][0] < 153, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 147 < intervals["test_b"][1] < 153, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(147 < intervals["test_b"][0] < 153) + self.assertTrue(147 < intervals["test_b"][1] < 153) # Test C slightly higher at $110 - self.assertTrue( - 108 < intervals["test_c"][0] < 112, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 108 < intervals["test_c"][1] < 112, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(108 < intervals["test_c"][0] < 112) + self.assertTrue(108 < intervals["test_c"][1] < 112) else: # Original implementation behavior for multiple variants with clear winner - self.assertTrue( - probabilities[2] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # test_b should be winning - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + self.assertTrue(probabilities[2] > 0.5) # test_b should be winning + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) # Original implementation returns intervals as ratios/multipliers of the mean # Test B (150.0) should have non-overlapping intervals with others when scaled - self.assertTrue( - intervals["control"][1] * 100 < intervals["test_b"][0] * 150, - f"stats_version={stats_version}, intervals={intervals}", - ) - self.assertTrue( - intervals["test_a"][1] * 105 < intervals["test_b"][0] * 150, - f"stats_version={stats_version}, intervals={intervals}", - ) - self.assertTrue( - intervals["test_c"][1] * 110 < intervals["test_b"][0] * 150, - f"stats_version={stats_version}, intervals={intervals}", - ) + self.assertTrue(intervals["control"][1] * 100 < intervals["test_b"][0] * 150) + self.assertTrue(intervals["test_a"][1] * 105 < intervals["test_b"][0] * 150) + self.assertTrue(intervals["test_c"][1] * 110 < intervals["test_b"][0] * 150) self.run_test_for_both_implementations(run_test) @@ -370,51 +260,31 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue( - probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control has lower probability - self.assertTrue( - probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test has higher probability - self.assertEqual( - significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 
f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1.0, f"stats_version={stats_version}") + self.assertTrue(probabilities[0] < 0.5) # Control has lower probability + self.assertTrue(probabilities[1] > 0.5) # Test has higher probability + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) # Both variants should have wide intervals due to small sample size - self.assertTrue( - 70 < intervals["control"][0] < 80, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 120 < intervals["control"][1] < 130, f"stats_version={stats_version}, intervals={intervals}" - ) - - self.assertTrue( - 90 < intervals["test"][0] < 100, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 140 < intervals["test"][1] < 150, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(70 < intervals["control"][0] < 80) + self.assertTrue(120 < intervals["control"][1] < 130) + + self.assertTrue(90 < intervals["test"][0] < 100) + self.assertTrue(140 < intervals["test"][1] < 150) else: # Original implementation behavior for insufficient sample size - self.assertTrue( - 0.3 < probabilities[0] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertTrue( - 0.3 < probabilities[1] < 0.7, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertEqual( - significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1.0, f"stats_version={stats_version}") + self.assertTrue(0.3 < probabilities[0] < 0.7) + self.assertTrue(0.3 < probabilities[1] < 0.7) + self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(p_value, 1.0) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][0] < 1) + self.assertTrue(intervals["control"][1] > 1) + self.assertTrue(intervals["test"][0] < 1) + self.assertTrue(intervals["test"][1] > 1) self.run_test_for_both_implementations(run_test) @@ -429,52 +299,32 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue( - abs(probabilities[0] - 0.5) < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Should be close to 50/50 - self.assertTrue( - abs(probabilities[1] - 0.5) < 0.1, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Should be close to 50/50 - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(abs(probabilities[0] - 0.5) < 0.1) # Should be close to 50/50 + self.assertTrue(abs(probabilities[1] - 0.5) < 0.1) # Should be 
close to 50/50 + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # Both variants should have very small intervals near zero - self.assertTrue( - 0 <= intervals["control"][0] < 0.1, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 0 <= intervals["control"][1] < 0.1, f"stats_version={stats_version}, intervals={intervals}" - ) - - self.assertTrue( - 0 <= intervals["test"][0] < 0.1, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 0 <= intervals["test"][1] < 0.1, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(0 <= intervals["control"][0] < 0.1) + self.assertTrue(0 <= intervals["control"][1] < 0.1) + + self.assertTrue(0 <= intervals["test"][0] < 0.1) + self.assertTrue(0 <= intervals["test"][1] < 0.1) else: # Original implementation behavior for zero means - self.assertTrue( - 0.4 < probabilities[0] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertTrue( - 0.4 < probabilities[1] < 0.6, f"stats_version={stats_version}, probabilities={probabilities}" - ) - self.assertEqual( - significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY, f"stats_version={stats_version}" - ) - self.assertEqual(p_value, 1, f"stats_version={stats_version}") + self.assertTrue(0.4 < probabilities[0] < 0.6) + self.assertTrue(0.4 < probabilities[1] < 0.6) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean # For zero means, the intervals should still be valid ratios - self.assertTrue(intervals["control"][0] >= 0, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["control"][1] >= 0, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][0] >= 0, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][1] >= 0, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][0] >= 0) + self.assertTrue(intervals["control"][1] >= 0) + self.assertTrue(intervals["test"][0] >= 0) + self.assertTrue(intervals["test"][1] >= 0) self.run_test_for_both_implementations(run_test) @@ -491,43 +341,31 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca significance, p_value = are_results_significant(control, [test], probabilities) intervals = calculate_credible_intervals([control, test]) - self.assertEqual(len(probabilities), 2, f"stats_version={stats_version}") + self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(probabilities[0] < 0.1, f"stats_version={stats_version}, probabilities={probabilities}") - self.assertTrue(0.9 < probabilities[1], f"stats_version={stats_version}, probabilities={probabilities}") - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertEqual(p_value, 0, f"stats_version={stats_version}") + self.assertTrue(probabilities[0] < 0.1) + self.assertTrue(0.9 < probabilities[1]) + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) # Control at $100 mean - self.assertTrue( - 98 < intervals["control"][0] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 98 < intervals["control"][1] < 102, f"stats_version={stats_version}, intervals={intervals}" - ) + 
self.assertTrue(98 < intervals["control"][0] < 102) + self.assertTrue(98 < intervals["control"][1] < 102) # Test at $120 mean - self.assertTrue( - 118 < intervals["test"][0] < 122, f"stats_version={stats_version}, intervals={intervals}" - ) - self.assertTrue( - 118 < intervals["test"][1] < 122, f"stats_version={stats_version}, intervals={intervals}" - ) + self.assertTrue(118 < intervals["test"][0] < 122) + self.assertTrue(118 < intervals["test"][1] < 122) else: # Original implementation behavior for different exposures - self.assertTrue( - probabilities[1] > 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Test variant winning - self.assertTrue( - probabilities[0] < 0.5, f"stats_version={stats_version}, probabilities={probabilities}" - ) # Control variant losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT, f"stats_version={stats_version}") - self.assertLess(p_value, 0.05, f"stats_version={stats_version}") + self.assertTrue(probabilities[1] > 0.5) # Test variant winning + self.assertTrue(probabilities[0] < 0.5) # Control variant losing + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(p_value, 0.05) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["control"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][0] < 1, f"stats_version={stats_version}, intervals={intervals}") - self.assertTrue(intervals["test"][1] > 1, f"stats_version={stats_version}, intervals={intervals}") + self.assertTrue(intervals["control"][0] < 1) + self.assertTrue(intervals["control"][1] > 1) + self.assertTrue(intervals["test"][0] < 1) + self.assertTrue(intervals["test"][1] > 1) self.run_test_for_both_implementations(run_test) From f5ec62220151616b8f55376bd44c17788e88ed31 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Wed, 11 Dec 2024 06:21:33 -0800 Subject: [PATCH 04/34] Add a test case for near zero means --- .../test/test_trends_statistics_continuous.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index a8cbba9d14e2d..c129de180af7e 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -288,7 +288,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) - def test_edge_cases(self): + def test_edge_cases_zero_means(self): """Test edge cases like zero means""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): @@ -328,6 +328,49 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + def test_edge_cases_near_zero_means(self): + """Test edge cases like near-zero means""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + # Using very small positive values instead of exact zeros + control = create_variant("control", mean=0.0001, exposure=1000) + test = create_variant("test", mean=0.0001, exposure=1000) + + probabilities = 
calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + + self.assertEqual(len(probabilities), 2) + if stats_version == 2: + self.assertTrue(abs(probabilities[0] - 0.5) < 0.1) # Should be close to 50/50 + self.assertTrue(abs(probabilities[1] - 0.5) < 0.1) # Should be close to 50/50 + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Both variants should have intervals appropriate for their small means + # For a mean of 0.0001, expect intervals to be within an order of magnitude + self.assertTrue(0.00005 <= intervals["control"][0] <= 0.0002) # Lower bound + self.assertTrue(0.00005 <= intervals["control"][1] <= 0.0002) # Upper bound + + self.assertTrue(0.00005 <= intervals["test"][0] <= 0.0002) # Lower bound + self.assertTrue(0.00005 <= intervals["test"][1] <= 0.0002) # Upper bound + else: + # Original implementation behavior for near-zero means + self.assertTrue(0.4 < probabilities[0] < 0.6) + self.assertTrue(0.4 < probabilities[1] < 0.6) + self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) + self.assertEqual(p_value, 1) + + # Original implementation returns intervals as ratios/multipliers of the mean + # For near-zero means, the intervals become very small ratios + # This is expected behavior when dealing with values close to zero + self.assertTrue(0 <= intervals["control"][0] <= 0.0001) # Lower bound ratio + self.assertTrue(0 <= intervals["control"][1] <= 0.005) # Upper bound ratio + self.assertTrue(0 <= intervals["test"][0] <= 0.0001) # Lower bound ratio + self.assertTrue(0 <= intervals["test"][1] <= 0.005) # Upper bound ratio + + self.run_test_for_both_implementations(run_test) + def test_different_relative_and_absolute_exposure(self): """Test that credible intervals are calculated using absolute_exposure rather than relative exposure""" From 2501c1c34ec1321b0e83262cda2cfc850a3ab08f Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Wed, 11 Dec 2024 06:27:06 -0800 Subject: [PATCH 05/34] Handle zero mean scenario --- .../trends_statistics_v2_continuous.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index 1360e79a12d3f..404b82d2257df 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -12,6 +12,7 @@ BETA_0 = 1.0 # Prior scale for variance SAMPLE_SIZE = 10000 +EPSILON = 1e-10 # Small epsilon value to handle zeros def calculate_probabilities_v2_continuous( @@ -51,7 +52,7 @@ def calculate_probabilities_v2_continuous( raise ValidationError("Can't calculate experiment results for less than 2 variants", code="no_data") # Calculate posterior parameters for control - log_control_mean = np.log(control_variant.count) # Using count field to store mean value + log_control_mean = np.log(control_variant.count + EPSILON) # Using count field to store mean value log_variance = 0.25 # Assumed variance in log-space # Update parameters for control @@ -69,7 +70,7 @@ def calculate_probabilities_v2_continuous( # Draw samples for each test variant test_samples = [] for test in test_variants: - log_test_mean = np.log(test.count) # Using count field to store mean value + log_test_mean = np.log(test.count + 
EPSILON) # Using count field to store mean value kappa_n_test = KAPPA_0 + test.exposure mu_n_test = (KAPPA_0 * MU_0 + test.exposure * log_test_mean) / kappa_n_test @@ -163,8 +164,8 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe for variant in variants: try: - # Log-transform the mean value - log_mean = np.log(variant.count) # Using count field to store mean value + # Log-transform the mean value, adding epsilon to handle zeros + log_mean = np.log(variant.count + EPSILON) # Using count field to store mean value log_variance = 0.25 # Calculate posterior parameters using absolute_exposure @@ -179,8 +180,11 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe # Calculate credible intervals credible_interval = posterior.interval(upper_bound - lower_bound) - # Transform back from log space - intervals[variant.key] = (float(np.exp(credible_interval[0])), float(np.exp(credible_interval[1]))) + # Transform back from log space and subtract epsilon + intervals[variant.key] = ( + float(max(0, np.exp(credible_interval[0]) - EPSILON)), # Ensure non-negative + float(max(0, np.exp(credible_interval[1]) - EPSILON)), # Ensure non-negative + ) except Exception as e: capture_exception( Exception(f"Error calculating credible interval for variant {variant.key}"), From 43735d2230573f147c2f961fd26ef57ad9d21ab3 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:55:29 +0000 Subject: [PATCH 06/34] Update query snapshots --- .../test/__snapshots__/test_trends.ambr | 152 ++++-------------- 1 file changed, 32 insertions(+), 120 deletions(-) diff --git a/posthog/queries/test/__snapshots__/test_trends.ambr b/posthog/queries/test/__snapshots__/test_trends.ambr index 01ab1c2e0e23e..f43d12a9d3137 100644 --- a/posthog/queries/test/__snapshots__/test_trends.ambr +++ b/posthog/queries/test/__snapshots__/test_trends.ambr @@ -871,18 +871,14 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.1 ''' - - SELECT replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', '') AS value, - count(*) as count - FROM events e SAMPLE 1.0 - WHERE team_id = 99999 - AND ((event = 'sign up')) - AND toTimeZone(timestamp, 'UTC') >= toDateTime('2019-12-28 00:00:00', 'UTC') - AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') - GROUP BY value - ORDER BY count DESC, value DESC - LIMIT 26 - OFFSET 0 + /* celery:posthog.tasks.tasks.sync_insight_caching_state */ + SELECT team_id, + date_diff('second', max(timestamp), now()) AS age + FROM events + WHERE timestamp > date_sub(DAY, 3, now()) + AND timestamp < now() + GROUP BY team_id + ORDER BY age; ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.10 @@ -1047,122 +1043,38 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.2 ''' - - SELECT groupArray(day_start) as date, - groupArray(count) AS total, - breakdown_value - FROM - (SELECT SUM(total) as count, - day_start, - breakdown_value - FROM - (SELECT * - FROM - (SELECT toUInt16(0) AS total, - ticks.day_start as day_start, - breakdown_value - FROM - (SELECT toStartOfDay(toDateTime('2020-01-04 23:59:59', 'UTC')) - toIntervalDay(number) as day_start - FROM numbers(8) - UNION ALL SELECT toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')) as day_start) as ticks - CROSS JOIN - (SELECT breakdown_value - FROM - (SELECT ['other_value', '$$_posthog_breakdown_null_$$', 'value'] as breakdown_value) ARRAY - 
JOIN breakdown_value) as sec - ORDER BY breakdown_value, - day_start - UNION ALL SELECT count(DISTINCT pdi.person_id) as total, - toStartOfDay(toTimeZone(toDateTime(timestamp, 'UTC'), 'UTC')) as day_start, - transform(ifNull(nullIf(replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', ''), ''), '$$_posthog_breakdown_null_$$'), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), '$$_posthog_breakdown_other_$$') as breakdown_value - FROM events e SAMPLE 1.0 - INNER JOIN - (SELECT distinct_id, - argMax(person_id, version) as person_id - FROM person_distinct_id2 - WHERE team_id = 99999 - GROUP BY distinct_id - HAVING argMax(is_deleted, version) = 0) as pdi ON events.distinct_id = pdi.distinct_id - WHERE e.team_id = 99999 - AND toTimeZone(timestamp, 'UTC') >= toDateTime(toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')), 'UTC') - AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') - AND ((event = 'sign up')) - GROUP BY day_start, - breakdown_value)) - GROUP BY day_start, - breakdown_value - ORDER BY breakdown_value, - day_start) - GROUP BY breakdown_value - ORDER BY breakdown_value + /* celery:posthog.tasks.tasks.sync_insight_caching_state */ + SELECT team_id, + date_diff('second', max(timestamp), now()) AS age + FROM events + WHERE timestamp > date_sub(DAY, 3, now()) + AND timestamp < now() + GROUP BY team_id + ORDER BY age; ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.3 ''' - - SELECT replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', '') AS value, - count(*) as count - FROM events e SAMPLE 1.0 - WHERE team_id = 99999 - AND event = 'sign up' - AND toTimeZone(timestamp, 'UTC') >= toDateTime('2019-12-28 00:00:00', 'UTC') - AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') - GROUP BY value - ORDER BY count DESC, value DESC - LIMIT 26 - OFFSET 0 + /* celery:posthog.tasks.tasks.sync_insight_caching_state */ + SELECT team_id, + date_diff('second', max(timestamp), now()) AS age + FROM events + WHERE timestamp > date_sub(DAY, 3, now()) + AND timestamp < now() + GROUP BY team_id + ORDER BY age; ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.4 ''' - - SELECT groupArray(day_start) as date, - groupArray(count) AS total, - breakdown_value - FROM - (SELECT SUM(total) as count, - day_start, - breakdown_value - FROM - (SELECT * - FROM - (SELECT toUInt16(0) AS total, - ticks.day_start as day_start, - breakdown_value - FROM - (SELECT toStartOfDay(toDateTime('2020-01-04 23:59:59', 'UTC')) - toIntervalDay(number) as day_start - FROM numbers(8) - UNION ALL SELECT toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')) as day_start) as ticks - CROSS JOIN - (SELECT breakdown_value - FROM - (SELECT ['other_value', '$$_posthog_breakdown_null_$$', 'value'] as breakdown_value) ARRAY - JOIN breakdown_value) as sec - ORDER BY breakdown_value, - day_start - UNION ALL SELECT count(DISTINCT pdi.person_id) as total, - toStartOfDay(toTimeZone(toDateTime(timestamp, 'UTC'), 'UTC')) as day_start, - transform(ifNull(nullIf(replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', ''), ''), '$$_posthog_breakdown_null_$$'), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), '$$_posthog_breakdown_other_$$') as breakdown_value - FROM events e SAMPLE 1.0 - INNER JOIN - (SELECT distinct_id, - argMax(person_id, version) as person_id - 
FROM person_distinct_id2 - WHERE team_id = 99999 - GROUP BY distinct_id - HAVING argMax(is_deleted, version) = 0) as pdi ON events.distinct_id = pdi.distinct_id - WHERE e.team_id = 99999 - AND event = 'sign up' - AND toTimeZone(timestamp, 'UTC') >= toDateTime(toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')), 'UTC') - AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') - GROUP BY day_start, - breakdown_value)) - GROUP BY day_start, - breakdown_value - ORDER BY breakdown_value, - day_start) - GROUP BY breakdown_value - ORDER BY breakdown_value + /* celery:posthog.tasks.tasks.sync_insight_caching_state */ + SELECT team_id, + date_diff('second', max(timestamp), now()) AS age + FROM events + WHERE timestamp > date_sub(DAY, 3, now()) + AND timestamp < now() + GROUP BY team_id + ORDER BY age; ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.5 From 8a670dddafe29855eb069be531716621cc3f62dd Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:37:06 +0000 Subject: [PATCH 07/34] Update query snapshots --- posthog/api/test/__snapshots__/test_decide.ambr | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/posthog/api/test/__snapshots__/test_decide.ambr b/posthog/api/test/__snapshots__/test_decide.ambr index 5caca4e947152..a3d7eafe4ea90 100644 --- a/posthog/api/test/__snapshots__/test_decide.ambr +++ b/posthog/api/test/__snapshots__/test_decide.ambr @@ -3890,6 +3890,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id", "posthog_team"."id", @@ -4260,6 +4261,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -4693,6 +4695,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -4986,6 +4989,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -5171,6 +5175,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id", "posthog_team"."id", @@ -5602,6 +5607,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -5834,6 +5840,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -6165,6 +6172,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", 
"posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -6344,6 +6352,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -6475,6 +6484,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id", "posthog_team"."id", @@ -6906,6 +6916,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -7138,6 +7149,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -7465,6 +7477,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" @@ -7640,6 +7653,7 @@ "posthog_hogfunction"."inputs", "posthog_hogfunction"."encrypted_inputs", "posthog_hogfunction"."filters", + "posthog_hogfunction"."mappings", "posthog_hogfunction"."masking", "posthog_hogfunction"."template_id" FROM "posthog_hogfunction" From 280cd488753561f717a274c319ecf11347be756d Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:39:31 +0000 Subject: [PATCH 08/34] Update query snapshots --- .../test/__snapshots__/test_trends.ambr | 152 ++++++++++++++---- 1 file changed, 120 insertions(+), 32 deletions(-) diff --git a/posthog/queries/test/__snapshots__/test_trends.ambr b/posthog/queries/test/__snapshots__/test_trends.ambr index f43d12a9d3137..01ab1c2e0e23e 100644 --- a/posthog/queries/test/__snapshots__/test_trends.ambr +++ b/posthog/queries/test/__snapshots__/test_trends.ambr @@ -871,14 +871,18 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.1 ''' - /* celery:posthog.tasks.tasks.sync_insight_caching_state */ - SELECT team_id, - date_diff('second', max(timestamp), now()) AS age - FROM events - WHERE timestamp > date_sub(DAY, 3, now()) - AND timestamp < now() - GROUP BY team_id - ORDER BY age; + + SELECT replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', '') AS value, + count(*) as count + FROM events e SAMPLE 1.0 + WHERE team_id = 99999 + AND ((event = 'sign up')) + AND toTimeZone(timestamp, 'UTC') >= toDateTime('2019-12-28 00:00:00', 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') + GROUP BY value + ORDER BY count DESC, value DESC + LIMIT 26 + OFFSET 0 ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.10 @@ -1043,38 +1047,122 @@ # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.2 ''' - /* celery:posthog.tasks.tasks.sync_insight_caching_state */ - SELECT team_id, - date_diff('second', max(timestamp), now()) AS age - FROM events - WHERE timestamp > date_sub(DAY, 3, now()) - AND timestamp < now() - GROUP BY team_id - ORDER BY age; + + SELECT groupArray(day_start) as date, + groupArray(count) AS total, + breakdown_value + 
FROM + (SELECT SUM(total) as count, + day_start, + breakdown_value + FROM + (SELECT * + FROM + (SELECT toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + FROM + (SELECT toStartOfDay(toDateTime('2020-01-04 23:59:59', 'UTC')) - toIntervalDay(number) as day_start + FROM numbers(8) + UNION ALL SELECT toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')) as day_start) as ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['other_value', '$$_posthog_breakdown_null_$$', 'value'] as breakdown_value) ARRAY + JOIN breakdown_value) as sec + ORDER BY breakdown_value, + day_start + UNION ALL SELECT count(DISTINCT pdi.person_id) as total, + toStartOfDay(toTimeZone(toDateTime(timestamp, 'UTC'), 'UTC')) as day_start, + transform(ifNull(nullIf(replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', ''), ''), '$$_posthog_breakdown_null_$$'), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), '$$_posthog_breakdown_other_$$') as breakdown_value + FROM events e SAMPLE 1.0 + INNER JOIN + (SELECT distinct_id, + argMax(person_id, version) as person_id + FROM person_distinct_id2 + WHERE team_id = 99999 + GROUP BY distinct_id + HAVING argMax(is_deleted, version) = 0) as pdi ON events.distinct_id = pdi.distinct_id + WHERE e.team_id = 99999 + AND toTimeZone(timestamp, 'UTC') >= toDateTime(toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')), 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') + AND ((event = 'sign up')) + GROUP BY day_start, + breakdown_value)) + GROUP BY day_start, + breakdown_value + ORDER BY breakdown_value, + day_start) + GROUP BY breakdown_value + ORDER BY breakdown_value ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.3 ''' - /* celery:posthog.tasks.tasks.sync_insight_caching_state */ - SELECT team_id, - date_diff('second', max(timestamp), now()) AS age - FROM events - WHERE timestamp > date_sub(DAY, 3, now()) - AND timestamp < now() - GROUP BY team_id - ORDER BY age; + + SELECT replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', '') AS value, + count(*) as count + FROM events e SAMPLE 1.0 + WHERE team_id = 99999 + AND event = 'sign up' + AND toTimeZone(timestamp, 'UTC') >= toDateTime('2019-12-28 00:00:00', 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') + GROUP BY value + ORDER BY count DESC, value DESC + LIMIT 26 + OFFSET 0 ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.4 ''' - /* celery:posthog.tasks.tasks.sync_insight_caching_state */ - SELECT team_id, - date_diff('second', max(timestamp), now()) AS age - FROM events - WHERE timestamp > date_sub(DAY, 3, now()) - AND timestamp < now() - GROUP BY team_id - ORDER BY age; + + SELECT groupArray(day_start) as date, + groupArray(count) AS total, + breakdown_value + FROM + (SELECT SUM(total) as count, + day_start, + breakdown_value + FROM + (SELECT * + FROM + (SELECT toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + FROM + (SELECT toStartOfDay(toDateTime('2020-01-04 23:59:59', 'UTC')) - toIntervalDay(number) as day_start + FROM numbers(8) + UNION ALL SELECT toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')) as day_start) as ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['other_value', '$$_posthog_breakdown_null_$$', 'value'] as breakdown_value) ARRAY + JOIN breakdown_value) as sec + ORDER BY breakdown_value, + day_start + UNION ALL 
SELECT count(DISTINCT pdi.person_id) as total, + toStartOfDay(toTimeZone(toDateTime(timestamp, 'UTC'), 'UTC')) as day_start, + transform(ifNull(nullIf(replaceRegexpAll(JSONExtractRaw(properties, '$some_property'), '^"|"$', ''), ''), '$$_posthog_breakdown_null_$$'), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), (['other_value', '$$_posthog_breakdown_null_$$', 'value']), '$$_posthog_breakdown_other_$$') as breakdown_value + FROM events e SAMPLE 1.0 + INNER JOIN + (SELECT distinct_id, + argMax(person_id, version) as person_id + FROM person_distinct_id2 + WHERE team_id = 99999 + GROUP BY distinct_id + HAVING argMax(is_deleted, version) = 0) as pdi ON events.distinct_id = pdi.distinct_id + WHERE e.team_id = 99999 + AND event = 'sign up' + AND toTimeZone(timestamp, 'UTC') >= toDateTime(toStartOfDay(toDateTime('2019-12-28 00:00:00', 'UTC')), 'UTC') + AND toTimeZone(timestamp, 'UTC') <= toDateTime('2020-01-04 23:59:59', 'UTC') + GROUP BY day_start, + breakdown_value)) + GROUP BY day_start, + breakdown_value + ORDER BY breakdown_value, + day_start) + GROUP BY breakdown_value + ORDER BY breakdown_value ''' # --- # name: TestTrends.test_dau_with_breakdown_filtering_with_sampling.5 From 883750f6ab7fad95495d7d090c62b76b6be6ed4c Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 12 Dec 2024 05:11:59 -0800 Subject: [PATCH 09/34] Adjust test values --- .../test/test_trends_statistics_continuous.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index c129de180af7e..2423a8556b65c 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -268,23 +268,23 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1.0) # Both variants should have wide intervals due to small sample size - self.assertTrue(70 < intervals["control"][0] < 80) - self.assertTrue(120 < intervals["control"][1] < 130) + self.assertTrue(70 < intervals["control"][0] < 90) + self.assertTrue(100 < intervals["control"][1] < 120) - self.assertTrue(90 < intervals["test"][0] < 100) - self.assertTrue(140 < intervals["test"][1] < 150) + self.assertTrue(85 < intervals["test"][0] < 105) + self.assertTrue(115 < intervals["test"][1] < 135) else: # Original implementation behavior for insufficient sample size - self.assertTrue(0.3 < probabilities[0] < 0.7) - self.assertTrue(0.3 < probabilities[1] < 0.7) + self.assertTrue(0.05 < probabilities[0] < 0.1) + self.assertTrue(0.85 < probabilities[1] < 1.0) self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) self.assertEqual(p_value, 1.0) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1) - self.assertTrue(intervals["control"][1] > 1) - self.assertTrue(intervals["test"][0] < 1) - self.assertTrue(intervals["test"][1] > 1) + self.assertTrue(1.5 <= intervals["control"][0] < 1.8) + self.assertTrue(2.3 <= intervals["control"][1] < 2.6) + self.assertTrue(1.8 <= intervals["test"][0] < 2.1) + self.assertTrue(2.6 <= intervals["test"][1] < 2.9) self.run_test_for_both_implementations(run_test) From 4be08a03cbee48e8a67c079f14df6633d2dfcd5e Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 12 Dec 2024 05:12:15 -0800 Subject: [PATCH 10/34] 
Use `absolute_exposure`, not relative exposure --- .../trends_statistics_v2_continuous.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index 404b82d2257df..f5a88276010db 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -56,10 +56,10 @@ def calculate_probabilities_v2_continuous( log_variance = 0.25 # Assumed variance in log-space # Update parameters for control - kappa_n_control = KAPPA_0 + control_variant.exposure - mu_n_control = (KAPPA_0 * MU_0 + control_variant.exposure * log_control_mean) / kappa_n_control - alpha_n_control = ALPHA_0 + control_variant.exposure / 2 - beta_n_control = BETA_0 + 0.5 * control_variant.exposure * log_variance + kappa_n_control = KAPPA_0 + control_variant.absolute_exposure + mu_n_control = (KAPPA_0 * MU_0 + control_variant.absolute_exposure * log_control_mean) / kappa_n_control + alpha_n_control = ALPHA_0 + control_variant.absolute_exposure / 2 + beta_n_control = BETA_0 + 0.5 * control_variant.absolute_exposure * log_variance # Draw samples from control posterior control_posterior = t( @@ -72,10 +72,10 @@ def calculate_probabilities_v2_continuous( for test in test_variants: log_test_mean = np.log(test.count + EPSILON) # Using count field to store mean value - kappa_n_test = KAPPA_0 + test.exposure - mu_n_test = (KAPPA_0 * MU_0 + test.exposure * log_test_mean) / kappa_n_test - alpha_n_test = ALPHA_0 + test.exposure / 2 - beta_n_test = BETA_0 + 0.5 * test.exposure * log_variance + kappa_n_test = KAPPA_0 + test.absolute_exposure + mu_n_test = (KAPPA_0 * MU_0 + test.absolute_exposure * log_test_mean) / kappa_n_test + alpha_n_test = ALPHA_0 + test.absolute_exposure / 2 + beta_n_test = BETA_0 + 0.5 * test.absolute_exposure * log_variance test_posterior = t( df=2 * alpha_n_test, loc=mu_n_test, scale=np.sqrt(beta_n_test / (kappa_n_test * alpha_n_test)) From c6c58b4a0da22cdd8734037833e51063364cf7c3 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 04:51:40 -0800 Subject: [PATCH 11/34] Introduce a new `assertRange()` method --- .../test/test_trends_statistics_continuous.py | 151 +++++++++--------- 1 file changed, 79 insertions(+), 72 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 2423a8556b65c..e87411e7a7f6d 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -30,6 +30,7 @@ def create_variant_with_different_exposures( class TestExperimentTrendsStatisticsContinuous(APIBaseTest): def run_test_for_both_implementations(self, test_fn): """Run the same test for both implementations""" + self.stats_version = 1 # Run for original implementation test_fn( stats_version=1, @@ -37,6 +38,7 @@ def run_test_for_both_implementations(self, test_fn): are_results_significant=are_results_significant, calculate_credible_intervals=calculate_credible_intervals, ) + self.stats_version = 2 # Run for v2 implementation test_fn( stats_version=2, @@ -45,6 +47,11 @@ def run_test_for_both_implementations(self, test_fn): calculate_credible_intervals=calculate_credible_intervals_v2_continuous, ) + def assertRange(self, value, range: 
tuple[float, float]): + self.assertTrue( + range[0] <= value <= range[1], f"{value} is not in range {range} (stats version {self.stats_version})" + ) + def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" @@ -58,30 +65,30 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(0.4 < probabilities[0] < 0.6) # Close to 50/50 - self.assertTrue(0.4 < probabilities[1] < 0.6) # Close to 50/50 + self.assertRange(probabilities[0], (0.4, 0.6)) # Close to 50/50 + self.assertRange(probabilities[1], (0.4, 0.6)) # Close to 50/50 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Control: ~$100 mean with wide interval due to small sample - self.assertTrue(80 < intervals["control"][0] < 90) # Lower bound - self.assertTrue(110 < intervals["control"][1] < 120) # Upper bound + self.assertRange(intervals["control"][0], (80, 90)) # Lower bound + self.assertRange(intervals["control"][1], (110, 120)) # Upper bound # Test: ~$105 mean with wide interval due to small sample - self.assertTrue(85 < intervals["test"][0] < 95) # Lower bound - self.assertTrue(115 < intervals["test"][1] < 125) # Upper bound + self.assertRange(intervals["test"][0], (85, 95)) # Lower bound + self.assertRange(intervals["test"][1], (115, 125)) # Upper bound else: # Original implementation behavior for small sample - self.assertTrue(0.3 < probabilities[0] < 0.7) - self.assertTrue(0.3 < probabilities[1] < 0.7) + self.assertRange(probabilities[0], (0.3, 0.7)) + self.assertRange(probabilities[1], (0.3, 0.7)) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1) # Lower bound is less than mean - self.assertTrue(intervals["control"][1] > 1) # Upper bound is greater than mean - self.assertTrue(intervals["test"][0] < 1) - self.assertTrue(intervals["test"][1] > 1) + self.assertRange(intervals["control"][0], (0.8, 1.2)) # Lower bound is less than mean + self.assertRange(intervals["control"][1], (1.1, 1.3)) # Upper bound is greater than mean + self.assertRange(intervals["test"][0], (0.8, 1.2)) + self.assertRange(intervals["test"][1], (1.1, 1.3)) self.run_test_for_both_implementations(run_test) @@ -98,18 +105,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(probabilities[1] > 0.95) # Test variant strongly winning - self.assertTrue(probabilities[0] < 0.05) # Control variant strongly losing + self.assertRange(probabilities[1], (0.95, 1.0)) # Test variant strongly winning + self.assertRange(probabilities[0], (0.0, 0.05)) # Control variant strongly losing self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) # Control: $100 mean with narrow interval due to large sample - self.assertTrue(98 < intervals["control"][0] < 102) # Lower bound - self.assertTrue(98 < intervals["control"][1] < 102) # Upper bound + self.assertRange(intervals["control"][0], (98, 102)) # Lower bound + self.assertRange(intervals["control"][1], (98, 102)) # Upper bound # Test: $120 mean with narrow interval due to large sample - self.assertTrue(118 < intervals["test"][0] < 122) # Lower bound - self.assertTrue(118 < 
intervals["test"][1] < 122) # Upper bound + self.assertRange(intervals["test"][0], (118, 122)) # Lower bound + self.assertRange(intervals["test"][1], (118, 122)) # Upper bound else: # Original implementation behavior for large sample self.assertTrue(probabilities[1] > 0.5) # Test variant winning @@ -138,18 +145,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(probabilities[1] > 0.99) # Test variant very strongly winning - self.assertTrue(probabilities[0] < 0.01) # Control variant very strongly losing + self.assertRange(probabilities[1], (0.99, 1.0)) # Test variant very strongly winning + self.assertRange(probabilities[0], (0.0, 0.01)) # Control variant very strongly losing self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) # Control: $100 mean - self.assertTrue(98 < intervals["control"][0] < 102) # Lower bound - self.assertTrue(98 < intervals["control"][1] < 102) # Upper bound + self.assertRange(intervals["control"][0], (98, 102)) # Lower bound + self.assertRange(intervals["control"][1], (98, 102)) # Upper bound # Test: $150 mean, clearly higher than control - self.assertTrue(147 < intervals["test"][0] < 153) # Lower bound - self.assertTrue(147 < intervals["test"][1] < 153) # Upper bound + self.assertRange(intervals["test"][0], (147, 153)) # Lower bound + self.assertRange(intervals["test"][1], (147, 153)) # Upper bound else: # Original implementation behavior for strongly significant case self.assertTrue(probabilities[1] > 0.5) # Test variant winning @@ -184,8 +191,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # All variants around $100 with overlapping intervals for variant_key in ["control", "test_a", "test_b", "test_c"]: - self.assertTrue(90 < intervals[variant_key][0] < 95) # Lower bounds - self.assertTrue(105 < intervals[variant_key][1] < 110) # Upper bounds + self.assertRange(intervals[variant_key][0], (90, 95)) # Lower bounds + self.assertRange(intervals[variant_key][1], (105, 110)) # Upper bounds else: # Original implementation behavior for multiple variants with no clear winner self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -221,20 +228,20 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control at $100 - self.assertTrue(98 < intervals["control"][0] < 102) - self.assertTrue(98 < intervals["control"][1] < 102) + self.assertRange(intervals["control"][0], (98, 102)) + self.assertRange(intervals["control"][1], (98, 102)) # Test A slightly higher at $105 - self.assertTrue(103 < intervals["test_a"][0] < 107) - self.assertTrue(103 < intervals["test_a"][1] < 107) + self.assertRange(intervals["test_a"][0], (103, 107)) + self.assertRange(intervals["test_a"][1], (103, 107)) # Test B clearly winning at $150 - self.assertTrue(147 < intervals["test_b"][0] < 153) - self.assertTrue(147 < intervals["test_b"][1] < 153) + self.assertRange(intervals["test_b"][0], (147, 153)) + self.assertRange(intervals["test_b"][1], (147, 153)) # Test C slightly higher at $110 - self.assertTrue(108 < intervals["test_c"][0] < 112) - self.assertTrue(108 < intervals["test_c"][1] < 112) + self.assertRange(intervals["test_c"][0], (108, 112)) + self.assertRange(intervals["test_c"][1], (108, 112)) else: # Original implementation behavior for multiple variants with clear winner self.assertTrue(probabilities[2] > 0.5) # test_b should be 
winning @@ -262,29 +269,29 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(probabilities[0] < 0.5) # Control has lower probability - self.assertTrue(probabilities[1] > 0.5) # Test has higher probability + self.assertRange(probabilities[0], (0.0, 0.5)) # Control has lower probability + self.assertRange(probabilities[1], (0.5, 1.0)) # Test has higher probability self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) self.assertEqual(p_value, 1.0) # Both variants should have wide intervals due to small sample size - self.assertTrue(70 < intervals["control"][0] < 90) - self.assertTrue(100 < intervals["control"][1] < 120) + self.assertRange(intervals["control"][0], (70, 90)) + self.assertRange(intervals["control"][1], (100, 120)) - self.assertTrue(85 < intervals["test"][0] < 105) - self.assertTrue(115 < intervals["test"][1] < 135) + self.assertRange(intervals["test"][0], (85, 105)) + self.assertRange(intervals["test"][1], (115, 135)) else: # Original implementation behavior for insufficient sample size - self.assertTrue(0.05 < probabilities[0] < 0.1) - self.assertTrue(0.85 < probabilities[1] < 1.0) + self.assertRange(probabilities[0], (0.05, 0.1)) + self.assertRange(probabilities[1], (0.85, 1.0)) self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) self.assertEqual(p_value, 1.0) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(1.5 <= intervals["control"][0] < 1.8) - self.assertTrue(2.3 <= intervals["control"][1] < 2.6) - self.assertTrue(1.8 <= intervals["test"][0] < 2.1) - self.assertTrue(2.6 <= intervals["test"][1] < 2.9) + self.assertRange(intervals["control"][0], (1.5, 1.8)) + self.assertRange(intervals["control"][1], (2.3, 2.6)) + self.assertRange(intervals["test"][0], (1.8, 2.1)) + self.assertRange(intervals["test"][1], (2.6, 2.9)) self.run_test_for_both_implementations(run_test) @@ -301,21 +308,21 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(abs(probabilities[0] - 0.5) < 0.1) # Should be close to 50/50 - self.assertTrue(abs(probabilities[1] - 0.5) < 0.1) # Should be close to 50/50 + self.assertRange(probabilities[0], (0.4, 0.6)) # Should be close to 50/50 + self.assertRange(probabilities[1], (0.4, 0.6)) # Should be close to 50/50 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Both variants should have very small intervals near zero - self.assertTrue(0 <= intervals["control"][0] < 0.1) - self.assertTrue(0 <= intervals["control"][1] < 0.1) + self.assertRange(intervals["control"][0], (0, 0.1)) + self.assertRange(intervals["control"][1], (0, 0.1)) - self.assertTrue(0 <= intervals["test"][0] < 0.1) - self.assertTrue(0 <= intervals["test"][1] < 0.1) + self.assertRange(intervals["test"][0], (0, 0.1)) + self.assertRange(intervals["test"][1], (0, 0.1)) else: # Original implementation behavior for zero means - self.assertTrue(0.4 < probabilities[0] < 0.6) - self.assertTrue(0.4 < probabilities[1] < 0.6) + self.assertRange(probabilities[0], (0.4, 0.6)) + self.assertRange(probabilities[1], (0.4, 0.6)) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -342,32 +349,32 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca 
self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(abs(probabilities[0] - 0.5) < 0.1) # Should be close to 50/50 - self.assertTrue(abs(probabilities[1] - 0.5) < 0.1) # Should be close to 50/50 + self.assertRange(probabilities[0], (0.4, 0.6)) # Should be close to 50/50 + self.assertRange(probabilities[1], (0.4, 0.6)) # Should be close to 50/50 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Both variants should have intervals appropriate for their small means # For a mean of 0.0001, expect intervals to be within an order of magnitude - self.assertTrue(0.00005 <= intervals["control"][0] <= 0.0002) # Lower bound - self.assertTrue(0.00005 <= intervals["control"][1] <= 0.0002) # Upper bound + self.assertRange(intervals["control"][0], (0.00005, 0.0002)) # Lower bound + self.assertRange(intervals["control"][1], (0.00005, 0.0002)) # Upper bound - self.assertTrue(0.00005 <= intervals["test"][0] <= 0.0002) # Lower bound - self.assertTrue(0.00005 <= intervals["test"][1] <= 0.0002) # Upper bound + self.assertRange(intervals["test"][0], (0.00005, 0.0002)) # Lower bound + self.assertRange(intervals["test"][1], (0.00005, 0.0002)) # Upper bound else: # Original implementation behavior for near-zero means - self.assertTrue(0.4 < probabilities[0] < 0.6) - self.assertTrue(0.4 < probabilities[1] < 0.6) + self.assertRange(probabilities[0], (0.4, 0.6)) + self.assertRange(probabilities[1], (0.4, 0.6)) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean # For near-zero means, the intervals become very small ratios # This is expected behavior when dealing with values close to zero - self.assertTrue(0 <= intervals["control"][0] <= 0.0001) # Lower bound ratio - self.assertTrue(0 <= intervals["control"][1] <= 0.005) # Upper bound ratio - self.assertTrue(0 <= intervals["test"][0] <= 0.0001) # Lower bound ratio - self.assertTrue(0 <= intervals["test"][1] <= 0.005) # Upper bound ratio + self.assertRange(intervals["control"][0], (0, 0.0001)) # Lower bound ratio + self.assertRange(intervals["control"][1], (0, 0.005)) # Upper bound ratio + self.assertRange(intervals["test"][0], (0, 0.0001)) # Lower bound ratio + self.assertRange(intervals["test"][1], (0, 0.005)) # Upper bound ratio self.run_test_for_both_implementations(run_test) @@ -386,18 +393,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertTrue(probabilities[0] < 0.1) - self.assertTrue(0.9 < probabilities[1]) + self.assertRange(probabilities[0], (0.0, 0.1)) + self.assertRange(probabilities[1], (0.9, 1.0)) self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) # Control at $100 mean - self.assertTrue(98 < intervals["control"][0] < 102) - self.assertTrue(98 < intervals["control"][1] < 102) + self.assertRange(intervals["control"][0], (98, 102)) + self.assertRange(intervals["control"][1], (98, 102)) # Test at $120 mean - self.assertTrue(118 < intervals["test"][0] < 122) - self.assertTrue(118 < intervals["test"][1] < 122) + self.assertRange(intervals["test"][0], (118, 122)) + self.assertRange(intervals["test"][1], (118, 122)) else: # Original implementation behavior for different exposures self.assertTrue(probabilities[1] > 0.5) # Test variant winning From 2a27a31e2ea96fe240fa05182f78b71f0cf1726e 
Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 05:15:55 -0800 Subject: [PATCH 12/34] Increase log variance to increase uncertainty for small sample sizes --- .../experiments/test/test_trends_statistics_continuous.py | 4 ++-- .../experiments/trends_statistics_v2_continuous.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index e87411e7a7f6d..854521c933885 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -72,11 +72,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Control: ~$100 mean with wide interval due to small sample self.assertRange(intervals["control"][0], (80, 90)) # Lower bound - self.assertRange(intervals["control"][1], (110, 120)) # Upper bound + self.assertRange(intervals["control"][1], (100, 120)) # Upper bound # Test: ~$105 mean with wide interval due to small sample self.assertRange(intervals["test"][0], (85, 95)) # Lower bound - self.assertRange(intervals["test"][1], (115, 125)) # Upper bound + self.assertRange(intervals["test"][1], (105, 125)) # Upper bound else: # Original implementation behavior for small sample self.assertRange(probabilities[0], (0.3, 0.7)) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index f5a88276010db..384fa90496e3a 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -53,7 +53,7 @@ def calculate_probabilities_v2_continuous( # Calculate posterior parameters for control log_control_mean = np.log(control_variant.count + EPSILON) # Using count field to store mean value - log_variance = 0.25 # Assumed variance in log-space + log_variance = 2 # Assumed variance in log-space # Update parameters for control kappa_n_control = KAPPA_0 + control_variant.absolute_exposure From d3b8c7e83361a834ba33c0d17d8b41e54f48e583 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 05:37:29 -0800 Subject: [PATCH 13/34] Use correct exposure values --- .../test/test_trends_statistics_continuous.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 854521c933885..b2729ee21607a 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -13,17 +13,8 @@ from posthog.test.base import APIBaseTest -def create_variant(key: str, mean: float, exposure: int) -> ExperimentVariantTrendsBaseStats: +def create_variant(key: str, mean: float, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats: # Note: We use the count field to store the mean value for continuous metrics - return ExperimentVariantTrendsBaseStats(key=key, count=mean, exposure=exposure, absolute_exposure=exposure) - - -def create_variant_with_different_exposures( - key: str, - mean: float, - exposure: float, # relative exposure - absolute_exposure: int, # absolute exposure -) -> ExperimentVariantTrendsBaseStats: 
return ExperimentVariantTrendsBaseStats(key=key, count=mean, exposure=exposure, absolute_exposure=absolute_exposure) @@ -382,10 +373,15 @@ def test_different_relative_and_absolute_exposure(self): """Test that credible intervals are calculated using absolute_exposure rather than relative exposure""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant_with_different_exposures( - "control", mean=100.0, exposure=1, absolute_exposure=10000 + control_absolute_exposure = 10000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 12000 + test = create_variant( + "test", + mean=120.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, ) - test = create_variant_with_different_exposures("test", mean=120.0, exposure=1.2, absolute_exposure=12000) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) From 61bfc39769f787414b3ef8529dab245e093c11c4 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 05:41:20 -0800 Subject: [PATCH 14/34] Use correct absolute exposure for these tests --- .../test/test_trends_statistics_continuous.py | 92 ++++++++++++++++--- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index b2729ee21607a..9f73a9f988a4d 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -165,10 +165,29 @@ def test_many_variants_not_significant(self): """Test with multiple variants, no clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=1000) - test_a = create_variant("test_a", mean=98.0, exposure=1000) - test_b = create_variant("test_b", mean=102.0, exposure=1000) - test_c = create_variant("test_c", mean=101.0, exposure=1000) + control_absolute_exposure = 1000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_a_absolute_exposure = 1000 + test_a = create_variant( + "test_a", + mean=98.0, + exposure=test_a_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_a_absolute_exposure, + ) + test_b_absolute_exposure = 1000 + test_b = create_variant( + "test_b", + mean=102.0, + exposure=test_b_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_b_absolute_exposure, + ) + test_c_absolute_exposure = 1000 + test_c = create_variant( + "test_c", + mean=101.0, + exposure=test_c_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_c_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) @@ -201,10 +220,29 @@ def test_many_variants_significant(self): """Test with multiple variants, one clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=10000) - test_a = create_variant("test_a", mean=105.0, 
exposure=10000) - test_b = create_variant("test_b", mean=150.0, exposure=10000) - test_c = create_variant("test_c", mean=110.0, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_a_absolute_exposure = 10000 + test_a = create_variant( + "test_a", + mean=105.0, + exposure=test_a_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_a_absolute_exposure, + ) + test_b_absolute_exposure = 10000 + test_b = create_variant( + "test_b", + mean=150.0, + exposure=test_b_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_b_absolute_exposure, + ) + test_c_absolute_exposure = 10000 + test_c = create_variant( + "test_c", + mean=110.0, + exposure=test_c_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_c_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) @@ -251,8 +289,15 @@ def test_insufficient_sample_size(self): """Test with sample size below threshold""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=50) - test = create_variant("test", mean=120.0, exposure=50) + control_absolute_exposure = 50 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 50 + test = create_variant( + "test", + mean=120.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -290,8 +335,15 @@ def test_edge_cases_zero_means(self): """Test edge cases like zero means""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=0.0, exposure=1000) - test = create_variant("test", mean=0.0, exposure=1000) + control_absolute_exposure = 1000 + control = create_variant("control", mean=0.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 1000 + test = create_variant( + "test", + mean=0.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -331,8 +383,20 @@ def test_edge_cases_near_zero_means(self): def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): # Using very small positive values instead of exact zeros - control = create_variant("control", mean=0.0001, exposure=1000) - test = create_variant("test", mean=0.0001, exposure=1000) + control_absolute_exposure = 1000 + control = create_variant( + "control", + mean=0.0001, + exposure=1, + absolute_exposure=control_absolute_exposure, + ) + test_absolute_exposure = 1000 + test = create_variant( + "test", + mean=0.0001, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) From 4c14b487056819773a04c6ff177a0bd454dd1647 
Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:08:03 -0800 Subject: [PATCH 15/34] Use correct exposure values --- .../test/test_trends_statistics_continuous.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 9f73a9f988a4d..2654bfed4d727 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -47,8 +47,15 @@ def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=100) - test = create_variant("test", mean=105.0, exposure=100) + control_absolute_exposure = 100 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 100 + test = create_variant( + "test", + mean=105.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -87,8 +94,15 @@ def test_large_sample_two_variants_significant(self): """Test with large sample size, two variants, clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=10000) - test = create_variant("test", mean=120.0, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + mean=120.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -127,8 +141,15 @@ def test_large_sample_two_variants_strongly_significant(self): """Test with large sample size, two variants, very clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", mean=100.0, exposure=10000) - test = create_variant("test", mean=150.0, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + mean=150.0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) From 6c0ab9c1fc4c90d2398198695d390dc27972d7fa Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:15:04 -0800 Subject: [PATCH 16/34] Adjust range values for `test_many_variants_not_significant` --- .../test/test_trends_statistics_continuous.py | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git 
a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 2654bfed4d727..045dbc1d0b473 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -221,9 +221,23 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # All variants around $100 with overlapping intervals - for variant_key in ["control", "test_a", "test_b", "test_c"]: - self.assertRange(intervals[variant_key][0], (90, 95)) # Lower bounds - self.assertRange(intervals[variant_key][1], (105, 110)) # Upper bounds + lower_bound = (90, 100) + upper_bound = (100, 110) + # Control variant + self.assertRange(intervals["control"][0], lower_bound) # Lower bound + self.assertRange(intervals["control"][1], upper_bound) # Upper bound + + # Test A variant + self.assertRange(intervals["test_a"][0], lower_bound) # Lower bound + self.assertRange(intervals["test_a"][1], upper_bound) # Upper bound + + # Test B variant + self.assertRange(intervals["test_b"][0], lower_bound) # Lower bound + self.assertRange(intervals["test_b"][1], upper_bound) # Upper bound + + # Test C variant + self.assertRange(intervals["test_c"][0], lower_bound) # Lower bound + self.assertRange(intervals["test_c"][1], upper_bound) # Upper bound else: # Original implementation behavior for multiple variants with no clear winner self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -231,9 +245,23 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - for variant_key in ["control", "test_a", "test_b", "test_c"]: - self.assertTrue(intervals[variant_key][0] < 1) - self.assertTrue(intervals[variant_key][1] > 1) + lower_bound = (0.075, 0.095) + upper_bound = (0.11, 0.13) + # Control variant + self.assertRange(intervals["control"][0], lower_bound) + self.assertRange(intervals["control"][1], upper_bound) + + # Test A variant + self.assertRange(intervals["test_a"][0], lower_bound) + self.assertRange(intervals["test_a"][1], upper_bound) + + # Test B variant + self.assertRange(intervals["test_b"][0], lower_bound) + self.assertRange(intervals["test_b"][1], upper_bound) + + # Test C variant + self.assertRange(intervals["test_c"][0], lower_bound) + self.assertRange(intervals["test_c"][1], upper_bound) self.run_test_for_both_implementations(run_test) From 4aa1fc626ded4dc8f13312ff2f708da9345dfba4 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:23:16 -0800 Subject: [PATCH 17/34] Update values for `test_large_sample_two_variants_significant` --- .../test/test_trends_statistics_continuous.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 045dbc1d0b473..6f6f30d9ff395 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -124,16 +124,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertRange(intervals["test"][1], (118, 122)) # Upper bound else: # Original implementation behavior for large sample - 
self.assertTrue(probabilities[1] > 0.5) # Test variant winning - self.assertTrue(probabilities[0] < 0.5) # Control variant losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) - self.assertLess(p_value, 0.05) + self.assertRange(probabilities[1], (0.5, 1.0)) # Test variant winning + self.assertRange(probabilities[0], (0.0, 0.5)) # Control variant losing + self.assertTrue( + significance in [ExperimentSignificanceCode.HIGH_P_VALUE, ExperimentSignificanceCode.SIGNIFICANT] + ) + self.assertRange(p_value, (0, 0.3)) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1) - self.assertTrue(intervals["control"][1] > 1) - self.assertTrue(intervals["test"][0] < 1) - self.assertTrue(intervals["test"][1] > 1) + self.assertRange(intervals["control"][0], (0, 0.1)) # Lower bound less than mean + self.assertRange(intervals["control"][1], (0.01, 0.02)) # Upper bound greater than mean + self.assertRange(intervals["test"][0], (0, 0.1)) # Lower bound less than mean + self.assertRange(intervals["test"][1], (0.01, 0.02)) # Upper bound greater than mean self.run_test_for_both_implementations(run_test) From 6962e977d112026fbd633f6b07b2538ca8714121 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:32:45 -0800 Subject: [PATCH 18/34] Update values for `test_different_relative_and_absolute_exposure` --- .../test/test_trends_statistics_continuous.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 6f6f30d9ff395..674074cddf883 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -518,15 +518,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertRange(intervals["test"][1], (118, 122)) else: # Original implementation behavior for different exposures - self.assertTrue(probabilities[1] > 0.5) # Test variant winning - self.assertTrue(probabilities[0] < 0.5) # Control variant losing - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) - self.assertLess(p_value, 0.05) + self.assertRange(probabilities[1], (0.4, 0.6)) # Close to 50/50 + self.assertRange(probabilities[0], (0.4, 0.6)) # Close to 50/50 + self.assertTrue( + significance + in [ExperimentSignificanceCode.LOW_WIN_PROBABILITY, ExperimentSignificanceCode.SIGNIFICANT] + ) + self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertTrue(intervals["control"][0] < 1) - self.assertTrue(intervals["control"][1] > 1) - self.assertTrue(intervals["test"][0] < 1) - self.assertTrue(intervals["test"][1] > 1) + self.assertRange(intervals["control"][0], (0.007, 0.009)) + self.assertRange(intervals["control"][1], (0.01, 0.02)) + self.assertRange(intervals["test"][0], (0.007, 0.009)) + self.assertRange(intervals["test"][1], (0.01, 0.02)) self.run_test_for_both_implementations(run_test) From 61f118c61e75feb8a9e8731ad0be5eb13962cffe Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:46:22 -0800 Subject: [PATCH 19/34] Replace some use of assertRange --- .../test/test_trends_statistics_continuous.py | 140 +++++++++--------- 1 file changed, 68 insertions(+), 72 deletions(-) diff --git 
a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 674074cddf883..f292222efbeee 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -69,12 +69,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Control: ~$100 mean with wide interval due to small sample - self.assertRange(intervals["control"][0], (80, 90)) # Lower bound - self.assertRange(intervals["control"][1], (100, 120)) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 85, delta=5) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 110, delta=5) # Upper bound # Test: ~$105 mean with wide interval due to small sample - self.assertRange(intervals["test"][0], (85, 95)) # Lower bound - self.assertRange(intervals["test"][1], (105, 125)) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 90, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 115, delta=5) # Upper bound else: # Original implementation behavior for small sample self.assertRange(probabilities[0], (0.3, 0.7)) @@ -83,10 +83,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertRange(intervals["control"][0], (0.8, 1.2)) # Lower bound is less than mean - self.assertRange(intervals["control"][1], (1.1, 1.3)) # Upper bound is greater than mean - self.assertRange(intervals["test"][0], (0.8, 1.2)) - self.assertRange(intervals["test"][1], (1.1, 1.3)) + self.assertAlmostEqual(intervals["control"][0], 1.0, delta=0.2) # Lower bound is less than mean + self.assertAlmostEqual(intervals["control"][1], 1.2, delta=0.1) # Upper bound is greater than mean + self.assertAlmostEqual(intervals["test"][0], 1.0, delta=0.2) + self.assertAlmostEqual(intervals["test"][1], 1.2, delta=0.1) self.run_test_for_both_implementations(run_test) @@ -116,12 +116,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control: $100 mean with narrow interval due to large sample - self.assertRange(intervals["control"][0], (98, 102)) # Lower bound - self.assertRange(intervals["control"][1], (98, 102)) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 100, delta=2) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Upper bound # Test: $120 mean with narrow interval due to large sample - self.assertRange(intervals["test"][0], (118, 122)) # Lower bound - self.assertRange(intervals["test"][1], (118, 122)) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 120, delta=2) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 120, delta=2) # Upper bound else: # Original implementation behavior for large sample self.assertRange(probabilities[1], (0.5, 1.0)) # Test variant winning @@ -132,10 +132,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertRange(p_value, (0, 0.3)) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertRange(intervals["control"][0], (0, 0.1)) # Lower bound less than mean - self.assertRange(intervals["control"][1], (0.01, 0.02)) # Upper bound greater than mean - self.assertRange(intervals["test"][0], (0, 
0.1)) # Lower bound less than mean - self.assertRange(intervals["test"][1], (0.01, 0.02)) # Upper bound greater than mean + self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) # Lower bound less than mean + self.assertAlmostEqual(intervals["control"][1], 0.015, delta=0.005) # Upper bound greater than mean + self.assertAlmostEqual(intervals["test"][0], 0.05, delta=0.05) # Lower bound less than mean + self.assertAlmostEqual(intervals["test"][1], 0.015, delta=0.005) # Upper bound greater than mean self.run_test_for_both_implementations(run_test) @@ -165,12 +165,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control: $100 mean - self.assertRange(intervals["control"][0], (98, 102)) # Lower bound - self.assertRange(intervals["control"][1], (98, 102)) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 100, delta=2) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Upper bound # Test: $150 mean, clearly higher than control - self.assertRange(intervals["test"][0], (147, 153)) # Lower bound - self.assertRange(intervals["test"][1], (147, 153)) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 150, delta=3) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 150, delta=3) # Upper bound else: # Original implementation behavior for strongly significant case self.assertTrue(probabilities[1] > 0.5) # Test variant winning @@ -223,23 +223,21 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # All variants around $100 with overlapping intervals - lower_bound = (90, 100) - upper_bound = (100, 110) # Control variant - self.assertRange(intervals["control"][0], lower_bound) # Lower bound - self.assertRange(intervals["control"][1], upper_bound) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 95, delta=5) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 105, delta=5) # Upper bound # Test A variant - self.assertRange(intervals["test_a"][0], lower_bound) # Lower bound - self.assertRange(intervals["test_a"][1], upper_bound) # Upper bound + self.assertAlmostEqual(intervals["test_a"][0], 95, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_a"][1], 105, delta=5) # Upper bound # Test B variant - self.assertRange(intervals["test_b"][0], lower_bound) # Lower bound - self.assertRange(intervals["test_b"][1], upper_bound) # Upper bound + self.assertAlmostEqual(intervals["test_b"][0], 95, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_b"][1], 105, delta=5) # Upper bound # Test C variant - self.assertRange(intervals["test_c"][0], lower_bound) # Lower bound - self.assertRange(intervals["test_c"][1], upper_bound) # Upper bound + self.assertAlmostEqual(intervals["test_c"][0], 95, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_c"][1], 105, delta=5) # Upper bound else: # Original implementation behavior for multiple variants with no clear winner self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -247,23 +245,21 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - lower_bound = (0.075, 0.095) - upper_bound = (0.11, 0.13) # Control variant - self.assertRange(intervals["control"][0], lower_bound) - self.assertRange(intervals["control"][1], upper_bound) + self.assertAlmostEqual(intervals["control"][0], 
0.085, delta=0.01) # ~8.5% + self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.01) # ~12% # Test A variant - self.assertRange(intervals["test_a"][0], lower_bound) - self.assertRange(intervals["test_a"][1], upper_bound) + self.assertAlmostEqual(intervals["test_a"][0], 0.085, delta=0.01) # ~8.5% + self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.01) # ~12% # Test B variant - self.assertRange(intervals["test_b"][0], lower_bound) - self.assertRange(intervals["test_b"][1], upper_bound) + self.assertAlmostEqual(intervals["test_b"][0], 0.085, delta=0.01) # ~8.5% + self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.01) # ~12% # Test C variant - self.assertRange(intervals["test_c"][0], lower_bound) - self.assertRange(intervals["test_c"][1], upper_bound) + self.assertAlmostEqual(intervals["test_c"][0], 0.085, delta=0.01) # ~8.5% + self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.01) # ~12% self.run_test_for_both_implementations(run_test) @@ -362,11 +358,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1.0) # Both variants should have wide intervals due to small sample size - self.assertRange(intervals["control"][0], (70, 90)) - self.assertRange(intervals["control"][1], (100, 120)) + self.assertAlmostEqual(intervals["control"][0], 80, delta=10) + self.assertAlmostEqual(intervals["control"][1], 110, delta=10) - self.assertRange(intervals["test"][0], (85, 105)) - self.assertRange(intervals["test"][1], (115, 135)) + self.assertAlmostEqual(intervals["test"][0], 95, delta=10) + self.assertAlmostEqual(intervals["test"][1], 125, delta=10) else: # Original implementation behavior for insufficient sample size self.assertRange(probabilities[0], (0.05, 0.1)) @@ -375,10 +371,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1.0) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertRange(intervals["control"][0], (1.5, 1.8)) - self.assertRange(intervals["control"][1], (2.3, 2.6)) - self.assertRange(intervals["test"][0], (1.8, 2.1)) - self.assertRange(intervals["test"][1], (2.6, 2.9)) + self.assertAlmostEqual(intervals["control"][0], 1.65, delta=0.15) + self.assertAlmostEqual(intervals["control"][1], 2.45, delta=0.15) + self.assertAlmostEqual(intervals["test"][0], 1.95, delta=0.15) + self.assertAlmostEqual(intervals["test"][1], 2.75, delta=0.15) self.run_test_for_both_implementations(run_test) @@ -408,11 +404,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Both variants should have very small intervals near zero - self.assertRange(intervals["control"][0], (0, 0.1)) - self.assertRange(intervals["control"][1], (0, 0.1)) + self.assertAlmostEqual(intervals["control"][0], 0, delta=0.05) + self.assertAlmostEqual(intervals["control"][1], 0, delta=0.05) - self.assertRange(intervals["test"][0], (0, 0.1)) - self.assertRange(intervals["test"][1], (0, 0.1)) + self.assertAlmostEqual(intervals["test"][0], 0, delta=0.05) + self.assertAlmostEqual(intervals["test"][1], 0, delta=0.05) else: # Original implementation behavior for zero means self.assertRange(probabilities[0], (0.4, 0.6)) @@ -422,10 +418,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Original implementation returns intervals as ratios/multipliers of the mean # For zero means, the intervals should still be valid ratios - 
self.assertTrue(intervals["control"][0] >= 0) - self.assertTrue(intervals["control"][1] >= 0) - self.assertTrue(intervals["test"][0] >= 0) - self.assertTrue(intervals["test"][1] >= 0) + self.assertAlmostEqual(intervals["control"][0], 0, delta=0.1) + self.assertAlmostEqual(intervals["control"][1], 0, delta=0.1) + self.assertAlmostEqual(intervals["test"][0], 0, delta=0.1) + self.assertAlmostEqual(intervals["test"][1], 0, delta=0.1) self.run_test_for_both_implementations(run_test) @@ -462,11 +458,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Both variants should have intervals appropriate for their small means # For a mean of 0.0001, expect intervals to be within an order of magnitude - self.assertRange(intervals["control"][0], (0.00005, 0.0002)) # Lower bound - self.assertRange(intervals["control"][1], (0.00005, 0.0002)) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 0.0001, delta=0.00015) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 0.0001, delta=0.00015) # Upper bound - self.assertRange(intervals["test"][0], (0.00005, 0.0002)) # Lower bound - self.assertRange(intervals["test"][1], (0.00005, 0.0002)) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 0.0001, delta=0.00015) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 0.0001, delta=0.00015) # Upper bound else: # Original implementation behavior for near-zero means self.assertRange(probabilities[0], (0.4, 0.6)) @@ -477,10 +473,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Original implementation returns intervals as ratios/multipliers of the mean # For near-zero means, the intervals become very small ratios # This is expected behavior when dealing with values close to zero - self.assertRange(intervals["control"][0], (0, 0.0001)) # Lower bound ratio - self.assertRange(intervals["control"][1], (0, 0.005)) # Upper bound ratio - self.assertRange(intervals["test"][0], (0, 0.0001)) # Lower bound ratio - self.assertRange(intervals["test"][1], (0, 0.005)) # Upper bound ratio + self.assertAlmostEqual(intervals["control"][0], 0.00005, delta=0.00005) # Lower bound ratio + self.assertAlmostEqual(intervals["control"][1], 0.0025, delta=0.0025) # Upper bound ratio + self.assertAlmostEqual(intervals["test"][0], 0.00005, delta=0.00005) # Lower bound ratio + self.assertAlmostEqual(intervals["test"][1], 0.0025, delta=0.0025) # Upper bound ratio self.run_test_for_both_implementations(run_test) @@ -510,12 +506,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control at $100 mean - self.assertRange(intervals["control"][0], (98, 102)) - self.assertRange(intervals["control"][1], (98, 102)) + self.assertAlmostEqual(intervals["control"][0], 100, delta=2) + self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Test at $120 mean - self.assertRange(intervals["test"][0], (118, 122)) - self.assertRange(intervals["test"][1], (118, 122)) + self.assertAlmostEqual(intervals["test"][0], 120, delta=2) + self.assertAlmostEqual(intervals["test"][1], 120, delta=2) else: # Original implementation behavior for different exposures self.assertRange(probabilities[1], (0.4, 0.6)) # Close to 50/50 @@ -527,9 +523,9 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Original implementation returns intervals as ratios/multipliers of the mean - self.assertRange(intervals["control"][0], (0.007, 0.009)) - 
self.assertRange(intervals["control"][1], (0.01, 0.02)) - self.assertRange(intervals["test"][0], (0.007, 0.009)) - self.assertRange(intervals["test"][1], (0.01, 0.02)) + self.assertAlmostEqual(intervals["control"][0], 0.008, delta=0.001) + self.assertAlmostEqual(intervals["control"][1], 0.015, delta=0.005) + self.assertAlmostEqual(intervals["test"][0], 0.008, delta=0.001) + self.assertAlmostEqual(intervals["test"][1], 0.015, delta=0.005) self.run_test_for_both_implementations(run_test) From dc8fceb59a9e8767bcf81e9d9b137d10fdfe01f4 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:52:26 -0800 Subject: [PATCH 20/34] Replace use of `assertRange` --- .../test/test_trends_statistics_continuous.py | 75 +++++++++---------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index f292222efbeee..6d3b6a3f199e5 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -38,11 +38,6 @@ def run_test_for_both_implementations(self, test_fn): calculate_credible_intervals=calculate_credible_intervals_v2_continuous, ) - def assertRange(self, value, range: tuple[float, float]): - self.assertTrue( - range[0] <= value <= range[1], f"{value} is not in range {range} (stats version {self.stats_version})" - ) - def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" @@ -63,8 +58,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[0], (0.4, 0.6)) # Close to 50/50 - self.assertRange(probabilities[1], (0.4, 0.6)) # Close to 50/50 + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -77,8 +72,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 115, delta=5) # Upper bound else: # Original implementation behavior for small sample - self.assertRange(probabilities[0], (0.3, 0.7)) - self.assertRange(probabilities[1], (0.3, 0.7)) + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.2) + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.2) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -110,8 +105,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[1], (0.95, 1.0)) # Test variant strongly winning - self.assertRange(probabilities[0], (0.0, 0.05)) # Control variant strongly losing + self.assertAlmostEqual(probabilities[1], 1.0, delta=0.025) + self.assertAlmostEqual(probabilities[0], 0.0, delta=0.025) self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) @@ -124,12 +119,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 120, delta=2) # Upper bound else: # Original implementation behavior for large sample - self.assertRange(probabilities[1], (0.5, 1.0)) # Test 
variant winning - self.assertRange(probabilities[0], (0.0, 0.5)) # Control variant losing + self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25) + self.assertAlmostEqual(probabilities[0], 0.25, delta=0.25) self.assertTrue( significance in [ExperimentSignificanceCode.HIGH_P_VALUE, ExperimentSignificanceCode.SIGNIFICANT] ) - self.assertRange(p_value, (0, 0.3)) + self.assertAlmostEqual(p_value, 0.15, delta=0.15) # Original implementation returns intervals as ratios/multipliers of the mean self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05) # Lower bound less than mean @@ -159,8 +154,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[1], (0.99, 1.0)) # Test variant very strongly winning - self.assertRange(probabilities[0], (0.0, 0.01)) # Control variant very strongly losing + self.assertAlmostEqual(probabilities[1], 1.0, delta=0.005) + self.assertAlmostEqual(probabilities[0], 0.0, delta=0.005) self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) @@ -304,20 +299,20 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control at $100 - self.assertRange(intervals["control"][0], (98, 102)) - self.assertRange(intervals["control"][1], (98, 102)) + self.assertAlmostEqual(intervals["control"][0], 100, delta=2) + self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Test A slightly higher at $105 - self.assertRange(intervals["test_a"][0], (103, 107)) - self.assertRange(intervals["test_a"][1], (103, 107)) + self.assertAlmostEqual(intervals["test_a"][0], 105, delta=2) + self.assertAlmostEqual(intervals["test_a"][1], 105, delta=2) # Test B clearly winning at $150 - self.assertRange(intervals["test_b"][0], (147, 153)) - self.assertRange(intervals["test_b"][1], (147, 153)) + self.assertAlmostEqual(intervals["test_b"][0], 150, delta=3) + self.assertAlmostEqual(intervals["test_b"][1], 150, delta=3) # Test C slightly higher at $110 - self.assertRange(intervals["test_c"][0], (108, 112)) - self.assertRange(intervals["test_c"][1], (108, 112)) + self.assertAlmostEqual(intervals["test_c"][0], 110, delta=2) + self.assertAlmostEqual(intervals["test_c"][1], 110, delta=2) else: # Original implementation behavior for multiple variants with clear winner self.assertTrue(probabilities[2] > 0.5) # test_b should be winning @@ -352,8 +347,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[0], (0.0, 0.5)) # Control has lower probability - self.assertRange(probabilities[1], (0.5, 1.0)) # Test has higher probability + self.assertAlmostEqual(probabilities[0], 0.25, delta=0.25) # Control has lower probability + self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25) # Test has higher probability self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) self.assertEqual(p_value, 1.0) @@ -365,8 +360,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 125, delta=10) else: # Original implementation behavior for insufficient sample size - self.assertRange(probabilities[0], (0.05, 0.1)) - self.assertRange(probabilities[1], (0.85, 1.0)) + self.assertAlmostEqual(probabilities[0], 0.075, delta=0.025) + self.assertAlmostEqual(probabilities[1], 
0.925, delta=0.075) self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) self.assertEqual(p_value, 1.0) @@ -398,8 +393,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[0], (0.4, 0.6)) # Should be close to 50/50 - self.assertRange(probabilities[1], (0.4, 0.6)) # Should be close to 50/50 + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) # Should be close to 50/50 + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) # Should be close to 50/50 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -411,8 +406,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0, delta=0.05) else: # Original implementation behavior for zero means - self.assertRange(probabilities[0], (0.4, 0.6)) - self.assertRange(probabilities[1], (0.4, 0.6)) + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -451,8 +446,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[0], (0.4, 0.6)) # Should be close to 50/50 - self.assertRange(probabilities[1], (0.4, 0.6)) # Should be close to 50/50 + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) # Should be close to 50/50 + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) # Should be close to 50/50 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -465,8 +460,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0.0001, delta=0.00015) # Upper bound else: # Original implementation behavior for near-zero means - self.assertRange(probabilities[0], (0.4, 0.6)) - self.assertRange(probabilities[1], (0.4, 0.6)) + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) @@ -500,8 +495,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertRange(probabilities[0], (0.0, 0.1)) - self.assertRange(probabilities[1], (0.9, 1.0)) + self.assertAlmostEqual(probabilities[0], 0.05, delta=0.05) + self.assertAlmostEqual(probabilities[1], 0.95, delta=0.05) self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) self.assertEqual(p_value, 0) @@ -514,8 +509,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 120, delta=2) else: # Original implementation behavior for different exposures - self.assertRange(probabilities[1], (0.4, 0.6)) # Close to 50/50 - self.assertRange(probabilities[0], (0.4, 0.6)) # Close to 50/50 + self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) # Close to 50/50 + self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) # Close to 50/50 self.assertTrue( significance in [ExperimentSignificanceCode.LOW_WIN_PROBABILITY, ExperimentSignificanceCode.SIGNIFICANT] From 
7dfc17b453200134088597b97bb207e80568c270 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 06:52:46 -0800 Subject: [PATCH 21/34] This test is no longer necessary --- .../test/test_trends_statistics_continuous.py | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 6d3b6a3f199e5..f54c7be8b55b7 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -474,53 +474,3 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0.0025, delta=0.0025) # Upper bound ratio self.run_test_for_both_implementations(run_test) - - def test_different_relative_and_absolute_exposure(self): - """Test that credible intervals are calculated using absolute_exposure rather than relative exposure""" - - def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control_absolute_exposure = 10000 - control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) - test_absolute_exposure = 12000 - test = create_variant( - "test", - mean=120.0, - exposure=test_absolute_exposure / control_absolute_exposure, - absolute_exposure=test_absolute_exposure, - ) - - probabilities = calculate_probabilities(control, [test]) - significance, p_value = are_results_significant(control, [test], probabilities) - intervals = calculate_credible_intervals([control, test]) - - self.assertEqual(len(probabilities), 2) - if stats_version == 2: - self.assertAlmostEqual(probabilities[0], 0.05, delta=0.05) - self.assertAlmostEqual(probabilities[1], 0.95, delta=0.05) - self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) - self.assertEqual(p_value, 0) - - # Control at $100 mean - self.assertAlmostEqual(intervals["control"][0], 100, delta=2) - self.assertAlmostEqual(intervals["control"][1], 100, delta=2) - - # Test at $120 mean - self.assertAlmostEqual(intervals["test"][0], 120, delta=2) - self.assertAlmostEqual(intervals["test"][1], 120, delta=2) - else: - # Original implementation behavior for different exposures - self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) # Close to 50/50 - self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) # Close to 50/50 - self.assertTrue( - significance - in [ExperimentSignificanceCode.LOW_WIN_PROBABILITY, ExperimentSignificanceCode.SIGNIFICANT] - ) - self.assertEqual(p_value, 1) - - # Original implementation returns intervals as ratios/multipliers of the mean - self.assertAlmostEqual(intervals["control"][0], 0.008, delta=0.001) - self.assertAlmostEqual(intervals["control"][1], 0.015, delta=0.005) - self.assertAlmostEqual(intervals["test"][0], 0.008, delta=0.001) - self.assertAlmostEqual(intervals["test"][1], 0.015, delta=0.005) - - self.run_test_for_both_implementations(run_test) From 9f43ba77efbf2ba0baa02dd94df517759fdb29fd Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 07:27:41 -0800 Subject: [PATCH 22/34] Use correct `absolute_exposure` values --- .../test/test_trends_statistics_count.py | 128 ++++++++++++++---- 1 file changed, 104 insertions(+), 24 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py 
b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index aad0b72c6a9fa..08748f8edd480 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -13,8 +13,10 @@ from posthog.test.base import APIBaseTest -def create_variant(key: str, count: int, exposure: int) -> ExperimentVariantTrendsBaseStats: - return ExperimentVariantTrendsBaseStats(key=key, count=count, exposure=exposure, absolute_exposure=exposure) +def create_variant(key: str, count: int, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats: + return ExperimentVariantTrendsBaseStats( + key=key, count=count, exposure=exposure, absolute_exposure=absolute_exposure + ) def create_variant_with_different_exposures( @@ -50,8 +52,15 @@ def test_small_sample_two_variants_not_significant(self): """Test with small sample size, two variants, no clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=10, exposure=100) - test = create_variant("test", count=11, exposure=100) + control_absolute_exposure = 100 + control = create_variant("control", count=10, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 100 + test = create_variant( + "test", + count=11, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -77,8 +86,15 @@ def test_large_sample_two_variants_significant(self): """Test with large sample size, two variants, clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=1000, exposure=10000) - test = create_variant("test", count=1200, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + count=1200, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -107,8 +123,15 @@ def test_large_sample_two_variants_strongly_significant(self): """Test with large sample size, two variants, very clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=1000, exposure=10000) - test = create_variant("test", count=1500, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + count=1500, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -137,10 +160,29 @@ def test_many_variants_not_significant(self): """Test with multiple variants, no clear winner""" def run_test(stats_version, calculate_probabilities, 
are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=100, exposure=1000) - test_a = create_variant("test_a", count=98, exposure=1000) - test_b = create_variant("test_b", count=102, exposure=1000) - test_c = create_variant("test_c", count=101, exposure=1000) + control_absolute_exposure = 1000 + control = create_variant("control", count=100, exposure=1, absolute_exposure=control_absolute_exposure) + test_a_absolute_exposure = 1000 + test_a = create_variant( + "test_a", + count=98, + exposure=test_a_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_a_absolute_exposure, + ) + test_b_absolute_exposure = 1000 + test_b = create_variant( + "test_b", + count=102, + exposure=test_b_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_b_absolute_exposure, + ) + test_c_absolute_exposure = 1000 + test_c = create_variant( + "test_c", + count=101, + exposure=test_c_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_c_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) @@ -170,10 +212,29 @@ def test_many_variants_significant(self): """Test with multiple variants, one clear winner""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=1000, exposure=10000) - test_a = create_variant("test_a", count=1050, exposure=10000) - test_b = create_variant("test_b", count=1500, exposure=10000) - test_c = create_variant("test_c", count=1100, exposure=10000) + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) + test_a_absolute_exposure = 10000 + test_a = create_variant( + "test_a", + count=1050, + exposure=test_a_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_a_absolute_exposure, + ) + test_b_absolute_exposure = 10000 + test_b = create_variant( + "test_b", + count=1500, + exposure=test_b_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_b_absolute_exposure, + ) + test_c_absolute_exposure = 10000 + test_c = create_variant( + "test_c", + count=1100, + exposure=test_c_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_c_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test_a, test_b, test_c]) significance, p_value = are_results_significant(control, [test_a, test_b, test_c], probabilities) @@ -211,8 +272,15 @@ def test_insufficient_sample_size(self): """Test with sample size below threshold""" def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=5, exposure=50) - test = create_variant("test", count=8, exposure=50) + control_absolute_exposure = 50 + control = create_variant("control", count=5, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 50 + test = create_variant( + "test", + count=8, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -241,8 +309,15 @@ def test_edge_cases(self): """Test edge cases like zero counts""" def run_test(stats_version, 
calculate_probabilities, are_results_significant, calculate_credible_intervals): - control = create_variant("control", count=0, exposure=1000) - test = create_variant("test", count=0, exposure=1000) + control_absolute_exposure = 1000 + control = create_variant("control", count=0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 1000 + test = create_variant( + "test", + count=0, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) @@ -268,11 +343,16 @@ def test_different_relative_and_absolute_exposure(self): def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): # Control has exposure=1 (relative) but absolute_exposure=10000 - control = create_variant_with_different_exposures( - "control", count=1000, exposure=1, absolute_exposure=10000 - ) + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) # Test has exposure=1.2 (relative) but absolute_exposure=12000 - test = create_variant_with_different_exposures("test", count=1200, exposure=1.2, absolute_exposure=12000) + test_absolute_exposure = 12000 + test = create_variant( + "test", + count=1200, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) probabilities = calculate_probabilities(control, [test]) significance, p_value = are_results_significant(control, [test], probabilities) From 3a6d4073798bedcf03827c0e521a30760308ef53 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 11:30:49 -0800 Subject: [PATCH 23/34] Drop duplicative test --- .../test/test_trends_statistics_count.py | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index 08748f8edd480..c53d65c066d72 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -337,39 +337,3 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0.004, places=3) self.run_test_for_both_implementations(run_test) - - def test_different_relative_and_absolute_exposure(self): - """Test that credible intervals are calculated using absolute_exposure rather than relative exposure""" - - def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): - # Control has exposure=1 (relative) but absolute_exposure=10000 - control_absolute_exposure = 10000 - control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) - # Test has exposure=1.2 (relative) but absolute_exposure=12000 - test_absolute_exposure = 12000 - test = create_variant( - "test", - count=1200, - exposure=test_absolute_exposure / control_absolute_exposure, - absolute_exposure=test_absolute_exposure, - ) - - probabilities = calculate_probabilities(control, [test]) - significance, p_value = are_results_significant(control, [test], probabilities) - intervals = calculate_credible_intervals([control, test]) - - self.assertEqual(len(probabilities), 2) - self.assertTrue(0.4 < 
probabilities[0] < 0.6) # Close to 50/50 - self.assertTrue(0.4 < probabilities[1] < 0.6) # Close to 50/50 - self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) - self.assertEqual(p_value, 1) - - # Control at ~10% conversion rate - self.assertAlmostEqual(intervals["control"][0], 0.094, places=2) - self.assertAlmostEqual(intervals["control"][1], 0.106, places=2) - - # Test at ~10% conversion rate - self.assertAlmostEqual(intervals["test"][0], 0.094, places=2) - self.assertAlmostEqual(intervals["test"][1], 0.106, places=2) - - self.run_test_for_both_implementations(run_test) From 3d2e24cbef9054cd6901fe17c27022b8760ca4ef Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 11:39:29 -0800 Subject: [PATCH 24/34] Add a test case with real world data --- .../test/test_trends_statistics_count.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index c53d65c066d72..09d8051960d65 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -268,6 +268,41 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.run_test_for_both_implementations(run_test) + def test_real_world_data_1(self): + """Test with multiple variants, one clear winner""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control_absolute_exposure = 2608 + control = create_variant("control", count=269, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 2615 + test = create_variant( + "test", + count=314, + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) + + probabilities = calculate_probabilities(control, [test]) + significance, p_value = are_results_significant(control, [test], probabilities) + intervals = calculate_credible_intervals([control, test]) + self.assertEqual(len(probabilities), 2) + self.assertAlmostEqual(probabilities[1], 0.966, places=2) # test should be winning + self.assertAlmostEqual(probabilities[0], 0.034, places=2) # control should be losing + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertEqual(p_value, 0) + else: + self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE) + self.assertAlmostEqual(p_value, 0.07, delta=0.01) + + self.assertAlmostEqual(intervals["control"][0], 0.094, delta=0.01) + self.assertAlmostEqual(intervals["control"][1], 0.116, delta=0.01) + + self.assertAlmostEqual(intervals["test"][0], 0.107, delta=0.01) + self.assertAlmostEqual(intervals["test"][1], 0.129, delta=0.01) + + self.run_test_for_both_implementations(run_test) + def test_insufficient_sample_size(self): """Test with sample size below threshold""" From 2ea703dade7c839df8f9f058be072ca0a003bb6e Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Fri, 13 Dec 2024 13:53:39 -0800 Subject: [PATCH 25/34] Use a standard `LOG_VARIANCE` between methods --- .../test/test_trends_statistics_continuous.py | 68 +++++++++---------- .../trends_statistics_v2_continuous.py | 10 +-- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py 
b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index f54c7be8b55b7..ae5bf249671ce 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -58,18 +58,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(len(probabilities), 2) if stats_version == 2: - self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1) - self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1) + self.assertAlmostEqual(probabilities[0], 0.4, delta=0.1) + self.assertAlmostEqual(probabilities[1], 0.6, delta=0.1) self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY) self.assertEqual(p_value, 1) # Control: ~$100 mean with wide interval due to small sample - self.assertAlmostEqual(intervals["control"][0], 85, delta=5) # Lower bound - self.assertAlmostEqual(intervals["control"][1], 110, delta=5) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 72, delta=5) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 128, delta=5) # Upper bound # Test: ~$105 mean with wide interval due to small sample - self.assertAlmostEqual(intervals["test"][0], 90, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test"][1], 115, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 75, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 130, delta=5) # Upper bound else: # Original implementation behavior for small sample self.assertAlmostEqual(probabilities[0], 0.5, delta=0.2) @@ -111,12 +111,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control: $100 mean with narrow interval due to large sample - self.assertAlmostEqual(intervals["control"][0], 100, delta=2) # Lower bound - self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 97, delta=2) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 103, delta=2) # Upper bound # Test: $120 mean with narrow interval due to large sample - self.assertAlmostEqual(intervals["test"][0], 120, delta=2) # Lower bound - self.assertAlmostEqual(intervals["test"][1], 120, delta=2) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 116, delta=2) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 124, delta=2) # Upper bound else: # Original implementation behavior for large sample self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25) @@ -160,12 +160,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control: $100 mean - self.assertAlmostEqual(intervals["control"][0], 100, delta=2) # Lower bound - self.assertAlmostEqual(intervals["control"][1], 100, delta=2) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 97, delta=2) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 103, delta=2) # Upper bound # Test: $150 mean, clearly higher than control - self.assertAlmostEqual(intervals["test"][0], 150, delta=3) # Lower bound - self.assertAlmostEqual(intervals["test"][1], 150, delta=3) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 146, delta=3) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 154, delta=3) # Upper bound else: # Original implementation behavior for strongly significant case self.assertTrue(probabilities[1] > 0.5) # Test variant 
winning @@ -219,20 +219,20 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # All variants around $100 with overlapping intervals # Control variant - self.assertAlmostEqual(intervals["control"][0], 95, delta=5) # Lower bound - self.assertAlmostEqual(intervals["control"][1], 105, delta=5) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 90, delta=5) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 110, delta=5) # Upper bound # Test A variant - self.assertAlmostEqual(intervals["test_a"][0], 95, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_a"][1], 105, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_a"][0], 90, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_a"][1], 110, delta=5) # Upper bound # Test B variant - self.assertAlmostEqual(intervals["test_b"][0], 95, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_b"][1], 105, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_b"][0], 90, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_b"][1], 110, delta=5) # Upper bound # Test C variant - self.assertAlmostEqual(intervals["test_c"][0], 95, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_c"][1], 105, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_c"][0], 90, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_c"][1], 110, delta=5) # Upper bound else: # Original implementation behavior for multiple variants with no clear winner self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -299,20 +299,20 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control at $100 - self.assertAlmostEqual(intervals["control"][0], 100, delta=2) - self.assertAlmostEqual(intervals["control"][1], 100, delta=2) + self.assertAlmostEqual(intervals["control"][0], 97, delta=1) + self.assertAlmostEqual(intervals["control"][1], 103, delta=1) # Test A slightly higher at $105 - self.assertAlmostEqual(intervals["test_a"][0], 105, delta=2) - self.assertAlmostEqual(intervals["test_a"][1], 105, delta=2) + self.assertAlmostEqual(intervals["test_a"][0], 102, delta=1) + self.assertAlmostEqual(intervals["test_a"][1], 108, delta=1) # Test B clearly winning at $150 - self.assertAlmostEqual(intervals["test_b"][0], 150, delta=3) - self.assertAlmostEqual(intervals["test_b"][1], 150, delta=3) + self.assertAlmostEqual(intervals["test_b"][0], 146, delta=1) + self.assertAlmostEqual(intervals["test_b"][1], 154, delta=1) # Test C slightly higher at $110 - self.assertAlmostEqual(intervals["test_c"][0], 110, delta=2) - self.assertAlmostEqual(intervals["test_c"][1], 110, delta=2) + self.assertAlmostEqual(intervals["test_c"][0], 106, delta=1) + self.assertAlmostEqual(intervals["test_c"][1], 114, delta=1) else: # Original implementation behavior for multiple variants with clear winner self.assertTrue(probabilities[2] > 0.5) # test_b should be winning @@ -353,11 +353,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1.0) # Both variants should have wide intervals due to small sample size - self.assertAlmostEqual(intervals["control"][0], 80, delta=10) - self.assertAlmostEqual(intervals["control"][1], 110, delta=10) + self.assertAlmostEqual(intervals["control"][0], 62, delta=10) + self.assertAlmostEqual(intervals["control"][1], 138, delta=10) - self.assertAlmostEqual(intervals["test"][0], 95, delta=10) - 
self.assertAlmostEqual(intervals["test"][1], 125, delta=10) + self.assertAlmostEqual(intervals["test"][0], 75, delta=10) + self.assertAlmostEqual(intervals["test"][1], 160, delta=10) else: # Original implementation behavior for insufficient sample size self.assertAlmostEqual(probabilities[0], 0.075, delta=0.025) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index 384fa90496e3a..2763e8ca2a05a 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -11,6 +11,8 @@ ALPHA_0 = 1.0 # Prior shape for variance BETA_0 = 1.0 # Prior scale for variance +LOG_VARIANCE = 2 + SAMPLE_SIZE = 10000 EPSILON = 1e-10 # Small epsilon value to handle zeros @@ -53,13 +55,12 @@ def calculate_probabilities_v2_continuous( # Calculate posterior parameters for control log_control_mean = np.log(control_variant.count + EPSILON) # Using count field to store mean value - log_variance = 2 # Assumed variance in log-space # Update parameters for control kappa_n_control = KAPPA_0 + control_variant.absolute_exposure mu_n_control = (KAPPA_0 * MU_0 + control_variant.absolute_exposure * log_control_mean) / kappa_n_control alpha_n_control = ALPHA_0 + control_variant.absolute_exposure / 2 - beta_n_control = BETA_0 + 0.5 * control_variant.absolute_exposure * log_variance + beta_n_control = BETA_0 + 0.5 * control_variant.absolute_exposure * LOG_VARIANCE # Draw samples from control posterior control_posterior = t( @@ -75,7 +76,7 @@ def calculate_probabilities_v2_continuous( kappa_n_test = KAPPA_0 + test.absolute_exposure mu_n_test = (KAPPA_0 * MU_0 + test.absolute_exposure * log_test_mean) / kappa_n_test alpha_n_test = ALPHA_0 + test.absolute_exposure / 2 - beta_n_test = BETA_0 + 0.5 * test.absolute_exposure * log_variance + beta_n_test = BETA_0 + 0.5 * test.absolute_exposure * LOG_VARIANCE test_posterior = t( df=2 * alpha_n_test, loc=mu_n_test, scale=np.sqrt(beta_n_test / (kappa_n_test * alpha_n_test)) @@ -166,13 +167,12 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe try: # Log-transform the mean value, adding epsilon to handle zeros log_mean = np.log(variant.count + EPSILON) # Using count field to store mean value - log_variance = 0.25 # Calculate posterior parameters using absolute_exposure kappa_n = KAPPA_0 + variant.absolute_exposure mu_n = (KAPPA_0 * MU_0 + variant.absolute_exposure * log_mean) / kappa_n alpha_n = ALPHA_0 + variant.absolute_exposure / 2 - beta_n = BETA_0 + 0.5 * variant.absolute_exposure * log_variance + beta_n = BETA_0 + 0.5 * variant.absolute_exposure * LOG_VARIANCE # Create posterior distribution posterior = t(df=2 * alpha_n, loc=mu_n, scale=np.sqrt(beta_n / (kappa_n * alpha_n))) From 858d7b426c057344c098c18dbc735c9a09ec637d Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Mon, 16 Dec 2024 04:15:18 -0800 Subject: [PATCH 26/34] Provide a helpful message when the tests fail --- .../test/test_experiment_trends_query_runner.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py index cc18a27bd702d..129c313234ef3 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py 
@@ -907,7 +907,11 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self) # Assert the expected join condition in the clickhouse SQL expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_8)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_9)s, 6, %(hogql_val_10)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_11)s, 6, %(hogql_val_12)s))))) AS e__events ON" - self.assertIn(expected_join_condition, str(response.clickhouse)) + self.assertIn( + expected_join_condition, + str(response.clickhouse), + "Please make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. The assertion may also fail if the hogql_val_* numbers have changed.", + ) result = query_runner.calculate() @@ -1004,7 +1008,11 @@ def test_query_runner_with_data_warehouse_series_expected_query(self): # Assert the expected join condition in the clickhouse SQL expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_7)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_8)s, 6, %(hogql_val_9)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_10)s, 6, %(hogql_val_11)s))))) AS e__events ON" - self.assertIn(expected_join_condition, str(response.clickhouse)) + self.assertIn( + expected_join_condition, + str(response.clickhouse), + "Please make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. The assertion may also fail if the hogql_val_* numbers have changed.", + ) result = query_runner.calculate() From 00a4d64b761b95e7053de043a600974480dd320f Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Mon, 16 Dec 2024 04:16:30 -0800 Subject: [PATCH 27/34] Revert "Provide a helpful message when the tests fail" This reverts commit 858d7b426c057344c098c18dbc735c9a09ec637d. --- .../test/test_experiment_trends_query_runner.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py index 129c313234ef3..cc18a27bd702d 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py @@ -907,11 +907,7 @@ def test_query_runner_with_data_warehouse_series_no_end_date_and_nested_id(self) # Assert the expected join condition in the clickhouse SQL expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_8)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_9)s, 6, %(hogql_val_10)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_11)s, 6, %(hogql_val_12)s))))) AS e__events ON" - self.assertIn( - expected_join_condition, - str(response.clickhouse), - "Please make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. 
The assertion may also fail if the hogql_val_* numbers have changed.", - ) + self.assertIn(expected_join_condition, str(response.clickhouse)) result = query_runner.calculate() @@ -1008,11 +1004,7 @@ def test_query_runner_with_data_warehouse_series_expected_query(self): # Assert the expected join condition in the clickhouse SQL expected_join_condition = f"and(equals(events.team_id, {query_runner.count_query_runner.team.id}), equals(event, %(hogql_val_7)s), greaterOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_8)s, 6, %(hogql_val_9)s))), lessOrEquals(timestamp, assumeNotNull(parseDateTime64BestEffortOrNull(%(hogql_val_10)s, 6, %(hogql_val_11)s))))) AS e__events ON" - self.assertIn( - expected_join_condition, - str(response.clickhouse), - "Please make sure the timestamp statements are included in the ASOF LEFT JOIN select statement. The assertion may also fail if the hogql_val_* numbers have changed.", - ) + self.assertIn(expected_join_condition, str(response.clickhouse)) result = query_runner.calculate() From 4e81ea427c9a21a7c2045f314e04af72413eed3c Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Tue, 17 Dec 2024 05:02:41 -0800 Subject: [PATCH 28/34] Use consistent constant names --- .../experiments/trends_statistics_v2_count.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py index 0a4802e3fc7ab..710be1a46aac2 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py @@ -7,9 +7,8 @@ import numpy as np # Prior parameters (minimal prior knowledge) -PRIOR_ALPHA = 1 -PRIOR_BETA = 1 - +ALPHA_0 = 1 +BETA_0 = 1 SAMPLE_SIZE = 10000 @@ -56,8 +55,8 @@ def calculate_probabilities_v2_count( raise ValidationError("Can't calculate experiment results for less than 2 variants", code="no_data") # Calculate posterior parameters for control - alpha_control = PRIOR_ALPHA + control_variant.count - beta_control = PRIOR_BETA + control_variant.absolute_exposure + alpha_control = ALPHA_0 + control_variant.count + beta_control = BETA_0 + control_variant.absolute_exposure # Draw samples from control posterior samples_control = gamma.rvs(alpha_control, scale=1 / beta_control, size=SAMPLE_SIZE) @@ -65,8 +64,8 @@ def calculate_probabilities_v2_count( # Draw samples for each test variant test_samples = [] for test in test_variants: - alpha_test = PRIOR_ALPHA + test.count - beta_test = PRIOR_BETA + test.absolute_exposure + alpha_test = ALPHA_0 + test.count + beta_test = BETA_0 + test.absolute_exposure test_samples.append(gamma.rvs(alpha_test, scale=1 / beta_test, size=SAMPLE_SIZE)) # Calculate probabilities @@ -187,8 +186,8 @@ def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bou for variant in variants: try: # Calculate posterior parameters using absolute_exposure - alpha_posterior = PRIOR_ALPHA + variant.count - beta_posterior = PRIOR_BETA + variant.absolute_exposure + alpha_posterior = ALPHA_0 + variant.count + beta_posterior = BETA_0 + variant.absolute_exposure # Calculate credible intervals using the posterior distribution credible_interval = gamma.ppf([lower_bound, upper_bound], alpha_posterior, scale=1 / beta_posterior) From f34bea52af0f936f9c7a03409bee080aa6f8c9cb Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Tue, 17 Dec 2024 14:14:33 -0800 Subject: [PATCH 29/34] Use the log variance provided 
by Anders --- .../test/test_trends_statistics_continuous.py | 42 +++++++++---------- .../trends_statistics_v2_continuous.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index ae5bf249671ce..0ed9d43a5684a 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -64,12 +64,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 1) # Control: ~$100 mean with wide interval due to small sample - self.assertAlmostEqual(intervals["control"][0], 72, delta=5) # Lower bound - self.assertAlmostEqual(intervals["control"][1], 128, delta=5) # Upper bound + self.assertAlmostEqual(intervals["control"][0], 80, delta=5) # Lower bound + self.assertAlmostEqual(intervals["control"][1], 114, delta=5) # Upper bound # Test: ~$105 mean with wide interval due to small sample - self.assertAlmostEqual(intervals["test"][0], 75, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test"][1], 130, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test"][0], 80, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test"][1], 120, delta=5) # Upper bound else: # Original implementation behavior for small sample self.assertAlmostEqual(probabilities[0], 0.5, delta=0.2) @@ -116,7 +116,7 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Test: $120 mean with narrow interval due to large sample self.assertAlmostEqual(intervals["test"][0], 116, delta=2) # Lower bound - self.assertAlmostEqual(intervals["test"][1], 124, delta=2) # Upper bound + self.assertAlmostEqual(intervals["test"][1], 122, delta=2) # Upper bound else: # Original implementation behavior for large sample self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25) @@ -224,15 +224,15 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Test A variant self.assertAlmostEqual(intervals["test_a"][0], 90, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_a"][1], 110, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_a"][1], 102, delta=5) # Upper bound # Test B variant - self.assertAlmostEqual(intervals["test_b"][0], 90, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_b"][1], 110, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_b"][0], 96, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_b"][1], 108, delta=5) # Upper bound # Test C variant - self.assertAlmostEqual(intervals["test_c"][0], 90, delta=5) # Lower bound - self.assertAlmostEqual(intervals["test_c"][1], 110, delta=5) # Upper bound + self.assertAlmostEqual(intervals["test_c"][0], 95, delta=5) # Lower bound + self.assertAlmostEqual(intervals["test_c"][1], 105, delta=5) # Upper bound else: # Original implementation behavior for multiple variants with no clear winner self.assertTrue(all(0.1 < p < 0.9 for p in probabilities)) @@ -299,20 +299,20 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertEqual(p_value, 0) # Control at $100 - self.assertAlmostEqual(intervals["control"][0], 97, delta=1) - self.assertAlmostEqual(intervals["control"][1], 103, delta=1) + self.assertAlmostEqual(intervals["control"][0], 98, delta=1) + self.assertAlmostEqual(intervals["control"][1], 
102, delta=1) # Test A slightly higher at $105 - self.assertAlmostEqual(intervals["test_a"][0], 102, delta=1) - self.assertAlmostEqual(intervals["test_a"][1], 108, delta=1) + self.assertAlmostEqual(intervals["test_a"][0], 103, delta=1) + self.assertAlmostEqual(intervals["test_a"][1], 107, delta=1) # Test B clearly winning at $150 - self.assertAlmostEqual(intervals["test_b"][0], 146, delta=1) - self.assertAlmostEqual(intervals["test_b"][1], 154, delta=1) + self.assertAlmostEqual(intervals["test_b"][0], 147, delta=1) + self.assertAlmostEqual(intervals["test_b"][1], 153, delta=1) # Test C slightly higher at $110 - self.assertAlmostEqual(intervals["test_c"][0], 106, delta=1) - self.assertAlmostEqual(intervals["test_c"][1], 114, delta=1) + self.assertAlmostEqual(intervals["test_c"][0], 108, delta=1) + self.assertAlmostEqual(intervals["test_c"][1], 112, delta=1) else: # Original implementation behavior for multiple variants with clear winner self.assertTrue(probabilities[2] > 0.5) # test_b should be winning @@ -354,10 +354,10 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca # Both variants should have wide intervals due to small sample size self.assertAlmostEqual(intervals["control"][0], 62, delta=10) - self.assertAlmostEqual(intervals["control"][1], 138, delta=10) + self.assertAlmostEqual(intervals["control"][1], 117, delta=10) - self.assertAlmostEqual(intervals["test"][0], 75, delta=10) - self.assertAlmostEqual(intervals["test"][1], 160, delta=10) + self.assertAlmostEqual(intervals["test"][0], 85, delta=10) + self.assertAlmostEqual(intervals["test"][1], 140, delta=10) else: # Original implementation behavior for insufficient sample size self.assertAlmostEqual(probabilities[0], 0.075, delta=0.025) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index 2763e8ca2a05a..78d10ec9ace81 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -11,7 +11,7 @@ ALPHA_0 = 1.0 # Prior shape for variance BETA_0 = 1.0 # Prior scale for variance -LOG_VARIANCE = 2 +LOG_VARIANCE = 0.75 SAMPLE_SIZE = 10000 EPSILON = 1e-10 # Small epsilon value to handle zeros From ecaed9ae129910bc0b50b3151908d0b071a43d99 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 19 Dec 2024 14:23:34 -0800 Subject: [PATCH 30/34] Implement expected loss for trend count --- .../test/test_trends_statistics_count.py | 60 +++++++++++++- .../experiments/trends_statistics_v2_count.py | 82 +++++++++++++++---- 2 files changed, 126 insertions(+), 16 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py index 09d8051960d65..ee5cf1502492f 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py @@ -290,7 +290,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(probabilities[0], 0.034, places=2) # control should be losing if stats_version == 2: self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) - self.assertEqual(p_value, 0) + self.assertLess(p_value, 0.01) + self.assertGreater(p_value, 0.0) else: self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE) self.assertAlmostEqual(p_value, 0.07, 
delta=0.01) @@ -372,3 +373,60 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0.004, places=3) self.run_test_for_both_implementations(run_test) + + def test_expected_loss_minimal_difference(self): + """Test expected loss when variants have very similar performance""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + count=1075, # Slightly higher count + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should be relatively small + self.assertLess(expected_loss, 0.03) # Less than 3% expected loss + self.assertGreater(expected_loss, 0) # But still some loss + else: + # Original implementation behavior (returns p_value in expected_loss) + self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE) + self.assertAlmostEqual(expected_loss, 0.1, delta=0.1) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_test_variant_clear_winner(self): + """Test expected loss when one variant is clearly better""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control_absolute_exposure = 10000 + control = create_variant("control", count=1000, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + count=2000, # Much higher count + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should be very close to zero since test is clearly better + self.assertLess(expected_loss, 0.001) # Essentially zero loss + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(expected_loss, 0.001) + + self.run_test_for_both_implementations(run_test) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py index 710be1a46aac2..ec242ec3695f3 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py @@ -1,6 +1,10 @@ from rest_framework.exceptions import ValidationError from sentry_sdk import capture_exception -from posthog.hogql_queries.experiments import FF_DISTRIBUTION_THRESHOLD, MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.hogql_queries.experiments import ( + EXPECTED_LOSS_SIGNIFICANCE_LEVEL, + FF_DISTRIBUTION_THRESHOLD, + MIN_PROBABILITY_FOR_SIGNIFICANCE, +) from posthog.hogql_queries.experiments.funnels_statistics import Probability from posthog.schema import ExperimentSignificanceCode, 
ExperimentVariantTrendsBaseStats from scipy.stats import gamma @@ -98,6 +102,7 @@ def are_results_significant_v2_count( is significantly better than the others. The method: 1. Checks if sample sizes meet minimum threshold requirements 2. Evaluates win probabilities from the posterior distributions + 3. Calculates expected loss for the winning variant Parameters: ----------- @@ -111,17 +116,8 @@ def are_results_significant_v2_count( Returns: -------- tuple[ExperimentSignificanceCode, Probability] - - ExperimentSignificanceCode indicating the significance status: - NOT_ENOUGH_EXPOSURE: Insufficient sample size - LOW_WIN_PROBABILITY: No variant has a high enough probability of being best - SIGNIFICANT: Clear winner with high probability of being best - - Probability value (1.0 for NOT_ENOUGH_EXPOSURE and LOW_WIN_PROBABILITY, 0.0 for SIGNIFICANT) - - Notes: - ------ - - Uses a Bayesian approach to determine significance - - Does not use credible interval comparisons - - p_value is a placeholder (1.0 or 0.0) to indicate significance status + - ExperimentSignificanceCode indicating the significance status + - Expected loss value for significant results, 1.0 for non-significant results """ # Check exposure thresholds for variant in test_variants: @@ -135,10 +131,22 @@ def are_results_significant_v2_count( max_probability = max(probabilities) # Check if any variant has a high enough probability of being best - if max_probability < MIN_PROBABILITY_FOR_SIGNIFICANCE: - return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE: + # Find best performing variant + all_variants = [control_variant, *test_variants] + rates = [v.count / v.absolute_exposure for v in all_variants] + best_idx = np.argmax(rates) + best_variant = all_variants[best_idx] + other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :] + + expected_loss = calculate_expected_loss_v2_count(best_variant, other_variants) + + if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL: + return ExperimentSignificanceCode.HIGH_LOSS, expected_loss + + return ExperimentSignificanceCode.SIGNIFICANT, expected_loss - return ExperimentSignificanceCode.SIGNIFICANT, 0.0 + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bound=0.975): @@ -201,3 +209,47 @@ def calculate_credible_intervals_v2_count(variants, lower_bound=0.025, upper_bou return {} return intervals + + +def calculate_expected_loss_v2_count( + target_variant: ExperimentVariantTrendsBaseStats, variants: list[ExperimentVariantTrendsBaseStats] +) -> float: + """ + Calculates expected loss in count/rate using Gamma-Poisson conjugate prior. + + This implementation uses a Bayesian approach with Gamma-Poisson model + to estimate the expected loss when choosing the target variant over others. 
+ + Parameters: + ----------- + target_variant : ExperimentVariantTrendsBaseStats + The variant being evaluated for loss + variants : list[ExperimentVariantTrendsBaseStats] + List of other variants to compare against + + Returns: + -------- + float + Expected loss in rate if choosing the target variant + """ + # Calculate posterior parameters for target variant + target_alpha = ALPHA_0 + target_variant.count + target_beta = BETA_0 + target_variant.absolute_exposure + + # Draw samples from target variant's Gamma posterior + target_samples = gamma.rvs(target_alpha, scale=1 / target_beta, size=SAMPLE_SIZE) + + # Draw samples from each comparison variant's Gamma posterior + variant_samples = [] + for variant in variants: + alpha = ALPHA_0 + variant.count + beta = BETA_0 + variant.absolute_exposure + samples = gamma.rvs(alpha, scale=1 / beta, size=SAMPLE_SIZE) + variant_samples.append(samples) + + # Calculate loss + variant_max = np.maximum.reduce(variant_samples) + losses = np.maximum(0, variant_max - target_samples) + expected_loss = float(np.mean(losses)) + + return expected_loss From 17ea023f733c984b2ca64ac6a679c21be5f75a94 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 19 Dec 2024 14:44:48 -0800 Subject: [PATCH 31/34] Apply expected loss to trends continuous --- .../test/test_trends_statistics_continuous.py | 57 ++++++++++++ .../trends_statistics_v2_continuous.py | 88 +++++++++++++++++-- 2 files changed, 140 insertions(+), 5 deletions(-) diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py index 0ed9d43a5684a..a2b3c0a54fa7e 100644 --- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py +++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py @@ -474,3 +474,60 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca self.assertAlmostEqual(intervals["test"][1], 0.0025, delta=0.0025) # Upper bound ratio self.run_test_for_both_implementations(run_test) + + def test_expected_loss_minimal_difference(self): + """Test expected loss when variants have very similar performance""" + + def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals): + control_absolute_exposure = 600 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 600 + test = create_variant( + "test", + mean=120.0, # Slightly higher mean + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should be relatively small + self.assertLess(expected_loss, 3.0) # Less than $3 expected loss + self.assertGreater(expected_loss, 0) # But still some loss + else: + # Original implementation behavior (returns p_value in expected_loss) + self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE) + self.assertAlmostEqual(expected_loss, 0.2, delta=0.1) + + self.run_test_for_both_implementations(run_test) + + def test_expected_loss_test_variant_clear_winner(self): + """Test expected loss when one variant is clearly better""" + + def run_test(stats_version, calculate_probabilities, 
are_results_significant, calculate_credible_intervals): + control_absolute_exposure = 10000 + control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure) + test_absolute_exposure = 10000 + test = create_variant( + "test", + mean=200.0, # Much higher mean + exposure=test_absolute_exposure / control_absolute_exposure, + absolute_exposure=test_absolute_exposure, + ) + + probabilities = calculate_probabilities(control, [test]) + significance, expected_loss = are_results_significant(control, [test], probabilities) + + if stats_version == 2: + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + # Expected loss should be very close to zero since test is clearly better + self.assertLess(expected_loss, 0.1) # Essentially zero loss + else: + # Original implementation behavior + self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT) + self.assertLess(expected_loss, 0.001) + + self.run_test_for_both_implementations(run_test) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py index 78d10ec9ace81..c0894302143ec 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py @@ -1,6 +1,10 @@ from rest_framework.exceptions import ValidationError from sentry_sdk import capture_exception -from posthog.hogql_queries.experiments import FF_DISTRIBUTION_THRESHOLD, MIN_PROBABILITY_FOR_SIGNIFICANCE +from posthog.hogql_queries.experiments import ( + FF_DISTRIBUTION_THRESHOLD, + MIN_PROBABILITY_FOR_SIGNIFICANCE, + EXPECTED_LOSS_SIGNIFICANCE_LEVEL, +) from posthog.schema import ExperimentSignificanceCode, ExperimentVariantTrendsBaseStats from scipy.stats import t import numpy as np @@ -123,7 +127,7 @@ def are_results_significant_v2_continuous( -------- tuple[ExperimentSignificanceCode, float] - ExperimentSignificanceCode indicating the significance status - - Probability value + - Expected loss value for significant results, 1.0 for non-significant results """ # Check exposure thresholds for variant in test_variants: @@ -137,10 +141,22 @@ def are_results_significant_v2_continuous( max_probability = max(probabilities) # Check if any variant has a high enough probability of being best - if max_probability < MIN_PROBABILITY_FOR_SIGNIFICANCE: - return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 + if max_probability >= MIN_PROBABILITY_FOR_SIGNIFICANCE: + # Find best performing variant + all_variants = [control_variant, *test_variants] + means = [v.count for v in all_variants] # count field stores mean value + best_idx = np.argmax(means) + best_variant = all_variants[best_idx] + other_variants = all_variants[:best_idx] + all_variants[best_idx + 1 :] - return ExperimentSignificanceCode.SIGNIFICANT, 0.0 + expected_loss = calculate_expected_loss_v2_continuous(best_variant, other_variants) + + if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL: + return ExperimentSignificanceCode.HIGH_LOSS, expected_loss + + return ExperimentSignificanceCode.SIGNIFICANT, expected_loss + + return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1.0 def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, upper_bound=0.975): @@ -193,3 +209,65 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe return {} return intervals + + +def calculate_expected_loss_v2_continuous( + target_variant: 
ExperimentVariantTrendsBaseStats, variants: list[ExperimentVariantTrendsBaseStats] +) -> float: + """ + Calculates expected loss in mean value using Normal-Inverse-Gamma conjugate prior. + + This implementation uses a Bayesian approach with Normal-Inverse-Gamma model + to estimate the expected loss when choosing the target variant over others. + The data is log-transformed to handle typical revenue/continuous metric distributions. + + Parameters: + ----------- + target_variant : ExperimentVariantTrendsBaseStats + The variant being evaluated for loss + variants : list[ExperimentVariantTrendsBaseStats] + List of other variants to compare against + + Returns: + -------- + float + Expected loss in mean value if choosing the target variant + """ + # Calculate posterior parameters for target variant + log_target_mean = np.log(target_variant.count + EPSILON) + + # Update parameters for target variant + kappa_n_target = KAPPA_0 + target_variant.absolute_exposure + mu_n_target = (KAPPA_0 * MU_0 + target_variant.absolute_exposure * log_target_mean) / kappa_n_target + alpha_n_target = ALPHA_0 + target_variant.absolute_exposure / 2 + beta_n_target = BETA_0 + 0.5 * target_variant.absolute_exposure * LOG_VARIANCE + + # Draw samples from target variant's posterior + target_posterior = t( + df=2 * alpha_n_target, loc=mu_n_target, scale=np.sqrt(beta_n_target / (kappa_n_target * alpha_n_target)) + ) + target_samples = target_posterior.rvs(SAMPLE_SIZE) + + # Draw samples from each comparison variant's posterior + variant_samples = [] + for variant in variants: + log_variant_mean = np.log(variant.count + EPSILON) + + kappa_n = KAPPA_0 + variant.absolute_exposure + mu_n = (KAPPA_0 * MU_0 + variant.absolute_exposure * log_variant_mean) / kappa_n + alpha_n = ALPHA_0 + variant.absolute_exposure / 2 + beta_n = BETA_0 + 0.5 * variant.absolute_exposure * LOG_VARIANCE + + variant_posterior = t(df=2 * alpha_n, loc=mu_n, scale=np.sqrt(beta_n / (kappa_n * alpha_n))) + variant_samples.append(variant_posterior.rvs(SAMPLE_SIZE)) + + # Transform samples back from log space + target_samples = np.exp(target_samples) - EPSILON + variant_samples = [np.exp(samples) - EPSILON for samples in variant_samples] + + # Calculate loss + variant_max = np.maximum.reduce(variant_samples) + losses = np.maximum(0, variant_max - target_samples) + expected_loss = float(np.mean(losses)) + + return expected_loss From 78af92daee5b1013998b8ca219fe0f1c78724572 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 19 Dec 2024 14:45:17 -0800 Subject: [PATCH 32/34] More informative description --- .../experiments/trends_statistics_v2_count.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py index ec242ec3695f3..208747a14c1a1 100644 --- a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py +++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py @@ -217,20 +217,39 @@ def calculate_expected_loss_v2_count( """ Calculates expected loss in count/rate using Gamma-Poisson conjugate prior. - This implementation uses a Bayesian approach with Gamma-Poisson model - to estimate the expected loss when choosing the target variant over others. + This implementation uses a Bayesian approach with Gamma-Poisson model to estimate + the expected loss when choosing the target variant over others. The Gamma-Poisson + model is used because: + 1. 
Count data follows a Poisson distribution (discrete events over time/exposure) + 2. The Gamma distribution is the conjugate prior for the Poisson rate parameter + 3. This combination allows for analytical posterior updates and handles rate uncertainty + + The model assumes: + - Events occur independently at a constant rate + - The number of events in any interval follows a Poisson distribution + - The rate parameter has a Gamma prior distribution + - The posterior distribution of the rate is also Gamma Parameters: ----------- target_variant : ExperimentVariantTrendsBaseStats - The variant being evaluated for loss + The variant being evaluated for loss, containing count and exposure data variants : list[ExperimentVariantTrendsBaseStats] List of other variants to compare against Returns: -------- float - Expected loss in rate if choosing the target variant + Expected loss in rate if choosing the target variant. This represents + the expected difference in rate between the target variant and the best + performing alternative. + + Notes: + ------ + - Uses minimally informative prior: Gamma(1,1) + - Posterior parameters: alpha = prior_alpha + count, beta = prior_beta + exposure + - Samples are drawn from posterior distributions to estimate expected loss + - Loss is calculated as max(0, best_alternative - target) for each sample """ # Calculate posterior parameters for target variant target_alpha = ALPHA_0 + target_variant.count From 54f989ebd8b8fd9a7ae96a62292d3fcde05006a6 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 19 Dec 2024 14:52:59 -0800 Subject: [PATCH 33/34] Incorporate continuous methods into TrendsQueryRunner --- .../experiment_trends_query_runner.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py index d1d32e341821a..f43c7c1998a82 100644 --- a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py +++ b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py @@ -14,6 +14,11 @@ calculate_credible_intervals_v2_count, calculate_probabilities_v2_count, ) +from posthog.hogql_queries.experiments.trends_statistics_v2_continuous import ( + are_results_significant_v2_continuous, + calculate_credible_intervals_v2_continuous, + calculate_probabilities_v2_continuous, +) from posthog.hogql_queries.insights.trends.trends_query_runner import TrendsQueryRunner from posthog.hogql_queries.query_runner import QueryRunner from posthog.models.experiment import Experiment @@ -316,9 +321,18 @@ def run(query_runner: TrendsQueryRunner, result_key: str, is_parallel: bool): # Statistical analysis control_variant, test_variants = self._get_variants_with_base_stats(count_result, exposure_result) if self.stats_version == 2: - probabilities = calculate_probabilities_v2_count(control_variant, test_variants) - significance_code, p_value = are_results_significant_v2_count(control_variant, test_variants, probabilities) - credible_intervals = calculate_credible_intervals_v2_count([control_variant, *test_variants]) + if self.query.count_query.series[0].math: + probabilities = calculate_probabilities_v2_continuous(control_variant, test_variants) + significance_code, p_value = are_results_significant_v2_continuous( + control_variant, test_variants, probabilities + ) + credible_intervals = calculate_credible_intervals_v2_continuous([control_variant, *test_variants]) + else: + probabilities = 
calculate_probabilities_v2_count(control_variant, test_variants) + significance_code, p_value = are_results_significant_v2_count( + control_variant, test_variants, probabilities + ) + credible_intervals = calculate_credible_intervals_v2_count([control_variant, *test_variants]) else: probabilities = calculate_probabilities(control_variant, test_variants) significance_code, p_value = are_results_significant(control_variant, test_variants, probabilities) From 340ed688d90ff30107f48f7d747c950baf686547 Mon Sep 17 00:00:00 2001 From: Daniel Bachhuber Date: Thu, 19 Dec 2024 15:02:51 -0800 Subject: [PATCH 34/34] Add a test for 54f989ebd8b8fd9a7ae96a62292d3fcde05006a6 --- .../test_experiment_trends_query_runner.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py index d1f6c4905dd93..fef6608fb62ef 100644 --- a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py +++ b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py @@ -1253,6 +1253,124 @@ def test_query_runner_with_avg_math(self): result = query_runner.calculate() trend_result = cast(ExperimentTrendsQueryResponse, result) + self.assertEqual(trend_result.stats_version, 1) + self.assertEqual(trend_result.significant, False) + self.assertEqual(trend_result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(trend_result.p_value, 1.0) + + self.assertEqual(len(result.variants), 2) + + control_result = next(variant for variant in trend_result.variants if variant.key == "control") + test_result = next(variant for variant in trend_result.variants if variant.key == "test") + + control_insight = next(variant for variant in trend_result.insight if variant["breakdown_value"] == "control") + test_insight = next(variant for variant in trend_result.insight if variant["breakdown_value"] == "test") + + self.assertEqual(control_result.count, 100) + self.assertAlmostEqual(test_result.count, 205) + self.assertEqual(control_result.absolute_exposure, 1) + self.assertEqual(test_result.absolute_exposure, 3) + + self.assertEqual( + control_insight["data"], + [0.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0], + ) + self.assertEqual( + test_insight["data"], + [0.0, 50.0, 125.0, 125.0, 125.0, 205.0, 205.0, 205.0, 205.0, 205.0, 205.0, 205.0, 205.0, 205.0, 205.0], + ) + + # Uses the same values as test_query_runner_with_data_warehouse_series_avg_amount for easy comparison + @freeze_time("2020-01-01T12:00:00Z") + def test_query_runner_with_avg_math_v2_stats(self): + feature_flag = self.create_feature_flag() + experiment = self.create_experiment(feature_flag=feature_flag) + + feature_flag_property = f"$feature/{feature_flag.key}" + + count_query = TrendsQuery( + series=[ + EventsNode(event="purchase", math="avg", math_property="amount", math_property_type="event_properties") + ], + ) + exposure_query = TrendsQuery(series=[EventsNode(event="$feature_flag_called")]) + + experiment_query = ExperimentTrendsQuery( + experiment_id=experiment.id, + kind="ExperimentTrendsQuery", + count_query=count_query, + exposure_query=exposure_query, + stats_version=2, + ) + + experiment.metrics = [{"type": "primary", "query": experiment_query.model_dump()}] + experiment.save() + + query_runner = ExperimentTrendsQueryRunner( + 
query=ExperimentTrendsQuery(**experiment.metrics[0]["query"]), team=self.team + ) + + # Populate exposure events - same as data warehouse test + for variant, count in [("control", 1), ("test", 3)]: + for i in range(count): + _create_event( + team=self.team, + event="$feature_flag_called", + distinct_id=f"user_{variant}_{i}", + properties={ + "$feature_flag_response": variant, + feature_flag_property: variant, + "$feature_flag": feature_flag.key, + }, + timestamp=datetime(2020, 1, i + 1), + ) + + # Create purchase events with same amounts as data warehouse test + # Control: 1 purchase of 100 + # Test: 3 purchases of 50, 75, and 80 + _create_event( + team=self.team, + event="purchase", + distinct_id="user_control_0", + properties={feature_flag_property: "control", "amount": 100}, + timestamp=datetime(2020, 1, 2), + ) + + _create_event( + team=self.team, + event="purchase", + distinct_id="user_test_1", + properties={feature_flag_property: "test", "amount": 50}, + timestamp=datetime(2020, 1, 2), + ) + _create_event( + team=self.team, + event="purchase", + distinct_id="user_test_2", + properties={feature_flag_property: "test", "amount": 75}, + timestamp=datetime(2020, 1, 3), + ) + _create_event( + team=self.team, + event="purchase", + distinct_id="user_test_3", + properties={feature_flag_property: "test", "amount": 80}, + timestamp=datetime(2020, 1, 6), + ) + + flush_persons_and_events() + + prepared_count_query = query_runner.prepared_count_query + self.assertEqual(prepared_count_query.series[0].math, "sum") + + result = query_runner.calculate() + trend_result = cast(ExperimentTrendsQueryResponse, result) + + self.assertEqual(trend_result.stats_version, 2) + self.assertEqual(trend_result.significant, False) + self.assertEqual(trend_result.significance_code, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE) + self.assertEqual(trend_result.p_value, 1.0) + self.assertEqual(len(result.variants), 2) control_result = next(variant for variant in trend_result.variants if variant.key == "control")
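For reference, the Gamma-Poisson expected-loss calculation added in PATCH 30 above can be reduced to a short standalone sketch. The Gamma(1,1) prior, the 10,000-sample Monte Carlo draw, and the 0.01 loss threshold below mirror the patch but are illustrative assumptions, and variants are passed as plain (count, absolute_exposure) tuples rather than ExperimentVariantTrendsBaseStats objects; this is not the exact PostHog module.

import numpy as np
from scipy.stats import gamma

ALPHA_0 = 1.0        # prior shape (minimal prior knowledge)
BETA_0 = 1.0         # prior rate
SAMPLE_SIZE = 10_000
LOSS_THRESHOLD = 0.01  # assumed significance level for expected loss


def posterior_samples(count: int, exposure: float) -> np.ndarray:
    """Draw Monte Carlo samples of the event rate from the Gamma posterior."""
    return gamma.rvs(ALPHA_0 + count, scale=1 / (BETA_0 + exposure), size=SAMPLE_SIZE)


def expected_loss(target: tuple[int, float], others: list[tuple[int, float]]) -> float:
    """Expected shortfall in rate if the target variant is shipped instead of the best alternative."""
    target_samples = posterior_samples(*target)
    other_samples = [posterior_samples(*v) for v in others]
    best_other = np.maximum.reduce(other_samples)  # element-wise best competing rate per sample
    return float(np.mean(np.maximum(0, best_other - target_samples)))


if __name__ == "__main__":
    control = (1000, 10000)  # count, absolute_exposure
    test = (1075, 10000)     # slightly higher count, same exposure
    loss = expected_loss(test, [control])
    print(f"expected loss for shipping test: {loss:.5f}")
    print("significant" if loss < LOSS_THRESHOLD else "high loss")

This mirrors the decision rule in are_results_significant_v2_count: a variant is only reported SIGNIFICANT when its win probability clears MIN_PROBABILITY_FOR_SIGNIFICANCE and its expected loss stays below the significance level; otherwise HIGH_LOSS or LOW_WIN_PROBABILITY is returned.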