diff --git a/frontend/src/scenes/experiments/experimentLogic.tsx b/frontend/src/scenes/experiments/experimentLogic.tsx
index f60fc6195fb4d..ced1ac2e42d59 100644
--- a/frontend/src/scenes/experiments/experimentLogic.tsx
+++ b/frontend/src/scenes/experiments/experimentLogic.tsx
@@ -1427,8 +1427,8 @@ export const experimentLogic = kea([
             },
         ],
         credibleIntervalForVariant: [
-            (s) => [s.experimentStatsVersion],
-            (experimentStatsVersion) =>
+            () => [],
+            () =>
                 (
                     metricResult: CachedExperimentTrendsQueryResponse | CachedExperimentFunnelsQueryResponse | null,
                     variantKey: string,
@@ -1460,26 +1460,14 @@ export const experimentLogic = kea([
                 const controlVariant = (metricResult.variants as TrendExperimentVariant[]).find(
                     ({ key }) => key === 'control'
                 ) as TrendExperimentVariant
-                const variant = (metricResult.variants as TrendExperimentVariant[]).find(
-                    ({ key }) => key === variantKey
-                ) as TrendExperimentVariant
 
                 const controlMean = controlVariant.count / controlVariant.absolute_exposure
 
-                const meanLowerBound =
-                    experimentStatsVersion === 2
-                        ? credibleInterval[0] / variant.absolute_exposure
-                        : credibleInterval[0]
-                const meanUpperBound =
-                    experimentStatsVersion === 2
-                        ? credibleInterval[1] / variant.absolute_exposure
-                        : credibleInterval[1]
-
                 // Calculate the percentage difference between the credible interval bounds of the variant and the control's mean.
                 // This represents the range in which the true percentage change relative to the control is likely to fall.
-                const lowerBound = ((meanLowerBound - controlMean) / controlMean) * 100
-                const upperBound = ((meanUpperBound - controlMean) / controlMean) * 100
-                return [lowerBound, upperBound]
+                const relativeLowerBound = ((credibleInterval[0] - controlMean) / controlMean) * 100
+                const relativeUpperBound = ((credibleInterval[1] - controlMean) / controlMean) * 100
+                return [relativeLowerBound, relativeUpperBound]
             },
         ],
         getIndexForVariant: [
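The selector above now assumes the v2 credible intervals are already on the absolute scale (mean value per exposure), so it only converts them into a percentage change relative to the control mean. A minimal Python sketch of that conversion; the helper name and example numbers are assumptions for illustration, not code from the PR:

def relative_credible_interval(
    credible_interval: tuple[float, float], control_count: float, control_absolute_exposure: int
) -> list[float]:
    # Convert an absolute credible interval into % change vs. the control mean,
    # mirroring the TypeScript selector above (names here are hypothetical).
    control_mean = control_count / control_absolute_exposure
    lower = (credible_interval[0] - control_mean) / control_mean * 100
    upper = (credible_interval[1] - control_mean) / control_mean * 100
    return [lower, upper]

# Example: control totals 10,000 over 100 exposures -> mean 100.
# An interval of (95, 110) becomes roughly [-5.0, +10.0] percent.
print(relative_credible_interval((95.0, 110.0), 10_000.0, 100))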
diff --git a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py
index 291047e53aed1..bf9059f671b1c 100644
--- a/posthog/hogql_queries/experiments/experiment_trends_query_runner.py
+++ b/posthog/hogql_queries/experiments/experiment_trends_query_runner.py
@@ -4,6 +4,7 @@
 from posthog.constants import ExperimentNoResultsErrorKeys
 from posthog.hogql import ast
 from posthog.hogql_queries.experiments import CONTROL_VARIANT_KEY
+from posthog.hogql_queries.experiments.types import ExperimentMetricType
 from posthog.hogql_queries.experiments.trends_statistics import (
     are_results_significant,
     calculate_credible_intervals,
@@ -69,6 +70,8 @@ def __init__(self, *args, **kwargs):
 
         self.stats_version = self.experiment.get_stats_config("version") or 1
 
+        self._fix_math_aggregation()
+
         self.prepared_count_query = self._prepare_count_query()
         self.prepared_exposure_query = self._prepare_exposure_query()
 
@@ -86,6 +89,14 @@ def _uses_math_aggregation_by_user_or_property_value(self, query: TrendsQuery):
             math_keys.remove("sum")
         return any(entity.math in math_keys for entity in query.series)
 
+    def _fix_math_aggregation(self):
+        """
+        Switch unsupported math aggregations to SUM
+        """
+        uses_math_aggregation = self._uses_math_aggregation_by_user_or_property_value(self.query.count_query)
+        if uses_math_aggregation:
+            self.query.count_query.series[0].math = PropertyMathType.SUM
+
     def _get_date_range(self) -> DateRange:
         """
         Returns an DateRange object based on the experiment's start and end dates,
@@ -117,6 +128,14 @@ def _get_data_warehouse_breakdown_filter(self) -> BreakdownFilter:
             breakdown_type="data_warehouse",
         )
 
+    def _get_metric_type(self) -> ExperimentMetricType:
+        # Currently, we rely on the math type to determine the metric type
+        match self.query.count_query.series[0].math:
+            case PropertyMathType.SUM | "hogql":
+                return ExperimentMetricType.CONTINUOUS
+            case _:
+                return ExperimentMetricType.COUNT
+
     def _prepare_count_query(self) -> TrendsQuery:
         """
         This method takes the raw trend query and adapts it
@@ -129,13 +148,6 @@ def _prepare_count_query(self) -> TrendsQuery:
         """
         prepared_count_query = TrendsQuery(**self.query.count_query.model_dump())
 
-        uses_math_aggregation = self._uses_math_aggregation_by_user_or_property_value(prepared_count_query)
-
-        # Only SUM is supported now, but some earlier experiments AVG. That does not
-        # make sense as input for experiment analysis, so we'll swithc that to SUM here
-        if uses_math_aggregation:
-            prepared_count_query.series[0].math = PropertyMathType.SUM
-
         prepared_count_query.trendsFilter = TrendsFilter(display=ChartDisplayType.ACTIONS_LINE_GRAPH_CUMULATIVE)
         prepared_count_query.dateRange = self._get_date_range()
         if self._is_data_warehouse_query(prepared_count_query):
@@ -270,18 +282,21 @@ def run(query_runner: TrendsQueryRunner, result_key: str, is_parallel: bool):
         # Statistical analysis
         control_variant, test_variants = self._get_variants_with_base_stats(count_result, exposure_result)
         if self.stats_version == 2:
-            if self.query.count_query.series[0].math:
-                probabilities = calculate_probabilities_v2_continuous(control_variant, test_variants)
-                significance_code, p_value = are_results_significant_v2_continuous(
-                    control_variant, test_variants, probabilities
-                )
-                credible_intervals = calculate_credible_intervals_v2_continuous([control_variant, *test_variants])
-            else:
-                probabilities = calculate_probabilities_v2_count(control_variant, test_variants)
-                significance_code, p_value = are_results_significant_v2_count(
-                    control_variant, test_variants, probabilities
-                )
-                credible_intervals = calculate_credible_intervals_v2_count([control_variant, *test_variants])
+            match self._get_metric_type():
+                case ExperimentMetricType.CONTINUOUS:
+                    probabilities = calculate_probabilities_v2_continuous(control_variant, test_variants)
+                    significance_code, p_value = are_results_significant_v2_continuous(
+                        control_variant, test_variants, probabilities
+                    )
+                    credible_intervals = calculate_credible_intervals_v2_continuous([control_variant, *test_variants])
+                case ExperimentMetricType.COUNT:
+                    probabilities = calculate_probabilities_v2_count(control_variant, test_variants)
+                    significance_code, p_value = are_results_significant_v2_count(
+                        control_variant, test_variants, probabilities
+                    )
+                    credible_intervals = calculate_credible_intervals_v2_count([control_variant, *test_variants])
+                case _:
+                    raise ValueError(f"Unsupported metric type: {self._get_metric_type()}")
         else:
             probabilities = calculate_probabilities(control_variant, test_variants)
             significance_code, p_value = are_results_significant(control_variant, test_variants, probabilities)
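The runner now rewrites unsupported math aggregations up front (_fix_math_aggregation) and then dispatches on _get_metric_type(): SUM and HogQL math are treated as continuous metrics, everything else as counts. A standalone sketch of that mapping, assuming plain string math values rather than the real PropertyMathType/BaseMathType enums:

from enum import Enum

class MetricType(Enum):  # stand-in for ExperimentMetricType from types.py later in this diff
    COUNT = "count"
    CONTINUOUS = "continuous"

def metric_type_for_math(math: str | None) -> MetricType:
    # AVG is rewritten to SUM before dispatch, mirroring _fix_math_aggregation
    if math == "avg":
        math = "sum"
    return MetricType.CONTINUOUS if math in ("sum", "hogql") else MetricType.COUNT

assert metric_type_for_math("dau") is MetricType.COUNT
assert metric_type_for_math("avg") is MetricType.CONTINUOUS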
diff --git a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py
index d665ce227011d..6c8062b969167 100644
--- a/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py
+++ b/posthog/hogql_queries/experiments/test/test_experiment_trends_query_runner.py
@@ -1,15 +1,18 @@
 from django.test import override_settings
 from ee.clickhouse.materialized_columns.columns import get_enabled_materialized_columns, materialize
 from posthog.hogql_queries.experiments.experiment_trends_query_runner import ExperimentTrendsQueryRunner
+from posthog.hogql_queries.experiments.types import ExperimentMetricType
 from posthog.models.experiment import Experiment, ExperimentHoldout
 from posthog.models.feature_flag.feature_flag import FeatureFlag
 from posthog.schema import (
+    BaseMathType,
     DataWarehouseNode,
     EventsNode,
     ExperimentSignificanceCode,
     ExperimentTrendsQuery,
     ExperimentTrendsQueryResponse,
     PersonsOnEventsMode,
+    PropertyMathType,
     TrendsQuery,
 )
 from posthog.settings import (
@@ -27,7 +30,7 @@
     flush_persons_and_events,
 )
 from freezegun import freeze_time
-from typing import cast
+from typing import cast, Any
 from django.utils import timezone
 from datetime import datetime, timedelta
 from posthog.test.test_journeys import journeys_for
@@ -2363,3 +2366,47 @@ def test_validate_event_variants_no_exposure(self):
             }
         )
         self.assertEqual(cast(list, context.exception.detail)[0], expected_errors)
+
+    def test_get_metric_type(self):
+        feature_flag = self.create_feature_flag()
+        experiment = self.create_experiment(feature_flag=feature_flag)
+
+        # Test allowed count math types
+        allowed_count_math_types = [BaseMathType.TOTAL, BaseMathType.DAU, BaseMathType.UNIQUE_SESSION, None]
+        for math_type in allowed_count_math_types:
+            count_query = TrendsQuery(series=[EventsNode(event="$pageview", math=math_type)])
+            experiment_query = ExperimentTrendsQuery(
+                experiment_id=experiment.id,
+                kind="ExperimentTrendsQuery",
+                count_query=count_query,
+            )
+            query_runner = ExperimentTrendsQueryRunner(query=experiment_query, team=self.team)
+            self.assertEqual(query_runner._get_metric_type(), ExperimentMetricType.COUNT)
+
+        # Test allowed sum math types
+        allowed_sum_math_types: list[Any] = [PropertyMathType.SUM, "hogql"]
+        for math_type in allowed_sum_math_types:
+            count_query = TrendsQuery(
+                series=[EventsNode(event="checkout completed", math=math_type, math_property="revenue")]
+            )
+            experiment_query = ExperimentTrendsQuery(
+                experiment_id=experiment.id,
+                kind="ExperimentTrendsQuery",
+                count_query=count_query,
+            )
+            query_runner = ExperimentTrendsQueryRunner(query=experiment_query, team=self.team)
+            self.assertEqual(query_runner._get_metric_type(), ExperimentMetricType.CONTINUOUS)
+
+        # Test that AVG math gets converted to SUM and returns CONTINUOUS
+        count_query = TrendsQuery(
+            series=[EventsNode(event="checkout completed", math=PropertyMathType.AVG, math_property="revenue")]
+        )
+        experiment_query = ExperimentTrendsQuery(
+            experiment_id=experiment.id,
+            kind="ExperimentTrendsQuery",
+            count_query=count_query,
+        )
+        query_runner = ExperimentTrendsQueryRunner(query=experiment_query, team=self.team)
+        self.assertEqual(query_runner._get_metric_type(), ExperimentMetricType.CONTINUOUS)
+        # Verify the math type was converted to sum
+        self.assertEqual(query_runner.query.count_query.series[0].math, PropertyMathType.SUM)
diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
index d1185abd73c80..b3da850d35c57 100644
--- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
+++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
@@ -14,9 +14,13 @@
 from flaky import flaky
 
 
-def create_variant(key: str, mean: float, exposure: float, absolute_exposure: int) -> ExperimentVariantTrendsBaseStats:
-    # Note: We use the count field to store the mean value for continuous metrics
-    return ExperimentVariantTrendsBaseStats(key=key, count=mean, exposure=exposure, absolute_exposure=absolute_exposure)
+def create_variant(
+    key: str, total_sum: float, exposure: float, absolute_exposure: int
+) -> ExperimentVariantTrendsBaseStats:
+    # Note: We use the count field to store the total sum for continuous metrics
+    return ExperimentVariantTrendsBaseStats(
+        key=key, count=total_sum, exposure=exposure, absolute_exposure=absolute_exposure
+    )
 
 
 class TestExperimentTrendsStatisticsContinuous(APIBaseTest):
@@ -45,11 +49,18 @@ def test_small_sample_two_variants_not_significant(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 100
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 100
+            test_mean = 105.0
             test = create_variant(
                 "test",
-                mean=105.0,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -73,17 +84,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test"][0], 80, delta=5)  # Lower bound
                 self.assertAlmostEqual(intervals["test"][1], 120, delta=5)  # Upper bound
             else:
-                # Original implementation behavior for small sample
-                self.assertAlmostEqual(probabilities[0], 0.5, delta=0.2)
-                self.assertAlmostEqual(probabilities[1], 0.5, delta=0.2)
-                self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
-                self.assertEqual(p_value, 1)
+                self.assertAlmostEqual(probabilities[0], 0.0, delta=0.005)
+                self.assertAlmostEqual(probabilities[1], 1.0, delta=0.005)
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertAlmostEqual(p_value, 0.00049, delta=0.00001)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                self.assertAlmostEqual(intervals["control"][0], 1.0, delta=0.2)  # Lower bound is less than mean
-                self.assertAlmostEqual(intervals["control"][1], 1.2, delta=0.1)  # Upper bound is greater than mean
-                self.assertAlmostEqual(intervals["test"][0], 1.0, delta=0.2)
-                self.assertAlmostEqual(intervals["test"][1], 1.2, delta=0.1)
+                # Control: ~$100 mean with narrow interval due to old implementation
+                self.assertAlmostEqual(intervals["control"][0], 98, delta=3)
+                self.assertAlmostEqual(intervals["control"][1], 102, delta=3)
+
+                # Test: ~$105 mean with narrow interval due to old implementation
+                self.assertAlmostEqual(intervals["test"][0], 103, delta=3)
+                self.assertAlmostEqual(intervals["test"][1], 107, delta=3)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -93,11 +105,18 @@ def test_large_sample_two_variants_significant(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 10000
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 10000
+            test_mean = 120.0
             test = create_variant(
                 "test",
-                mean=120.0,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -121,19 +140,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test"][0], 116, delta=2)  # Lower bound
                 self.assertAlmostEqual(intervals["test"][1], 122, delta=2)  # Upper bound
             else:
-                # Original implementation behavior for large sample
-                self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25)
-                self.assertAlmostEqual(probabilities[0], 0.25, delta=0.25)
-                self.assertTrue(
-                    significance in [ExperimentSignificanceCode.HIGH_P_VALUE, ExperimentSignificanceCode.SIGNIFICANT]
-                )
-                self.assertAlmostEqual(p_value, 0.15, delta=0.15)
+                self.assertAlmostEqual(probabilities[1], 1.0, delta=0.025)
+                self.assertAlmostEqual(probabilities[0], 0.0, delta=0.025)
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertEqual(p_value, 0)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                self.assertAlmostEqual(intervals["control"][0], 0.05, delta=0.05)  # Lower bound less than mean
-                self.assertAlmostEqual(intervals["control"][1], 0.015, delta=0.005)  # Upper bound greater than mean
-                self.assertAlmostEqual(intervals["test"][0], 0.05, delta=0.05)  # Lower bound less than mean
-                self.assertAlmostEqual(intervals["test"][1], 0.015, delta=0.005)  # Upper bound greater than mean
+                # Control: $100 mean with narrow interval due to large sample
+                self.assertAlmostEqual(intervals["control"][0], 99.8, delta=2)
+                self.assertAlmostEqual(intervals["control"][1], 100.2, delta=2)
+
+                # Test: $120 mean with narrow interval due to large sample
+                self.assertAlmostEqual(intervals["test"][0], 119, delta=2)
+                self.assertAlmostEqual(intervals["test"][1], 121, delta=2)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -143,11 +161,18 @@ def test_large_sample_two_variants_strongly_significant(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 10000
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 10000
+            test_mean = 150.0
             test = create_variant(
                 "test",
-                mean=150.0,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -164,22 +189,25 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertEqual(p_value, 0)
 
                 # Control: $100 mean
-                self.assertAlmostEqual(intervals["control"][0], 97, delta=2)  # Lower bound
-                self.assertAlmostEqual(intervals["control"][1], 103, delta=2)  # Upper bound
+                self.assertAlmostEqual(intervals["control"][0], 99.8, delta=2)
+                self.assertAlmostEqual(intervals["control"][1], 100.2, delta=2)
 
                 # Test: $150 mean, clearly higher than control
-                self.assertAlmostEqual(intervals["test"][0], 146, delta=3)  # Lower bound
-                self.assertAlmostEqual(intervals["test"][1], 154, delta=3)  # Upper bound
+                self.assertAlmostEqual(intervals["test"][0], 146, delta=3)
+                self.assertAlmostEqual(intervals["test"][1], 154, delta=3)
             else:
-                # Original implementation behavior for strongly significant case
-                self.assertTrue(probabilities[1] > 0.5)  # Test variant winning
-                self.assertTrue(probabilities[0] < 0.5)  # Control variant losing
+                self.assertAlmostEqual(probabilities[1], 1.0, delta=0.005)
+                self.assertAlmostEqual(probabilities[0], 0.0, delta=0.005)
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
-                self.assertLess(p_value, 0.05)
+                self.assertEqual(p_value, 0)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                # For strongly significant differences, the intervals should not overlap when scaled
-                self.assertTrue(intervals["control"][1] * 100 < intervals["test"][0] * 150)
+                # Control: $100 mean
+                self.assertAlmostEqual(intervals["control"][0], 99.8, delta=2)
+                self.assertAlmostEqual(intervals["control"][1], 100.2, delta=2)
+
+                # Test: $150 mean, clearly higher than control
+                self.assertAlmostEqual(intervals["test"][0], 149, delta=3)
+                self.assertAlmostEqual(intervals["test"][1], 151, delta=3)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -189,25 +217,34 @@ def test_many_variants_not_significant(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 1000
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_a_absolute_exposure = 1000
+            test_a_mean = 98.0
             test_a = create_variant(
                 "test_a",
-                mean=98.0,
+                total_sum=test_a_mean * test_a_absolute_exposure,
                 exposure=test_a_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_a_absolute_exposure,
             )
             test_b_absolute_exposure = 1000
+            test_b_mean = 102.0
             test_b = create_variant(
                 "test_b",
-                mean=102.0,
+                total_sum=test_b_mean * test_b_absolute_exposure,
                 exposure=test_b_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_b_absolute_exposure,
            )
             test_c_absolute_exposure = 1000
+            test_c_mean = 101.0
             test_c = create_variant(
                 "test_c",
-                mean=101.0,
+                total_sum=test_c_mean * test_c_absolute_exposure,
                 exposure=test_c_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_c_absolute_exposure,
             )
@@ -239,27 +276,26 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test_c"][0], 95, delta=5)  # Lower bound
                 self.assertAlmostEqual(intervals["test_c"][1], 105, delta=5)  # Upper bound
             else:
-                # Original implementation behavior for multiple variants with no clear winner
-                self.assertTrue(all(0.1 < p < 0.9 for p in probabilities))
-                self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
-                self.assertEqual(p_value, 1)
+                self.assertTrue(any(p > MIN_PROBABILITY_FOR_SIGNIFICANCE for p in probabilities))
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertAlmostEqual(p_value, 0, delta=0.00001)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
+                # Generally narrower intervals in old implementation
                 # Control variant
-                self.assertAlmostEqual(intervals["control"][0], 0.085, delta=0.01)  # ~8.5%
-                self.assertAlmostEqual(intervals["control"][1], 0.12, delta=0.01)  # ~12%
+                self.assertAlmostEqual(intervals["control"][0], 99.8, delta=5)
+                self.assertAlmostEqual(intervals["control"][1], 100.2, delta=5)
 
                 # Test A variant
-                self.assertAlmostEqual(intervals["test_a"][0], 0.085, delta=0.01)  # ~8.5%
-                self.assertAlmostEqual(intervals["test_a"][1], 0.12, delta=0.01)  # ~12%
+                self.assertAlmostEqual(intervals["test_a"][0], 97, delta=5)
+                self.assertAlmostEqual(intervals["test_a"][1], 99, delta=5)
 
                 # Test B variant
-                self.assertAlmostEqual(intervals["test_b"][0], 0.085, delta=0.01)  # ~8.5%
-                self.assertAlmostEqual(intervals["test_b"][1], 0.12, delta=0.01)  # ~12%
+                self.assertAlmostEqual(intervals["test_b"][0], 101, delta=5)
+                self.assertAlmostEqual(intervals["test_b"][1], 103, delta=5)
 
                 # Test C variant
-                self.assertAlmostEqual(intervals["test_c"][0], 0.085, delta=0.01)  # ~8.5%
-                self.assertAlmostEqual(intervals["test_c"][1], 0.12, delta=0.01)  # ~12%
+                self.assertAlmostEqual(intervals["test_c"][0], 99, delta=5)
+                self.assertAlmostEqual(intervals["test_c"][1], 101, delta=5)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -269,25 +305,34 @@ def test_many_variants_significant(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 10000
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_a_absolute_exposure = 10000
+            test_a_mean = 105.0
             test_a = create_variant(
                 "test_a",
-                mean=105.0,
+                total_sum=test_a_mean * test_a_absolute_exposure,
                 exposure=test_a_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_a_absolute_exposure,
             )
             test_b_absolute_exposure = 10000
+            test_b_mean = 150.0
             test_b = create_variant(
                 "test_b",
-                mean=150.0,
+                total_sum=test_b_mean * test_b_absolute_exposure,
                 exposure=test_b_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_b_absolute_exposure,
             )
             test_c_absolute_exposure = 10000
+            test_c_mean = 110.0
             test_c = create_variant(
                 "test_c",
-                mean=110.0,
+                total_sum=test_c_mean * test_c_absolute_exposure,
                 exposure=test_c_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_c_absolute_exposure,
             )
@@ -298,9 +343,9 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
             self.assertEqual(len(probabilities), 4)
             if stats_version == 2:
-                self.assertTrue(probabilities[2] > 0.9)  # test_b should be winning
-                self.assertTrue(probabilities[1] < 0.1)  # test_a should be losing
-                self.assertTrue(probabilities[0] < 0.1)  # control should be losing
+                self.assertTrue(probabilities[2] > 0.9)
+                self.assertTrue(probabilities[1] < 0.1)
+                self.assertTrue(probabilities[0] < 0.1)
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
                 self.assertEqual(p_value, 0)
 
@@ -320,16 +365,27 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test_c"][0], 108, delta=1)
                 self.assertAlmostEqual(intervals["test_c"][1], 112, delta=1)
             else:
-                # Original implementation behavior for multiple variants with clear winner
-                self.assertTrue(probabilities[2] > 0.5)  # test_b should be winning
+                self.assertTrue(probabilities[2] > 0.9)
+                self.assertTrue(probabilities[1] < 0.1)
+                self.assertTrue(probabilities[0] < 0.1)
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
-                self.assertLess(p_value, 0.05)
+                self.assertEqual(p_value, 0)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                # Test B (150.0) should have non-overlapping intervals with others when scaled
-                self.assertTrue(intervals["control"][1] * 100 < intervals["test_b"][0] * 150)
-                self.assertTrue(intervals["test_a"][1] * 105 < intervals["test_b"][0] * 150)
-                self.assertTrue(intervals["test_c"][1] * 110 < intervals["test_b"][0] * 150)
+                # Control at $100
+                self.assertAlmostEqual(intervals["control"][0], 99.0, delta=1)
+                self.assertAlmostEqual(intervals["control"][1], 101.0, delta=1)
+
+                # Test A slightly higher at $105
+                self.assertAlmostEqual(intervals["test_a"][0], 105, delta=1)
+                self.assertAlmostEqual(intervals["test_a"][1], 105, delta=1)
+
+                # Test B clearly winning at $150
+                self.assertAlmostEqual(intervals["test_b"][0], 150, delta=1)
+                self.assertAlmostEqual(intervals["test_b"][1], 150, delta=1)
+
+                # Test C slightly higher at $110
+                self.assertAlmostEqual(intervals["test_c"][0], 110, delta=1)
+                self.assertAlmostEqual(intervals["test_c"][1], 110, delta=1)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -339,11 +395,18 @@ def test_insufficient_sample_size(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 50
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 50
+            test_mean = 120.0
             test = create_variant(
                 "test",
-                mean=120.0,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -366,17 +429,17 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test"][0], 85, delta=10)
                 self.assertAlmostEqual(intervals["test"][1], 140, delta=10)
             else:
-                # Original implementation behavior for insufficient sample size
-                self.assertAlmostEqual(probabilities[0], 0.075, delta=0.025)
-                self.assertAlmostEqual(probabilities[1], 0.925, delta=0.075)
+                self.assertAlmostEqual(probabilities[0], 0.25, delta=0.25)
+                self.assertAlmostEqual(probabilities[1], 0.75, delta=0.25)
                 self.assertEqual(significance, ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
                 self.assertEqual(p_value, 1.0)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                self.assertAlmostEqual(intervals["control"][0], 1.65, delta=0.15)
-                self.assertAlmostEqual(intervals["control"][1], 2.45, delta=0.15)
-                self.assertAlmostEqual(intervals["test"][0], 1.95, delta=0.15)
-                self.assertAlmostEqual(intervals["test"][1], 2.75, delta=0.15)
+                # Generally narrower intervals in old implementation
+                self.assertAlmostEqual(intervals["control"][0], 97, delta=3)
+                self.assertAlmostEqual(intervals["control"][1], 102, delta=3)
+
+                self.assertAlmostEqual(intervals["test"][0], 117, delta=3)
+                self.assertAlmostEqual(intervals["test"][1], 123, delta=3)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -386,11 +449,18 @@ def test_edge_cases_zero_means(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 1000
-            control = create_variant("control", mean=0.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 0.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 1000
+            test_mean = 0.0
             test = create_variant(
                 "test",
-                mean=0.0,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -401,8 +471,8 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
             self.assertEqual(len(probabilities), 2)
             if stats_version == 2:
-                self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1)  # Should be close to 50/50
-                self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1)  # Should be close to 50/50
+                self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1)
+                self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1)
                 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
                 self.assertEqual(p_value, 1)
 
@@ -413,18 +483,17 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test"][0], 0, delta=0.05)
                 self.assertAlmostEqual(intervals["test"][1], 0, delta=0.05)
             else:
-                # Original implementation behavior for zero means
                 self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1)
                 self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1)
                 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
                 self.assertEqual(p_value, 1)
 
-                # Original implementation returns intervals as ratios/multipliers of the mean
-                # For zero means, the intervals should still be valid ratios
-                self.assertAlmostEqual(intervals["control"][0], 0, delta=0.1)
-                self.assertAlmostEqual(intervals["control"][1], 0, delta=0.1)
-                self.assertAlmostEqual(intervals["test"][0], 0, delta=0.1)
-                self.assertAlmostEqual(intervals["test"][1], 0, delta=0.1)
+                # Both variants should have very small intervals near zero
+                self.assertAlmostEqual(intervals["control"][0], 0, delta=0.05)
+                self.assertAlmostEqual(intervals["control"][1], 0, delta=0.05)
+
+                self.assertAlmostEqual(intervals["test"][0], 0, delta=0.05)
+                self.assertAlmostEqual(intervals["test"][1], 0, delta=0.05)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -433,18 +502,19 @@ def test_edge_cases_near_zero_means(self):
         """Test edge cases like near-zero means"""
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
-            # Using very small positive values instead of exact zeros
             control_absolute_exposure = 1000
+            control_mean = 0.0001
             control = create_variant(
                 "control",
-                mean=0.0001,
+                total_sum=control_mean * control_absolute_exposure,
                 exposure=1,
                 absolute_exposure=control_absolute_exposure,
             )
             test_absolute_exposure = 1000
+            test_mean = 0.0001
             test = create_variant(
                 "test",
-                mean=0.0001,
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -468,7 +538,6 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
                 self.assertAlmostEqual(intervals["test"][0], 0.0001, delta=0.00015)  # Lower bound
                 self.assertAlmostEqual(intervals["test"][1], 0.0001, delta=0.00015)  # Upper bound
             else:
-                # Original implementation behavior for near-zero means
                 self.assertAlmostEqual(probabilities[0], 0.5, delta=0.1)
                 self.assertAlmostEqual(probabilities[1], 0.5, delta=0.1)
                 self.assertEqual(significance, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
@@ -490,11 +559,18 @@ def test_expected_loss_minimal_difference(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 600
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 600
+            test_mean = 120.0
             test = create_variant(
                 "test",
-                mean=120.0,  # Slightly higher mean
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -504,13 +580,12 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
             if stats_version == 2:
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
-                # Expected loss should be relatively small
-                self.assertLess(expected_loss, 3.0)  # Less than $3 expected loss
-                self.assertGreater(expected_loss, 0)  # But still some loss
+                self.assertLess(expected_loss, 3.0)
+                self.assertGreater(expected_loss, 0)
             else:
-                # Original implementation behavior (returns p_value in expected_loss)
-                self.assertEqual(significance, ExperimentSignificanceCode.HIGH_P_VALUE)
-                self.assertAlmostEqual(expected_loss, 0.2, delta=0.1)
+                self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
+                self.assertLess(expected_loss, 3.0)
+                self.assertGreater(expected_loss, 0)
 
         self.run_test_for_both_implementations(run_test)
 
@@ -520,11 +595,18 @@ def test_expected_loss_test_variant_clear_winner(self):
 
         def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
             control_absolute_exposure = 10000
-            control = create_variant("control", mean=100.0, exposure=1, absolute_exposure=control_absolute_exposure)
+            control_mean = 100.0
+            control = create_variant(
+                "control",
+                total_sum=control_mean * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
             test_absolute_exposure = 10000
+            test_mean = 200.0
             test = create_variant(
                 "test",
-                mean=200.0,  # Much higher mean
+                total_sum=test_mean * test_absolute_exposure,
                 exposure=test_absolute_exposure / control_absolute_exposure,
                 absolute_exposure=test_absolute_exposure,
             )
@@ -534,11 +616,9 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
             if stats_version == 2:
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
-                # Expected loss should be very close to zero since test is clearly better
-                self.assertLess(expected_loss, 0.1)  # Essentially zero loss
+                self.assertLess(expected_loss, 0.1)
             else:
-                # Original implementation behavior
                 self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
-                self.assertLess(expected_loss, 0.001)
+                self.assertLess(expected_loss, 0.1)
 
         self.run_test_for_both_implementations(run_test)
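The updated tests above encode the new convention: create_variant stores the variant's total sum in the count field, and the v2 statistics divide by absolute_exposure to recover the per-exposure mean. A small illustrative check with assumed numbers:

# Hypothetical variant: 100 exposures averaging 105.0 each
absolute_exposure = 100
mean = 105.0
total_sum = mean * absolute_exposure            # what create_variant now stores in `count`
recovered_mean = total_sum / absolute_exposure  # what the v2 statistics compute back out
assert recovered_mean == mean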
diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
index 4931d0a07212c..359951ba38f07 100644
--- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
+++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
@@ -254,7 +254,8 @@ def calculate_credible_intervals_v2_continuous(variants, lower_bound=0.025, uppe
     for variant in variants:
         try:
             # Log-transform the mean value, adding epsilon to handle zeros
-            log_mean = np.log(variant.count + EPSILON)  # Using count field to store mean value
+            mean_value = variant.count / variant.absolute_exposure
+            log_mean = np.log(mean_value + EPSILON)
 
             # Calculate posterior parameters using absolute_exposure
             kappa_n = KAPPA_0 + variant.absolute_exposure
@@ -306,7 +307,8 @@ def calculate_expected_loss_v2_continuous(
         Expected loss in mean value if choosing the target variant
     """
     # Calculate posterior parameters for target variant
-    log_target_mean = np.log(target_variant.count + EPSILON)
+    target_mean = target_variant.count / target_variant.absolute_exposure
+    log_target_mean = np.log(target_mean + EPSILON)
 
     # Update parameters for target variant
     kappa_n_target = KAPPA_0 + target_variant.absolute_exposure
@@ -323,7 +325,8 @@ def calculate_expected_loss_v2_continuous(
     # Draw samples from each comparison variant's posterior
     variant_samples = []
     for variant in variants:
-        log_variant_mean = np.log(variant.count + EPSILON)
+        variant_mean = variant.count / variant.absolute_exposure
+        log_variant_mean = np.log(variant_mean + EPSILON)
         kappa_n = KAPPA_0 + variant.absolute_exposure
         mu_n = (KAPPA_0 * MU_0 + variant.absolute_exposure * log_variant_mean) / kappa_n
 
diff --git a/posthog/hogql_queries/experiments/types.py b/posthog/hogql_queries/experiments/types.py
new file mode 100644
index 0000000000000..17a29aef766fd
--- /dev/null
+++ b/posthog/hogql_queries/experiments/types.py
@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class ExperimentMetricType(Enum):
+    COUNT = "count"
+    CONTINUOUS = "continuous"
+    FUNNEL = "funnel"
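In the continuous v2 statistics above, each variant's mean is now derived from the stored total (count / absolute_exposure) before being log-transformed for the posterior update. A simplified sketch of that step; the prior constants KAPPA_0, MU_0 and EPSILON are assumed placeholder values here rather than the ones defined in the module:

import numpy as np

KAPPA_0 = 1.0    # assumed prior pseudo-count
MU_0 = 0.0       # assumed prior log-mean
EPSILON = 1e-10  # guards against log(0)

def posterior_log_mean(total_sum: float, absolute_exposure: int) -> float:
    # Posterior mean of the log-transformed metric mean for one variant.
    mean_value = total_sum / absolute_exposure
    log_mean = np.log(mean_value + EPSILON)
    kappa_n = KAPPA_0 + absolute_exposure
    return (KAPPA_0 * MU_0 + absolute_exposure * log_mean) / kappa_n

# 10,000 exposures totalling 1,200,000 -> mean 120; result stays close to log(120) ≈ 4.79
print(posterior_log_mean(1_200_000.0, 10_000))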