From 3328603f3596a9a71777b076b06d3cd189076c49 Mon Sep 17 00:00:00 2001 From: Differential Privacy Team Date: Tue, 29 Oct 2024 03:56:24 -0700 Subject: [PATCH] Publish PipelineDP4j, JVM version of PipelineDP PipelineDP4j is an end-to-end differential privacy solution for JVM that supports various frameworks for distributed data processing such as Apache Spark and Apache Beam. It is the JVM implementation of PipelineDP (https://pipelinedp.io/), and is conceptually similar to Privacy on Beam. Change-Id: I84a0da8fbfd55bbbb37a7013e66c22e1f222e566 GitOrigin-RevId: f574c7e2a8601092d073377ef4afc07187697132 --- examples/pipelinedp4j/.bazelrc | 18 + examples/pipelinedp4j/.bazelversion | 1 + examples/pipelinedp4j/BUILD.bazel | 34 + examples/pipelinedp4j/BeamExample.java | 205 +++ examples/pipelinedp4j/MovieMetrics.java | 45 + examples/pipelinedp4j/MovieView.java | 55 + examples/pipelinedp4j/README.md | 63 + examples/pipelinedp4j/WORKSPACE.bazel | 91 ++ pipelinedp4j/BUILD.bazel | 49 + pipelinedp4j/README.md | 34 + pipelinedp4j/WORKSPACE.bazel | 152 +++ .../pipelinedp4j/api/AggregationSpec.kt | 233 ++++ .../pipelinedp4j/api/BUILD.bazel | 43 + .../pipelinedp4j/api/BeamQuery.kt | 96 ++ .../pipelinedp4j/api/BeamQueryBuilder.kt | 67 + .../pipelinedp4j/api/BudgetSpec.kt | 150 +++ .../pipelinedp4j/api/NoiseKind.kt | 37 + .../pipelinedp4j/api/PipelineDpCollection.kt | 54 + .../pipelinedp4j/api/Query.kt | 122 ++ .../pipelinedp4j/api/QueryBuilder.kt | 229 ++++ .../pipelinedp4j/api/QueryPerGroupResult.kt | 30 + .../pipelinedp4j/beam/BUILD.bazel | 57 + .../pipelinedp4j/beam/BeamCollection.kt | 89 ++ .../pipelinedp4j/beam/BeamDpEngineFactory.kt | 27 + .../pipelinedp4j/beam/BeamEncoders.kt | 68 + .../pipelinedp4j/beam/BeamTable.kt | 269 ++++ .../pipelinedp4j/core/BUILD.bazel | 119 ++ .../pipelinedp4j/core/Combiners.kt | 1021 ++++++++++++++ .../pipelinedp4j/core/ComputationalGraph.kt | 275 ++++ .../pipelinedp4j/core/ContributionSampler.kt | 66 + .../pipelinedp4j/core/CoreTypes.kt | 54 + 
.../pipelinedp4j/core/DataExtractors.kt | 82 ++ .../pipelinedp4j/core/DpEngine.kt | 477 +++++++ .../pipelinedp4j/core/DpFunctionsParams.kt | 437 ++++++ .../pipelinedp4j/core/Encoders.kt | 55 + .../pipelinedp4j/core/FrameworkCollection.kt | 60 + .../pipelinedp4j/core/FrameworkTable.kt | 122 ++ .../pipelinedp4j/core/NoPrivacySampler.kt | 38 + .../core/PartitionAndPerPartitionSampler.kt | 69 + .../pipelinedp4j/core/PartitionSampler.kt | 110 ++ .../core/PerPartitionContributionsSampler.kt | 76 ++ .../pipelinedp4j/core/PrivatePartitions.kt | 132 ++ .../pipelinedp4j/core/PublicPartitions.kt | 86 ++ .../core/budget/AllocatedBudget.kt | 79 ++ .../pipelinedp4j/core/budget/BUILD.bazel | 43 + .../core/budget/BudgetAccountant.kt | 266 ++++ .../pipelinedp4j/core/budget/BudgetSpec.kt | 122 ++ .../pipelinedp4j/dplibrary/BUILD.bazel | 38 + .../pipelinedp4j/dplibrary/NoiseFactories.kt | 40 + ...PreAggregationPartitionSelectionFactory.kt | 35 + .../pipelinedp4j/local/BUILD.bazel | 51 + .../pipelinedp4j/local/LocalCollection.kt | 54 + .../local/LocalDpEngineFactory.kt | 24 + .../pipelinedp4j/local/LocalEncoderFactory.kt | 45 + .../pipelinedp4j/local/LocalTable.kt | 114 ++ .../pipelinedp4j/proto/BUILD.bazel | 41 + .../pipelinedp4j/proto/accumulators.proto | 59 + .../pipelinedp4j/proto/dpaggregates.proto | 40 + pipelinedp4j/pom.template | 42 + .../pipelinedp4j/api/ApiTests.kt | 23 + .../pipelinedp4j/api/BUILD.bazel | 37 + .../pipelinedp4j/api/BeamQueryBuilderTest.kt | 98 ++ .../pipelinedp4j/api/BeamQueryTest.kt | 175 +++ .../pipelinedp4j/beam/BUILD.bazel | 42 + .../pipelinedp4j/beam/BeamCollectionTest.kt | 106 ++ .../pipelinedp4j/beam/BeamEncodersTest.kt | 143 ++ .../pipelinedp4j/beam/BeamTableTest.kt | 344 +++++ .../pipelinedp4j/beam/BeamTests.kt | 25 + .../pipelinedp4j/core/BUILD.bazel | 89 ++ .../pipelinedp4j/core/CompoundCombinerTest.kt | 538 ++++++++ .../pipelinedp4j/core/CoreTests.kt | 49 + .../pipelinedp4j/core/CountCombinerTest.kt | 190 +++ 
.../pipelinedp4j/core/DataExtractorsTest.kt | 65 + .../pipelinedp4j/core/DpEngineTest.kt | 1171 +++++++++++++++++ .../pipelinedp4j/core/DpEngineTestFactory.kt | 34 + .../core/DpFunctionsParamsTest.kt | 490 +++++++ .../pipelinedp4j/core/EndToEndTest.kt | 279 ++++ .../core/ExactPrivacyIdCountCombinerTest.kt | 66 + .../pipelinedp4j/core/MeanCombinerTest.kt | 433 ++++++ .../pipelinedp4j/core/NoPrivacySamplerTest.kt | 93 ++ .../PartitionAndPerPartitionSamplerTest.kt | 183 +++ .../pipelinedp4j/core/PartitionSamplerTest.kt | 133 ++ .../core/PartitionSamplerWithoutValuesTest.kt | 131 ++ .../PerPartitionContributionsSamplerTest.kt | 151 +++ ...gregationPartitionSelectionCombinerTest.kt | 178 +++ .../core/PrivacyIdCombinerTest.kt | 125 ++ ...PrivatePartitionsComputationalGraphTest.kt | 357 +++++ .../core/PrivatePartitionsTest.kt | 264 ++++ .../PublicPartitionsComputationalGraphTest.kt | 345 +++++ .../pipelinedp4j/core/PublicPartitionsTest.kt | 107 ++ .../core/QuantilesCombinerTest.kt | 117 ++ .../SelectPartitionsComputationalGraphTest.kt | 130 ++ .../pipelinedp4j/core/SumCombinerTest.kt | 220 ++++ .../pipelinedp4j/core/TestDataTypes.kt | 34 + .../pipelinedp4j/core/VarianceCombinerTest.kt | 525 ++++++++ .../budget/AbsoluteBudgetPerOpSpecTest.kt | 68 + .../pipelinedp4j/core/budget/BUILD.bazel | 36 + .../pipelinedp4j/core/budget/BudgetTests.kt | 30 + .../core/budget/NaiveBudgetAccountantTest.kt | 272 ++++ .../budget/RelativeBudgetPerOpSpecTest.kt | 60 + .../core/budget/TotalBudgetTest.kt | 47 + .../pipelinedp4j/local/BUILD.bazel | 30 + .../pipelinedp4j/local/LocalCollectionTest.kt | 73 + .../pipelinedp4j/local/LocalFrameworkTests.kt | 25 + .../pipelinedp4j/local/LocalTableTest.kt | 237 ++++ 105 files changed, 15058 insertions(+) create mode 100644 examples/pipelinedp4j/.bazelrc create mode 100644 examples/pipelinedp4j/.bazelversion create mode 100644 examples/pipelinedp4j/BUILD.bazel create mode 100644 examples/pipelinedp4j/BeamExample.java create mode 100644 
examples/pipelinedp4j/MovieMetrics.java create mode 100644 examples/pipelinedp4j/MovieView.java create mode 100644 examples/pipelinedp4j/README.md create mode 100644 examples/pipelinedp4j/WORKSPACE.bazel create mode 100644 pipelinedp4j/BUILD.bazel create mode 100644 pipelinedp4j/README.md create mode 100644 pipelinedp4j/WORKSPACE.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/AggregationSpec.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQuery.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilder.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BudgetSpec.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/NoiseKind.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/PipelineDpCollection.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/Query.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryBuilder.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryPerGroupResult.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollection.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamDpEngineFactory.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncoders.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTable.kt create mode 
100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Combiners.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/ComputationalGraph.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/ContributionSampler.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/CoreTypes.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractors.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngine.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParams.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Encoders.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkCollection.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkTable.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySampler.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSampler.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSampler.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSampler.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitions.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitions.kt create mode 100644 
pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AllocatedBudget.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetAccountant.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetSpec.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/NoiseFactories.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/PreAggregationPartitionSelectionFactory.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollection.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalDpEngineFactory.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalEncoderFactory.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTable.kt create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/BUILD.bazel create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/accumulators.proto create mode 100644 pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/dpaggregates.proto create mode 100644 pipelinedp4j/pom.template create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/ApiTests.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel create mode 100644 
pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilderTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollectionTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncodersTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTableTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTests.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CompoundCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CoreTests.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CountCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractorsTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTestFactory.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParamsTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/EndToEndTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/ExactPrivacyIdCountCombinerTest.kt create mode 100644 
pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/MeanCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySamplerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSamplerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerWithoutValuesTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSamplerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PostAggregationPartitionSelectionCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivacyIdCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsComputationalGraphTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsComputationalGraphTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/QuantilesCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SelectPartitionsComputationalGraphTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SumCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/TestDataTypes.kt 
create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/VarianceCombinerTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AbsoluteBudgetPerOpSpecTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetTests.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/NaiveBudgetAccountantTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/RelativeBudgetPerOpSpecTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/TotalBudgetTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollectionTest.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalFrameworkTests.kt create mode 100644 pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTableTest.kt diff --git a/examples/pipelinedp4j/.bazelrc b/examples/pipelinedp4j/.bazelrc new file mode 100644 index 00000000..13615996 --- /dev/null +++ b/examples/pipelinedp4j/.bazelrc @@ -0,0 +1,18 @@ +# +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +build --java_language_version=11 +build --java_runtime_version=remotejdk_11 \ No newline at end of file diff --git a/examples/pipelinedp4j/.bazelversion b/examples/pipelinedp4j/.bazelversion new file mode 100644 index 00000000..34a8f745 --- /dev/null +++ b/examples/pipelinedp4j/.bazelversion @@ -0,0 +1 @@ +7.3.1 \ No newline at end of file diff --git a/examples/pipelinedp4j/BUILD.bazel b/examples/pipelinedp4j/BUILD.bazel new file mode 100644 index 00000000..191b4b34 --- /dev/null +++ b/examples/pipelinedp4j/BUILD.bazel @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +java_binary( + name = "BeamExample", + srcs = [ + "BeamExample.java", + "MovieMetrics.java", + "MovieView.java", + ], + main_class = "com.google.privacy.differentialprivacy.pipelinedp4j.examples.BeamExample", + runtime_deps = [ + "@maven//:org_apache_beam_beam_runners_direct_java", + ], + deps = [ + "@com_google_privacy_differentialprivacy_pipielinedp4j//main/com/google/privacy/differentialprivacy/pipelinedp4j/api", + "@maven//:com_google_guava_guava", + "@maven//:info_picocli_picocli", + "@maven//:org_apache_beam_beam_sdks_java_core", + "@maven//:org_apache_beam_beam_sdks_java_extensions_avro", + "@maven//:org_jetbrains_kotlin_kotlin_stdlib", + ], +) diff --git a/examples/pipelinedp4j/BeamExample.java b/examples/pipelinedp4j/BeamExample.java new file mode 100644 index 00000000..fee9e0ca --- /dev/null +++ b/examples/pipelinedp4j/BeamExample.java @@ -0,0 +1,205 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An end-to-end example how to compute DP metrics on a Netflix dataset using the library. + * + *

See README for details including how to run the example. + */ +package com.google.privacy.differentialprivacy.pipelinedp4j.examples; + +import static java.lang.Math.round; +import static java.util.stream.Collectors.toCollection; + +import com.google.privacy.differentialprivacy.pipelinedp4j.api.NoiseKind; +import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryBuilder; +import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryPerGroupResult; +import com.google.privacy.differentialprivacy.pipelinedp4j.api.TotalBudget; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.stream.IntStream; +import kotlin.jvm.functions.Function1; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import picocli.CommandLine; +import picocli.CommandLine.Command; +import picocli.CommandLine.Option; + +/** + * An end-to-end example how to compute DP metrics on a Netflix dataset using the library on Beam. + * + *

See README for details including how to run the example. + */ +@Command( + name = "BeamExample", + version = {"BeamExample 1.0"}, + mixinStandardHelpOptions = true) +public class BeamExample implements Runnable { + @Option( + names = "--use-public-groups", + description = + "If true we will assume in the example that movie ids are publicly known and are from " + + "4500 to 4509" + + ". Default is false, i.e. we will choose movie ids in a differentially" + + " private way.", + defaultValue = "false") + private boolean usePublicGroups = false; + + @Option( + names = "--local-input-file-path", + description = + "Input file. For using as input file you can download data from" + + " https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data. Use only part of" + + " it to speed up the calculations.", + required = true) + private String localInputFilePath; + + @Option( + names = "--local-output-file-path", + description = "Output file.", + defaultValue = "/tmp/anonymized_output.txt") + private String localOutputFilePath; + + public static void main(String[] args) { + int exitCode = new CommandLine(new BeamExample()).execute(args); + System.exit(exitCode); + } + + @Override + public void run() { + System.out.println("Starting calculations..."); + + var pipeline = initBeam(); + // Read the input data, these are movie views that contain movie id, user id and rating. + PCollection data = readData(pipeline); + + // Define the query + var query = + QueryBuilder.from(data, new UserIdExtractor()) + .groupBy( + /* groupKeyExtractor= */ new MovieIdExtractor(), + /* maxGroupsContributed= */ 3, + /* maxContributionsPerGroup= */ 1, + usePublicGroups ? publiclyKnownMovieIds(pipeline) : null) + .count(/* outputColumnName= */ "numberOfViews") + .sum( + new RatingExtractor(), + /* minTotalValuePerPrivacyUnitInGroup= */ 1.0, + /* maxTotalValuePerPrivacyUnitInGroup= */ 5.0, + /* outputColumnName= */ "sumOfRatings", + /* budget= */ null) + .build(); + // Run the query with DP parameters. 
+ PCollection result = + query.run(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE); + + // Convert the result to better representation, i.e. to MovieMetrics. + var movieMetricsCoder = AvroCoder.of(MovieMetrics.class); + SerializableFunction mapToMovieMetricsFn = + perGroupResult -> { + String movieId = perGroupResult.getGroupKey(); + long numberOfViews = round(perGroupResult.getAggregationResults().get("numberOfViews")); + long sumOfRatings = round(perGroupResult.getAggregationResults().get("sumOfRatings")); + return new MovieMetrics(movieId, numberOfViews, sumOfRatings); + }; + // We now have our anonymized metrics of movie views. + PCollection anonymizedMovieMetrics = + result + .apply( + "Map query result to MovieMetrics", + MapElements.into(movieMetricsCoder.getEncodedTypeDescriptor()) + .via(mapToMovieMetricsFn)) + .setCoder(movieMetricsCoder); + + // Save the result to a file. + writeOutput(anonymizedMovieMetrics); + + // Run the scheduled calculations in the pipeline. + pipeline.run().waitUntilFinish(); + System.out.println("Finished calculations."); + } + + // Data extractors. They always have to implement Function1 and Serializable interfaces. If it + // doesn't implement Serializable interface, it will fail on Beam. If it doesn't implement + // Function1, it will fail at compile time due to a type mismatch. Do not use lambdas for data + // extractors as they won't be serializable. 
+ static class UserIdExtractor implements Function1, Serializable { + @Override + public String invoke(MovieView movieView) { + return movieView.getUserId(); + } + } + + static class MovieIdExtractor implements Function1, Serializable { + @Override + public String invoke(MovieView movieView) { + return movieView.getMovieId(); + } + } + + static class RatingExtractor implements Function1, Serializable { + @Override + public Double invoke(MovieView movieView) { + return movieView.getRating(); + } + } + + private static Pipeline initBeam() { + var options = PipelineOptionsFactory.create(); + return Pipeline.create(options); + } + + private PCollection readData(Pipeline pipeline) { + PCollection inputPCollection = + pipeline.apply("Read input", TextIO.read().from(localInputFilePath)); + var coder = AvroCoder.of(MovieView.class); + SerializableFunction parseFunction = MovieView::parseView; + return inputPCollection + .apply("Parse input", MapElements.into(coder.getEncodedTypeDescriptor()).via(parseFunction)) + .setCoder(coder); + } + + /** + * Movie ids (which are group keys for this dataset) are integers from 1 to ~17000. Set public + * groups 4500-4509. 
+ */ + private static PCollection publiclyKnownMovieIds(Pipeline pipeline) { + var publicGroupsAsJavaList = + IntStream.rangeClosed( + 4500, 4509 + ) + .mapToObj(Integer::toString) + .collect(toCollection(ArrayList::new)); + return pipeline.apply("Create public groups", Create.of(publicGroupsAsJavaList)); + } + + private void writeOutput(PCollection result) { + SerializableFunction toStringFunction = MovieMetrics::toString; + var lines = + result.apply( + "Map MovieMetrics to string", + MapElements.into(StringUtf8Coder.of().getEncodedTypeDescriptor()) + .via(toStringFunction)); + lines.apply("Write output to file", TextIO.write().withoutSharding().to(localOutputFilePath)); + } +} diff --git a/examples/pipelinedp4j/MovieMetrics.java b/examples/pipelinedp4j/MovieMetrics.java new file mode 100644 index 00000000..caad57d0 --- /dev/null +++ b/examples/pipelinedp4j/MovieMetrics.java @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.examples; + +/** + * Metrics for a single movie in the Netflix dataset. + * + *

It is the result of the DP metrics query. + */ +final class MovieMetrics { + private final String movieId; + private final long numberOfViews; + private final long sumOfRatings; + + MovieMetrics(String movieId, long numberOfViews, long sumOfRatings) { + this.movieId = movieId; + this.numberOfViews = numberOfViews; + this.sumOfRatings = sumOfRatings; + } + + // 0-arg constructor is necessary for serialization to work. + private MovieMetrics() { + this("", 0, 0); + } + + @Override + public String toString() { + return String.format( + "movieId=%s, numberOfViews=%s, sumOfRatings=%s", movieId, numberOfViews, sumOfRatings); + } +} diff --git a/examples/pipelinedp4j/MovieView.java b/examples/pipelinedp4j/MovieView.java new file mode 100644 index 00000000..f5117e8d --- /dev/null +++ b/examples/pipelinedp4j/MovieView.java @@ -0,0 +1,55 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.examples; + +import com.google.common.base.Splitter; +import java.util.List; + +/** Represents a single movie view from the Netflix dataset. */ +final class MovieView { + private final String userId; + private final String movieId; + private final Double rating; + + MovieView(String userId, String movieId, Double rating) { + this.userId = userId; + this.movieId = movieId; + this.rating = rating; + } + + // 0-arg constructor is necessary for serialization to work. 
+ private MovieView() { + this("", "", 0.0); + } + + String getUserId() { + return userId; + } + + String getMovieId() { + return movieId; + } + + Double getRating() { + return rating; + } + + static MovieView parseView(String s) { + List spl = Splitter.on(',').splitToList(s); + return new MovieView(spl.get(1), spl.get(0), Double.parseDouble(spl.get(2))); + } +} diff --git a/examples/pipelinedp4j/README.md b/examples/pipelinedp4j/README.md new file mode 100644 index 00000000..06237b8b --- /dev/null +++ b/examples/pipelinedp4j/README.md @@ -0,0 +1,63 @@ +This example demonstrates how to compute differentially private statistics on a +[Netflix dataset](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data). +To speed up calculations, we'll use a smaller sample of the full dataset. + +The example code expects a CSV file in the following format: `movie_id`, +`user_id`, `rating`, `date`. + +Using this data, the library computes these statistics: + +* Number of views of a certain movie (`count` metric) +* Sum of all ratings of a certain movie (`sum` metric) +* Number of users who watched a certain movie (`privacy_id_count` metric) +* Average rating of a certain movie (`mean` metric) + +The output is a TXT file in this format: + +``` +movieId={movie_id}, numberOfViews={number_of_views}, sumOfRatings={sum_of_ratings} +``` + +The entries are written in no particular order. The counts and sums are rounded +to the nearest integers before they are written. + +Here are the steps to run the example: + +1. Go to the example directory: + + ```shell + cd examples/pipelinedp4j + ``` + +1. Download the + [Netflix Prize data](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data) + and extract `combined_data_2.txt` into the `examples/pipelinedp4j` directory. + +1. 
Create a sample dataset: + + ```shell + awk -v OFS=',' '/^[0-9]+:$/ {movie_id=substr($1, 1, length($1)-1)} /^[0-9]+,[0-9]+/ {print movie_id, $0}' combined_data_2.txt | \ + head -n 10000 > netflix_data.csv + ``` + + This command takes the first 10,000 lines from `combined_data_2.txt`, + reformats them into the expected format, and saves them in + `netflix_data.csv`. + +1. Build the program: + + ```shell + bazel build ... + ``` + +1. Run the program: + + ```shell + bazel-bin/BeamExample --local-input-file-path="./netflix_data.csv" --local-output-file-path="./output.txt" + ``` + +1. View the results: + + ```shell + cat output.txt + ``` diff --git a/examples/pipelinedp4j/WORKSPACE.bazel b/examples/pipelinedp4j/WORKSPACE.bazel new file mode 100644 index 00000000..1ef1c9d6 --- /dev/null +++ b/examples/pipelinedp4j/WORKSPACE.bazel @@ -0,0 +1,91 @@ +workspace(name = "examples_kotlin") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +# JVM +http_archive( + name = "rules_jvm_external", + sha256 = "a4ccfc3cb86448f0c8f1d2d6ba59ea34f771f854e7d16e38ef529a31a739e250", + strip_prefix = "rules_jvm_external-6.4", + url = "https://github.com/bazelbuild/rules_jvm_external/archive/6.4.tar.gz", +) + +load("@rules_jvm_external//:repositories.bzl", "rules_jvm_external_deps") + +rules_jvm_external_deps() + +load("@rules_jvm_external//:setup.bzl", "rules_jvm_external_setup") + +rules_jvm_external_setup() + +load("@rules_jvm_external//:defs.bzl", "maven_install") + +# Kotlin +http_archive( + name = "rules_kotlin", + sha256 = "3b772976fec7bdcda1d84b9d39b176589424c047eb2175bed09aac630e50af43", + url = "https://github.com/bazelbuild/rules_kotlin/releases/download/v1.9.6/rules_kotlin-v1.9.6.tar.gz", +) + +load("@rules_kotlin//kotlin:repositories.bzl", "kotlin_repositories") + +kotlin_repositories() + +load("@rules_kotlin//kotlin:core.bzl", "kt_register_toolchains") + +kt_register_toolchains() + +# Kotlin gRPC +http_archive( + name = "com_github_grpc_grpc_kotlin", + 
repo_mapping = {"@io_bazel_rules_kotlin": "@rules_kotlin"}, + strip_prefix = "grpc-kotlin-1.4.2", + url = "https://github.com/grpc/grpc-kotlin/archive/refs/tags/v1.4.2.tar.gz", +) + +# Repositories +load( + "@com_github_grpc_grpc_kotlin//:repositories.bzl", + "grpc_kt_repositories", + "io_grpc_grpc_java", +) + +io_grpc_grpc_java() + +load( + "@io_grpc_grpc_java//:repositories.bzl", + "grpc_java_repositories", +) + +# Maven +maven_install( + artifacts = [ + "com.google.privacy.differentialprivacy:differentialprivacy:3.0.0", + "com.google.protobuf:protobuf-kotlin:3.18.0", + "info.picocli:picocli:4.7.6", + "org.apache.beam:beam-runners-direct-java:2.49.0", + "org.apache.beam:beam-sdks-java-core:2.49.0", + "org.apache.beam:beam-sdks-java-extensions-avro:2.49.0", + "org.apache.beam:beam-sdks-java-extensions-protobuf:2.49.0", + ], + repositories = [ + "https://maven.google.com", + "https://repo.maven.apache.org/maven2/", + ], +) + +# gRPC +grpc_kt_repositories() + +grpc_java_repositories() + +# Protocol Buffers +load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") + +protobuf_deps() + +# Kotlin DP library repository +local_repository( + name = "com_google_privacy_differentialprivacy_pipielinedp4j", + path = "../../pipelinedp4j", +) diff --git a/pipelinedp4j/BUILD.bazel b/pipelinedp4j/BUILD.bazel new file mode 100644 index 00000000..73dc06c5 --- /dev/null +++ b/pipelinedp4j/BUILD.bazel @@ -0,0 +1,49 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +load("@bazel_common//tools/maven:pom_file.bzl", "pom_file") +load("@rules_jvm_external//:kt_defs.bzl", "kt_jvm_export") +load("@rules_kotlin//kotlin:core.bzl", "kt_kotlinc_options") + +package( + default_visibility = [ + # TODO: make visibility as strict as possible. + "//visibility:public", + ], +) + +kt_kotlinc_options( + name = "kotlinc_options_for_parameterized_tests", + java_parameters = True, + jvm_target = "11", +) + +# Update the following version for packaging of a new release. +_RELEASE_VERSION = "0.0.1" + +pom_file( + name = "export_pom", + substitutions = {"RELEASE_VERSION": _RELEASE_VERSION}, + targets = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/api", + ], + template_file = "pom.template", +) + +kt_jvm_export( + name = "export", + srcs = glob(["*.kt"]), + maven_coordinates = "com.google.privacy.differentialprivacy.pipelinedp4j:pipelinedp4j:%s" % _RELEASE_VERSION, + pom_template = ":export_pom", +) diff --git a/pipelinedp4j/README.md b/pipelinedp4j/README.md new file mode 100644 index 00000000..4664fee4 --- /dev/null +++ b/pipelinedp4j/README.md @@ -0,0 +1,34 @@ + + +# PipelineDP4j + +PipelineDP4j is an end-to-end differential privacy solution for JVM that supports various frameworks for distributed data processing such as [Apache Spark](https://spark.apache.org/) and +[Apache Beam](https://beam.apache.org/documentation/). +It is intended to be usable by all developers, regardless of their differential +privacy expertise. + +Internally, PipelineDP4j relies on the lower-level building blocks from the +differential privacy library and combines them into an "out-of-the-box" solution +that takes care of all the steps that are essential to differential privacy, +including noise addition, [partition selection](https://arxiv.org/abs/2006.03684), +and contribution bounding. 
Thus, rather than using the lower-level differential +privacy library, it is recommended to use PipelineDP4j, as it can reduce +implementation mistakes. + +PipelineDP4j can be used on any JVM using any JVM compatible language like Kotlin, Scala or Java. + +## How to Use + + +Our [codelab](https://codelabs.developers.google.com/codelabs/pipelinedp4j/) +about computing private statistics with PipelineDP4j +demonstrates how to use the library. Source code for the codelab is available in +the [codelab/](codelab) +directory. + + +Full documentation of the API is available as [kdoc](). + +## Using with Bazel + \ No newline at end of file diff --git a/pipelinedp4j/WORKSPACE.bazel b/pipelinedp4j/WORKSPACE.bazel new file mode 100644 index 00000000..774d10e9 --- /dev/null +++ b/pipelinedp4j/WORKSPACE.bazel @@ -0,0 +1,152 @@ +workspace(name = "com_google_java_pipeline_dp") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +# JVM +http_archive( + name = "rules_jvm_external", + sha256 = "a4ccfc3cb86448f0c8f1d2d6ba59ea34f771f854e7d16e38ef529a31a739e250", + strip_prefix = "rules_jvm_external-6.4", + url = "https://github.com/bazelbuild/rules_jvm_external/archive/6.4.tar.gz", +) + +http_archive( + name = "bazel_common", + sha256 = "ba4700fe928da1574726c9228e8a321907cd166040aed7fbafd0468f13a08fdb", + strip_prefix = "bazel-common-ebce2af3f0de560b649dcf98ef732a56b80e829c", + url = "https://github.com/google/bazel-common/archive/ebce2af3f0de560b649dcf98ef732a56b80e829c.tar.gz", +) + +load("@rules_jvm_external//:repositories.bzl", "rules_jvm_external_deps") + +rules_jvm_external_deps() + +load("@rules_jvm_external//:setup.bzl", "rules_jvm_external_setup") + +rules_jvm_external_setup() + +load("@rules_jvm_external//:defs.bzl", "maven_install") +load("@rules_jvm_external//:specs.bzl", "maven") + +# Kotlin +http_archive( + name = "rules_kotlin", + sha256 = "3b772976fec7bdcda1d84b9d39b176589424c047eb2175bed09aac630e50af43", + url = 
"https://github.com/bazelbuild/rules_kotlin/releases/download/v1.9.6/rules_kotlin-v1.9.6.tar.gz", +) + +load("@rules_kotlin//kotlin:repositories.bzl", "kotlin_repositories") + +kotlin_repositories() + +load("@rules_kotlin//kotlin:core.bzl", "kt_register_toolchains") + +kt_register_toolchains() + +# Kotlin gRPC +http_archive( + name = "com_github_grpc_grpc_kotlin", + repo_mapping = {"@io_bazel_rules_kotlin": "@rules_kotlin"}, + strip_prefix = "grpc-kotlin-1.4.2", + url = "https://github.com/grpc/grpc-kotlin/archive/refs/tags/v1.4.2.tar.gz", +) + +# Repositories +load( + "@com_github_grpc_grpc_kotlin//:repositories.bzl", + "grpc_kt_repositories", + "io_grpc_grpc_java", +) + +io_grpc_grpc_java() + +load( + "@io_grpc_grpc_java//:repositories.bzl", + "grpc_java_repositories", +) + +# Maven +maven_install( + artifacts = [ + "com.google.privacy.differentialprivacy:differentialprivacy:3.0.0", + "com.google.protobuf:protobuf-kotlin:3.18.0", + "org.apache.beam:beam-sdks-java-core:2.49.0", + "org.apache.beam:beam-sdks-java-extensions-avro:2.49.0", + "org.apache.beam:beam-sdks-java-extensions-protobuf:2.49.0", + # Test only dependencies. 
+ maven.artifact( + "com.google.truth", + "truth", + "1.0.1", + testonly = True, + ), + maven.artifact( + "com.google.truth.extensions", + "truth-proto-extension", + "1.0.1", + testonly = True, + ), + maven.artifact( + "com.google.truth.extensions", + "truth-liteproto-extension", + "1.0.1", + testonly = True, + ), + maven.artifact( + "junit", + "junit", + "4.13.2", + testonly = True, + ), + maven.artifact( + "com.google.testparameterinjector", + "test-parameter-injector", + "1.18", + testonly = True, + ), + maven.artifact( + "org.jetbrains.kotlin", + "kotlin-test", + "2.0.21", + testonly = True, + ), + maven.artifact( + "org.mockito", + "mockito-core", + "5.14.2", + testonly = True, + ), + maven.artifact( + "org.mockito.kotlin", + "mockito-kotlin", + "5.4.0", + testonly = True, + ), + maven.artifact( + "org.apache.beam", + "beam-runners-direct-java", + "2.49.0", + testonly = True, + ), + maven.artifact( + "org.hamcrest", + "hamcrest", + "3.0", + testonly = True, + ), + ], + repositories = [ + "https://maven.google.com", + "https://repo.maven.apache.org/maven2/", + ], +) + +# gRPC +grpc_kt_repositories() + +grpc_java_repositories() + +# Protocol Buffers +load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") + +protobuf_deps() diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/AggregationSpec.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/AggregationSpec.kt new file mode 100644 index 00000000..3cceefbc --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/AggregationSpec.kt @@ -0,0 +1,233 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.common.collect.ImmutableList +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricDefinition +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetPerOpSpec + +/** + * An internal interface to specify an aggregation. + * + * The fields of the sealed class are the properties that any aggregation must have: in which column + * to write the result, how much budget we can use, aggregation type, and how to extract the + * aggregated value from the input. + * + * @param T the type of the elements in the collection. + */ +sealed class AggregationSpec +private constructor( + internal val outputColumnName: String, + internal val budget: BudgetPerOpSpec?, + metricType: MetricType, + internal val valueExtractor: ((T) -> Double)? = null, +) { + internal val metricDefinition = MetricDefinition(metricType, budget) + + /** + * A privacy id count aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. + */ + internal class PrivacyIdCount(outputColumnName: String, budget: BudgetPerOpSpec?) : + AggregationSpec( + outputColumnName, + budget, + MetricType.PRIVACY_ID_COUNT, + valueExtractor = null, + ) {} + + /** + * A count aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. 
+ */ + internal class Count(outputColumnName: String, budget: BudgetPerOpSpec?) : + AggregationSpec(outputColumnName, budget, MetricType.COUNT, valueExtractor = null) {} + + /** + * A sum aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minTotalValue the minimum value of the sum. + * @param maxTotalValue the maximum value of the sum. + */ + internal class Sum( + outputColumnName: String, + budget: BudgetPerOpSpec?, + valueExtractor: (T) -> Double, + internal val minTotalValue: Double?, + internal val maxTotalValue: Double?, + ) : AggregationSpec(outputColumnName, budget, MetricType.SUM, valueExtractor) {} + + /** + * A mean aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minValue the smallest possible value that a privacy unit can contribute. + * @param maxValue the largest possible value that a privacy unit can contribute. + */ + internal class Mean( + outputColumnName: String, + budget: BudgetPerOpSpec?, + valueExtractor: (T) -> Double, + internal val minValue: Double, + internal val maxValue: Double, + ) : AggregationSpec(outputColumnName, budget, MetricType.MEAN, valueExtractor) {} + + /** + * A variance aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minValue the smallest possible value that a privacy unit can contribute. + * @param maxValue the largest possible value that a privacy unit can contribute. 
+ */ + internal class Variance( + outputColumnName: String, + budget: BudgetPerOpSpec?, + valueExtractor: (T) -> Double, + internal val minValue: Double, + internal val maxValue: Double, + ) : AggregationSpec(outputColumnName, budget, MetricType.VARIANCE, valueExtractor) {} + + /** + * A quantiles aggregation. + * + * @param outputColumnName the name of the column to write the result to. + * @param budget the budget to use for the aggregation. + * @param valueExtractor a function to extract the aggregated value from the input. + * @param ranks the ranks of the quantiles to compute. + * @param minValue the smallest possible value that a privacy unit can contribute. + * @param maxValue the largest possible value that a privacy unit can contribute. + */ + internal class Quantiles( + outputColumnName: String, + budget: BudgetPerOpSpec?, + valueExtractor: (T) -> Double, + ranks: List, + internal val minValue: Double, + internal val maxValue: Double, + ) : + AggregationSpec( + outputColumnName, + budget, + MetricType.QUANTILES(ImmutableList.copyOf(ranks)), + valueExtractor, + ) {} +} + +internal fun List>.minTotalValue(): Double? { + val values = + mapNotNull { + when (it) { + is AggregationSpec.PrivacyIdCount<*> -> null + is AggregationSpec.Count<*> -> null + is AggregationSpec.Sum<*> -> it.minTotalValue + is AggregationSpec.Mean<*> -> null + is AggregationSpec.Variance<*> -> null + is AggregationSpec.Quantiles<*> -> null + } + } + .toSet() + require(values.size <= 1) { + "Different minTotalValues: ${values}. minTotalValue can be specified only once because for now only aggregations of the same value are supported." + } + return values.singleOrNull() +} + +internal fun List>.maxTotalValue(): Double? 
{ + val values = + mapNotNull { + when (it) { + is AggregationSpec.PrivacyIdCount<*> -> null + is AggregationSpec.Count<*> -> null + is AggregationSpec.Sum<*> -> it.maxTotalValue + is AggregationSpec.Mean<*> -> null + is AggregationSpec.Variance<*> -> null + is AggregationSpec.Quantiles<*> -> null + } + } + .toSet() + require(values.size <= 1) { + "Different maxTotalValues: ${values}. maxTotalValue can be specified only once because for now only aggregations of the same value are supported." + } + return values.singleOrNull() +} + +internal fun List>.minValue(): Double? { + val values = + mapNotNull { + when (it) { + is AggregationSpec.PrivacyIdCount<*> -> null + is AggregationSpec.Count<*> -> null + is AggregationSpec.Sum<*> -> null + is AggregationSpec.Mean<*> -> it.minValue + is AggregationSpec.Variance<*> -> it.minValue + is AggregationSpec.Quantiles<*> -> it.minValue + } + } + .toSet() + require(values.size <= 1) { + "Different minValues: ${values}. Only aggregations of the same value are supported for now and they must have the same bounds including minValue." + } + return values.singleOrNull() +} + +internal fun List>.maxValue(): Double? { + val values = + mapNotNull { + when (it) { + is AggregationSpec.PrivacyIdCount<*> -> null + is AggregationSpec.Count<*> -> null + is AggregationSpec.Sum<*> -> null + is AggregationSpec.Mean<*> -> it.maxValue + is AggregationSpec.Variance<*> -> it.maxValue + is AggregationSpec.Quantiles<*> -> it.maxValue + } + } + .toSet() + require(values.size <= 1) { + "Different maxValues: ${values}. Only aggregations of the same value are supported for now and they must have the same bounds including maxValue." 
+ } + return values.singleOrNull() +} + +internal fun List>.outputColumnNamesWithMetricTypes() = map { + it.outputColumnName to it.metricDefinition.type +} + +internal fun List>.valueExtractors() = + mapNotNull { + when (it) { + is AggregationSpec.PrivacyIdCount<*> -> null + is AggregationSpec.Count<*> -> null + is AggregationSpec.Sum<*> -> it.valueExtractor + is AggregationSpec.Mean<*> -> it.valueExtractor + is AggregationSpec.Variance<*> -> it.valueExtractor + is AggregationSpec.Quantiles<*> -> it.valueExtractor + } + } + .toSet() diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel new file mode 100644 index 00000000..c1b7e791 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "api", + srcs = glob(["*.kt"]), + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:data_extractors", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_engine", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_functions_params", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:framework_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_kt_proto", + "@maven//:com_google_errorprone_error_prone_annotations", + "@maven//:com_google_guava_guava", + "@maven//:org_apache_beam_beam_sdks_java_core", + "@maven//:org_apache_beam_beam_sdks_java_extensions_avro", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQuery.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQuery.kt new file mode 100644 index 00000000..0b3d5e2c --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQuery.kt @@ -0,0 +1,96 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.privacy.differentialprivacy.pipelinedp4j.beam.BeamTable +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder +import org.apache.beam.sdk.transforms.MapElements +import org.apache.beam.sdk.transforms.SerializableFunction +import org.apache.beam.sdk.values.KV +import org.apache.beam.sdk.values.PCollection as BeamPCollection + +/** + * A differentially-private query to run on Beam. + * + * @param T the type of the elements in the collection. + */ +class BeamQuery +internal constructor( + data: PipelineDpCollection, + privacyIdExtractor: (T) -> String, + groupKeyExtractor: (T) -> String, + maxGroupsContributed: Int, + maxContributionsPerGroup: Int, + publicKeys: PipelineDpCollection?, + aggregations: List>, +) : + Query>( + data, + privacyIdExtractor, + groupKeyExtractor, + maxGroupsContributed, + maxContributionsPerGroup, + publicKeys, + aggregations, + ) { + /** + * Runs the query with the given total budget and noise kind. + * + * @param budget the budget to use for the query. + * @param noiseKind the noise kind to use for the query. + * @return the result of the query. 
+ */ + override fun run( + budget: TotalBudget, + noiseKind: NoiseKind, + ): BeamPCollection { + val result = (runWithDpEngine(budget, noiseKind) as BeamTable).data + val outputColumnNamesWithMetricTypes = aggregations.outputColumnNamesWithMetricTypes() + val coder = AvroCoder.of(QueryPerGroupResult::class.java) + val mapToResultFn = { kv: KV -> + val key = kv.key + val dpAggregates = kv.value + + val aggregationsMap = + buildMap { + for ((outputColumnName, metricType) in outputColumnNamesWithMetricTypes) { + when (metricType) { + MetricType.PRIVACY_ID_COUNT -> put(outputColumnName, dpAggregates.privacyIdCount) + MetricType.COUNT -> put(outputColumnName, dpAggregates.count) + MetricType.SUM -> put(outputColumnName, dpAggregates.sum) + MetricType.MEAN -> put(outputColumnName, dpAggregates.mean) + MetricType.VARIANCE -> put(outputColumnName, dpAggregates.variance) + is MetricType.QUANTILES -> { + // TODO: consider creating a data class or reusing copy of DpAggregates + // proto and not allowing outputColumnName. + for ((rank, value) in metricType.ranks.zip(dpAggregates.quantilesList)) { + put("${outputColumnName}_${rank}", value) + } + } + } + } + } + + QueryPerGroupResult(key, aggregationsMap) + } + return result + .apply(MapElements.into(coder.encodedTypeDescriptor).via(SerializableFunction(mapToResultFn))) + .setCoder(coder) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilder.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilder.kt new file mode 100644 index 00000000..d029cc98 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilder.kt @@ -0,0 +1,67 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import org.apache.beam.sdk.values.PCollection as BeamPCollection + +/** + * A builder for queries on Beam. + * + * @param T the type of the elements in the collection. + */ +class BeamQueryBuilder +internal constructor(data: BeamPCollection, privacyIdExtractor: (T) -> String) : + QueryBuilder>( + BeamPipelineDpCollection(data), + privacyIdExtractor = privacyIdExtractor, + ) { + /** + * Groups the data by keys (corresponds to groupBy operation in SQL). + * + * @param groupKeyExtractor a function to extract the group key from the input. + * @param maxGroupsContributed the maximum number of groups that a single privacy unit can + * contribute to. + * @param maxContributionsPerGroup the maximum number of contributions that a single privacy unit + * can make to a single group. + * @param publicGroups a collection of publicly known keys. Read more about public groups in the + * documentation to the library. + */ + @JvmOverloads + fun groupBy( + groupKeyExtractor: (T) -> String, + maxGroupsContributed: Int, + maxContributionsPerGroup: Int, + publicGroups: BeamPCollection? 
= null, + ) = + groupBy( + groupKeyExtractor, + maxGroupsContributed, + maxContributionsPerGroup, + publicGroups?.let { BeamPipelineDpCollection(it) }, + ) + + override fun build() = + BeamQuery( + data, + privacyIdExtractor, + groupKeyExtractor, + maxGroupsContributed, + maxContributionsPerGroup, + publicGroups, + aggregations, + ) +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BudgetSpec.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BudgetSpec.kt new file mode 100644 index 00000000..67bb8292 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/BudgetSpec.kt @@ -0,0 +1,150 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.errorprone.annotations.Immutable +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec as InternalAbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetPerOpSpec as InternalBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.RelativeBudgetPerOpSpec as InternalRelativeBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.TotalBudget as InternalTotalBudget +import java.io.Serializable + +/** + * Represents the budget allocated for anonymizing a metric or group selection. + * + * This is a sealed interface with two implementations: [AbsoluteBudgetPerOpSpec] for absolute + * budget values and [RelativeBudgetPerOpSpec] for relative weights. + */ +@Immutable +sealed interface BudgetPerOpSpec { + /** + * Multiplies this budget specification by a given factor. + * + * @param factor the factor to multiply the budget by. + * @return a new budget specification with the multiplied values or weights. + */ + operator fun times(factor: Double): BudgetPerOpSpec +} + +/** + * Represents an absolute budget (epsilon and delta) for anonymizing a metric or group selection. + * + * @property epsilon the epsilon (ε) privacy budget value. Must be non-negative. + * @property delta the delta (δ) privacy budget value. Must be non-negative. + */ +@Immutable +data class AbsoluteBudgetPerOpSpec(val epsilon: Double, val delta: Double) : + BudgetPerOpSpec, Serializable { + init { + BudgetValidationUtils.validateEpsilon(epsilon) + BudgetValidationUtils.validateDelta(delta) + } + + override fun times(factor: Double) = AbsoluteBudgetPerOpSpec(factor * epsilon, factor * delta) +} + +/** + * Represents a relative weight for anonymizing a metric or group selection. 
+ * + * The weight is relative to the weights of other metrics computed by the same query. + * + * @property weight the relative weight. Must be strictly positive. + */ +@Immutable +data class RelativeBudgetPerOpSpec(val weight: Double) : BudgetPerOpSpec, Serializable { + init { + BudgetValidationUtils.validateWeight(weight) + } + + override fun times(factor: Double) = RelativeBudgetPerOpSpec(factor * weight) +} + +/** + * The total amount of budget allowed to be used in a single query for accounting both relative and + * absolute operation costs. + * + * @property epsilon the total epsilon (ε) privacy budget value. Must be non-negative. + * @property delta the total delta (δ) privacy budget value. Must be non-negative. Defaults to 0.0. + */ +@Immutable +data class TotalBudget @JvmOverloads constructor(val epsilon: Double, val delta: Double = 0.0) : + Serializable { + init { + BudgetValidationUtils.validateEpsilon(epsilon) + BudgetValidationUtils.validateDelta(delta) + } +} + +/** + * Converts the [BudgetPerOpSpec] to the [InternalBudgetPerOpSpec]. + * + * We deliberately do not expose the internal classes in the public API to limit the surface of the + * API. This will give us more flexibility to change the implementation. + */ +internal fun BudgetPerOpSpec.toInternalBudgetPerOpSpec(): InternalBudgetPerOpSpec = + when (this) { + is AbsoluteBudgetPerOpSpec -> toInternalAbsoluteBudgetPerOpSpec(this) + is RelativeBudgetPerOpSpec -> toInternalRelativeBudgetPerOpSpec(this) + } + +private fun toInternalAbsoluteBudgetPerOpSpec(spec: AbsoluteBudgetPerOpSpec) = + InternalAbsoluteBudgetPerOpSpec(spec.epsilon, spec.delta) + +private fun toInternalRelativeBudgetPerOpSpec(spec: RelativeBudgetPerOpSpec) = + InternalRelativeBudgetPerOpSpec(spec.weight) + +/** + * Converts the [TotalBudget] to the [InternalTotalBudget]. + * + * We deliberately do not expose the internal classes in the public API to limit the surface of the + * API. 
This will give us more flexibility to change the implementation. + */ +internal fun TotalBudget.toInternalTotalBudget() = InternalTotalBudget(epsilon, delta) + +/** Utility object for validating budget parameters. */ +private object BudgetValidationUtils { + + /** + * Validates that epsilon is non-negative. + * + * @param epsilon the epsilon value to validate. + * @throws IllegalArgumentException if epsilon is negative. + */ + fun validateEpsilon(epsilon: Double) { + require(epsilon >= 0.0) { "Epsilon must be >= 0.0. Provided epsilon: $epsilon." } + } + + /** + * Validates that delta is non-negative. + * + * @param delta the delta value to validate. + * @throws IllegalArgumentException if delta is negative. + */ + fun validateDelta(delta: Double) { + require(delta >= 0.0) { "Delta must be >= 0.0. Provided delta: $delta." } + } + + /** + * Validates that a weight is strictly positive. + * + * @param weight the weight value to validate. + * @throws IllegalArgumentException if weight is not strictly positive. + */ + fun validateWeight(weight: Double) { + require(weight > 0.0) { "Weight must be > 0. Provided weight: $weight." } + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/NoiseKind.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/NoiseKind.kt new file mode 100644 index 00000000..200dda85 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/NoiseKind.kt @@ -0,0 +1,37 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind as InternalNoiseKind + +/** The kind of noise that can be applied to the data. */ +enum class NoiseKind { + LAPLACE, + GAUSSIAN, +} + +/** + * Converts the [NoiseKind] to the [InternalNoiseKind]. + * + * We deliberately do not expose the internal classes in the public API to limit the surface of the + * API. This will give us more flexibility to change the implementation. + */ +internal fun NoiseKind.toInternalNoiseKind() = + when (this) { + NoiseKind.LAPLACE -> InternalNoiseKind.LAPLACE + NoiseKind.GAUSSIAN -> InternalNoiseKind.GAUSSIAN + } diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/PipelineDpCollection.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/PipelineDpCollection.kt new file mode 100644 index 00000000..c79e7238 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/PipelineDpCollection.kt @@ -0,0 +1,54 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.privacy.differentialprivacy.pipelinedp4j.beam.BeamCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.beam.BeamEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.core.EncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import org.apache.beam.sdk.values.PCollection as BeamPCollection + +/** + * An internal interface to represent an arbitrary collection that is supported by PipelineDP4j. + * + * This interface is used to represent collections in a generic way, so that we can use the same + * code for different collection types. Essentially this is just a helper interface. + * + * @param T the type of the elements in the collection. + */ +sealed interface PipelineDpCollection { + val encoderFactory: EncoderFactory + + fun toFrameworkCollection(): FrameworkCollection +} + +/** Beam PCollection. */ +internal data class BeamPipelineDpCollection(val data: BeamPCollection) : + PipelineDpCollection { + override val encoderFactory = BeamEncoderFactory() + + override fun toFrameworkCollection() = BeamCollection(data) +} + +/** Local collection represented as a Kotlin sequence. 
*/ +internal data class LocalPipelineDpCollection(val data: Sequence) : PipelineDpCollection { + override val encoderFactory = LocalEncoderFactory() + + override fun toFrameworkCollection() = LocalCollection(data) +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/Query.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/Query.kt new file mode 100644 index 00000000..05a4f9cb --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/Query.kt @@ -0,0 +1,122 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.common.collect.ImmutableList +import com.google.privacy.differentialprivacy.pipelinedp4j.core.AggregationParams +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DataExtractors +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngine +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngineBudgetSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkTable +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates + +/** + * A differentially-private query. + * + * You can get the instance of a query by calling [QueryBuilder.build]. + * + * @param T the type of the elements in the collection. + * @param R the type of the result. 
+ */ +sealed class Query +protected constructor( + private val data: PipelineDpCollection, + private val privacyIdExtractor: (T) -> String, + private val groupKeyExtractor: (T) -> String, + private val maxGroupsContributed: Int, + private val maxContributionsPerGroup: Int, + private val publicKeys: PipelineDpCollection?, + protected val aggregations: List>, +) { + init { + validate() + } + + /** + * Runs the query with the given total budget and noise kind. + * + * @param budget the budget to use for the query. + * @param noiseKind the noise kind to use for the query. + * @return the result of the query. + */ + abstract fun run(budget: TotalBudget, noiseKind: NoiseKind): R + + protected fun runWithDpEngine( + budget: TotalBudget, + noiseKind: NoiseKind, + ): FrameworkTable { + val dpEngine = + DpEngine.create(data.encoderFactory, DpEngineBudgetSpec(budget.toInternalTotalBudget())) + + val valueExtractor = aggregations.mapNotNull { it.valueExtractor }.toSet().singleOrNull() + val extractors = + if (valueExtractor != null) + DataExtractors.from( + privacyIdExtractor, + privacyIdEncoder = data.encoderFactory.strings(), + partitionKeyExtractor = groupKeyExtractor, + partitionKeyEncoder = data.encoderFactory.strings(), + valueExtractor = valueExtractor, + ) + else + DataExtractors.from( + privacyIdExtractor, + privacyIdEncoder = data.encoderFactory.strings(), + partitionKeyExtractor = groupKeyExtractor, + partitionKeyEncoder = data.encoderFactory.strings(), + ) + val aggregateParams = + AggregationParams( + metrics = ImmutableList.copyOf(aggregations.map { it.metricDefinition }), + noiseKind = noiseKind.toInternalNoiseKind(), + maxPartitionsContributed = maxGroupsContributed, + maxContributionsPerPartition = maxContributionsPerGroup, + minTotalValue = aggregations.minTotalValue(), + maxTotalValue = aggregations.maxTotalValue(), + minValue = aggregations.minValue(), + maxValue = aggregations.maxValue(), + ) + val result = + dpEngine.aggregate( + 
data.toFrameworkCollection(), + aggregateParams, + extractors, + publicKeys?.toFrameworkCollection(), + ) + dpEngine.done() + return result + } + + private fun validate() { + requireNoDuplicateAggregations() + requireOneValue() + } + + private fun requireNoDuplicateAggregations() { + val outputColumnNameCounts = + aggregations.map { it.outputColumnName }.groupingBy { it }.eachCount() + val duplicates = outputColumnNameCounts.filter { it.value > 1 }.keys + require(duplicates.isEmpty()) { + "There aggregations with duplicate output column names: ${duplicates}." + } + } + + private fun requireOneValue() = + require(aggregations.valueExtractors().size <= 1) { + "Aggregation of different values is not supported yet. Please aggregate only one value. If you provide value extractors then it has to be the same instance." + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryBuilder.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryBuilder.kt new file mode 100644 index 00000000..5fc44767 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryBuilder.kt @@ -0,0 +1,229 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import org.apache.beam.sdk.values.PCollection as BeamPCollection + +/** + * A builder for differentially-private queries. 
+ * + * To create a builder use [QueryBuilder.from] method. Then you should call [QueryBuilder.groupBy] + * to group the data by keys, and then call one or more aggregation functions to specify the + * differentially-private aggregations to perform on the data. Once you have specified all the + * aggregations you should call [QueryBuilder.build] to get the instance of a query that you can + * run. + * + * @param T the type of the elements in the collection. + * @param R the type of the result. + */ +sealed class QueryBuilder +protected constructor( + protected val data: PipelineDpCollection, + protected val privacyIdExtractor: ((T) -> String), +) { + protected lateinit var groupKeyExtractor: (T) -> String + protected var maxGroupsContributed: Int = -1 + protected var maxContributionsPerGroup: Int = -1 + protected var publicGroups: PipelineDpCollection? = null + protected val aggregations = mutableListOf>() + + protected fun groupBy( + groupKeyExtractor: (T) -> String, + maxGroupsContributed: Int, + maxContributionsPerGroup: Int, + publicGroups: PipelineDpCollection? = null, + ): QueryBuilder { + this.groupKeyExtractor = groupKeyExtractor + this.maxGroupsContributed = maxGroupsContributed + this.maxContributionsPerGroup = maxContributionsPerGroup + require( + publicGroups == null || + publicGroups is LocalPipelineDpCollection || + publicGroups::class == data::class + ) { + "Public keys must be either stored in a Sequence object or in the collection of the same type as the data is stored." + } + this.publicGroups = publicGroups + return this + } + + /** + * Schedule an aggregation to count distinct privacy units. + * + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to, if collection then it is the name of the output field. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun countDistinctPrivacyUnits( + outputColumnName: String, + budget: BudgetPerOpSpec? 
= null, + ): QueryBuilder { + aggregations.add( + AggregationSpec.PrivacyIdCount(outputColumnName, budget?.toInternalBudgetPerOpSpec()) + ) + return this + } + + /** + * Schedule a count aggregation. + * + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to, if collection then it is the name of the output field. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun count(outputColumnName: String, budget: BudgetPerOpSpec? = null): QueryBuilder { + aggregations.add(AggregationSpec.Count(outputColumnName, budget?.toInternalBudgetPerOpSpec())) + return this + } + + /** + * Schedule a sum aggregation. + * + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minTotalValuePerPrivacyUnitInGroup minimum across all groups of the sums of the privacy + * unit contributions to a group. Don't specify it if you also calculate either MEAN or VARIANCE + * because in this case this value is not used. + * @param maxTotalValuePerPrivacyUnitInGroup the maximum value of the same sum. Don't specify it + * if you also calculate either MEAN or VARIANCE because in this case this value is not used. + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to, if collection then it is the name of the output field. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun sum( + valueExtractor: (T) -> Double, + minTotalValuePerPrivacyUnitInGroup: Double? = null, + maxTotalValuePerPrivacyUnitInGroup: Double? = null, + outputColumnName: String, + budget: BudgetPerOpSpec? = null, + ): QueryBuilder { + aggregations.add( + AggregationSpec.Sum( + outputColumnName, + budget?.toInternalBudgetPerOpSpec(), + valueExtractor, + minTotalValuePerPrivacyUnitInGroup, + maxTotalValuePerPrivacyUnitInGroup, + ) + ) + return this + } + + /** + * Schedule a mean aggregation.
+ * + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minValue the minimum value that a privacy unit can contribute. + * @param maxValue the maximum value that a privacy unit can contribute. + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to, if collection then it is the name of the output field. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun mean( + valueExtractor: (T) -> Double, + minValue: Double, + maxValue: Double, + outputColumnName: String, + budget: BudgetPerOpSpec? = null, + ): QueryBuilder { + aggregations.add( + AggregationSpec.Mean( + outputColumnName, + budget?.toInternalBudgetPerOpSpec(), + valueExtractor, + minValue, + maxValue, + ) + ) + return this + } + + /** + * Schedule a variance aggregation. + * + * @param valueExtractor a function to extract the aggregated value from the input. + * @param minValue the minimum value that a privacy unit can contribute. + * @param maxValue the maximum value that a privacy unit can contribute. + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to, if collection then it is the name of the output field. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun variance( + valueExtractor: (T) -> Double, + minValue: Double, + maxValue: Double, + outputColumnName: String, + budget: BudgetPerOpSpec? = null, + ): QueryBuilder { + aggregations.add( + AggregationSpec.Variance( + outputColumnName, + budget?.toInternalBudgetPerOpSpec(), + valueExtractor, + minValue, + maxValue, + ) + ) + return this + } + + /** + * Schedule a quantiles aggregation. + * + * @param valueExtractor a function to extract the aggregated value from the input. + * @param ranks the ranks of the quantiles to compute. + * @param minValue the minimum value that a privacy unit can contribute. 
+ * @param maxValue the maximum value that a privacy unit can contribute. + * @param outputColumnName if output is dataframe then it is the name of the column to write the + * result to. If collection then there will be multiple output fields, one per rank, with names + * "outputColumnName_rank" where rank is the rank of the quantile. + * @param budget the budget to use for the aggregation. + */ + @JvmOverloads + fun quantiles( + valueExtractor: (T) -> Double, + ranks: List, + minValue: Double, + maxValue: Double, + outputColumnName: String, + budget: BudgetPerOpSpec? = null, + ): QueryBuilder { + aggregations.add( + AggregationSpec.Quantiles( + outputColumnName, + budget?.toInternalBudgetPerOpSpec(), + valueExtractor, + ranks, + minValue, + maxValue, + ) + ) + return this + } + + abstract fun build(): Query + + companion object { + @JvmStatic + fun from(data: BeamPCollection, privacyIdExtractor: (T) -> String) = + BeamQueryBuilder(data, privacyIdExtractor) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryPerGroupResult.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryPerGroupResult.kt new file mode 100644 index 00000000..09cfd7d9 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api/QueryPerGroupResult.kt @@ -0,0 +1,30 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +/** + * The result of a query for a single partition. + * + * @param groupKey is the partition key. + * @param aggregationResults is mapping from aggregation name to the result of the aggregation for + * this partition. For quantiles there will be multiple entries in the map, one per rank, with + * names "aggregationName_rank". + */ +data class QueryPerGroupResult(val groupKey: String, val aggregationResults: Map) { + // Necessary for Beam serialization. + private constructor() : this("", mapOf()) +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel new file mode 100644 index 00000000..4b30c68d --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel @@ -0,0 +1,57 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "beam_encoders", + srcs = ["BeamEncoders.kt"], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:org_apache_beam_beam_sdks_java_core", + "@maven//:org_apache_beam_beam_sdks_java_extensions_avro", + "@maven//:org_apache_beam_beam_sdks_java_extensions_protobuf", + ], +) + +kt_jvm_library( + name = "beam_collections", + srcs = [ + "BeamCollection.kt", + "BeamTable.kt", + ], + deps = [ + ":beam_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:framework_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections", + "@maven//:org_apache_beam_beam_sdks_java_core", + ], +) + +kt_jvm_library( + name = "beam_dp_engine_factory", + srcs = ["BeamDpEngineFactory.kt"], + deps = [ + ":beam_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_engine", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollection.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollection.kt new file mode 100644 index 00000000..e08f6991 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollection.kt @@ -0,0 +1,89 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkCollection +import org.apache.beam.sdk.coders.KvCoder +import org.apache.beam.sdk.transforms.Distinct +import org.apache.beam.sdk.transforms.MapElements +import org.apache.beam.sdk.transforms.SerializableFunction +import org.apache.beam.sdk.transforms.WithKeys +import org.apache.beam.sdk.values.KV +import org.apache.beam.sdk.values.PCollection + +/** An implementation of [FrameworkCollection], which runs all operations on Beam. 
*/ +class BeamCollection(val data: PCollection) : FrameworkCollection { + override val elementsEncoder: BeamEncoder = BeamEncoder(data.coder) + + override fun distinct(stageName: String): BeamCollection = + BeamCollection(data.apply(stageName, Distinct.create())) + + override fun map( + stageName: String, + outputType: Encoder, + mapFn: (T) -> R, + ): BeamCollection { + val outputCoder = (outputType as BeamEncoder).coder + return BeamCollection( + data + .apply( + stageName, + MapElements.into(outputCoder.encodedTypeDescriptor).via(SerializableFunction(mapFn)), + ) + .setCoder(outputCoder) + ) + } + + override fun keyBy( + stageName: String, + outputType: Encoder, + keyFn: (T) -> K, + ): BeamTable { + val keyCoder = (outputType as BeamEncoder).coder + return BeamTable( + data + .apply( + stageName, + WithKeys.of(SerializableFunction(keyFn)).withKeyType(keyCoder.encodedTypeDescriptor), + ) + .setCoder(KvCoder.of(keyCoder, data.coder)) + ) + } + + override fun mapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (T) -> Pair, + ): BeamTable { + val keyBeamType = keyType as BeamEncoder + val valueBeamType = valueType as BeamEncoder + val outputCoder = KvCoder.of(keyBeamType.coder, valueBeamType.coder) + val kvMapFn = { x: T -> mapFn(x).toKV() } + return BeamTable( + data + .apply( + stageName, + MapElements.into(outputCoder.encodedTypeDescriptor).via(SerializableFunction(kvMapFn)), + ) + .setCoder(outputCoder) + ) + } +} + +internal fun Pair.toKV(): KV = KV.of(first, second) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamDpEngineFactory.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamDpEngineFactory.kt new file mode 100644 index 00000000..a4fb0c6f --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamDpEngineFactory.kt @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngine +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngineBudgetSpec + +// TODO: add tests either in respective test class or preferably delete this file and +// parameterize the dp engine end2end tests with all different types of backends. + +/** Creates a [DpEngine] that runs DP aggregations on Beam. */ +fun DpEngine.Factory.createBeamEngine(budgetSpec: DpEngineBudgetSpec) = + create(BeamEncoderFactory(), budgetSpec) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncoders.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncoders.kt new file mode 100644 index 00000000..60a948b9 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncoders.kt @@ -0,0 +1,68 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import org.apache.beam.sdk.extensions.protobuf.ProtoCoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.EncoderFactory +import com.google.protobuf.Message +import java.io.InputStream +import java.io.OutputStream +import kotlin.reflect.KClass +import org.apache.beam.sdk.coders.Coder +import org.apache.beam.sdk.coders.CustomCoder +import org.apache.beam.sdk.coders.DoubleCoder +import org.apache.beam.sdk.coders.StringUtf8Coder +import org.apache.beam.sdk.coders.VarIntCoder +import org.apache.beam.sdk.extensions.avro.coders.AvroCoder + +class BeamEncoder(val coder: Coder) : Encoder + +class BeamEncoderFactory() : EncoderFactory { + override fun strings() = BeamEncoder(StringUtf8Coder.of()) + + override fun doubles() = BeamEncoder(DoubleCoder.of()) + + override fun ints() = BeamEncoder(VarIntCoder.of()) + + override fun records(recordClass: KClass) = + BeamEncoder(AvroCoder.of(recordClass.java)) + + override fun protos(protoClass: KClass) = + BeamEncoder(ProtoCoder.of(protoClass.java)) + + override fun tuple2sOf(first: Encoder, second: Encoder) = + BeamEncoder( + KotlinPairCoder((first as BeamEncoder).coder, (second as BeamEncoder).coder) + ) +} + +private class KotlinPairCoder( + private val firstCoder: Coder, + private val secondCoder: Coder, +) : CustomCoder>() { + override fun encode(value: Pair, out: OutputStream) { + firstCoder.encode(value.first, out) + secondCoder.encode(value.second, out) + } + + override fun decode(inStream: InputStream): Pair { + val first = firstCoder.decode(inStream) + val second = secondCoder.decode(inStream) + return Pair(first, second) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTable.kt 
b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTable.kt new file mode 100644 index 00000000..797416c1 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTable.kt @@ -0,0 +1,269 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkTable +import com.google.privacy.differentialprivacy.pipelinedp4j.core.StageNameUtils.append +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import org.apache.beam.sdk.coders.KvCoder +import org.apache.beam.sdk.coders.VoidCoder +import org.apache.beam.sdk.transforms.Combine +import org.apache.beam.sdk.transforms.DoFn +import org.apache.beam.sdk.transforms.DoFn.ProcessContext +import org.apache.beam.sdk.transforms.DoFn.ProcessElement +import org.apache.beam.sdk.transforms.Filter +import org.apache.beam.sdk.transforms.Flatten +import org.apache.beam.sdk.transforms.GroupByKey +import org.apache.beam.sdk.transforms.Keys +import org.apache.beam.sdk.transforms.MapElements +import org.apache.beam.sdk.transforms.ParDo +import org.apache.beam.sdk.transforms.SerializableBiFunction +import 
org.apache.beam.sdk.transforms.SerializableFunction +import org.apache.beam.sdk.transforms.Values +import org.apache.beam.sdk.transforms.join.CoGbkResult +import org.apache.beam.sdk.transforms.join.CoGroupByKey +import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple +import org.apache.beam.sdk.values.KV +import org.apache.beam.sdk.values.PCollection +import org.apache.beam.sdk.values.PCollectionList + +/** An implementation of [FrameworkTable], which runs all operations on Beam. */ +class BeamTable(val data: PCollection>) : FrameworkTable { + private val kvCoder = (data.coder as KvCoder) + + override val keysEncoder = BeamEncoder(kvCoder.keyCoder) + + override val valuesEncoder = BeamEncoder(kvCoder.valueCoder) + + override fun map( + stageName: String, + outputType: Encoder, + mapFn: (K, V) -> R, + ): BeamCollection { + val outputCoder = (outputType as BeamEncoder).coder + val kvMapFn = { x: KV -> mapFn(x.getKey(), x.getValue()) } + return BeamCollection( + data + .apply( + stageName, + MapElements.into(outputCoder.encodedTypeDescriptor).via(SerializableFunction(kvMapFn)), + ) + .setCoder(outputCoder) + ) + } + + override fun groupAndCombineValues(stageName: String, combFn: (V, V) -> V): BeamTable { + return BeamTable(data.apply(stageName, Combine.perKey(SerializableBiFunction(combFn)))) + } + + override fun groupByKey(stageName: String): BeamTable> { + return BeamTable(data.apply(stageName, GroupByKey.create())) + } + + override fun keys(stageName: String): BeamCollection = + BeamCollection(data.apply(stageName, Keys.create())) + + override fun values(stageName: String): BeamCollection = + BeamCollection(data.apply(stageName, Values.create())) + + override fun mapValues( + stageName: String, + outputType: Encoder, + mapValuesFn: (K, V) -> VO, + ): BeamTable { + val beamOutputValueType = outputType as BeamEncoder + val outputCoder = KvCoder.of(keysEncoder.coder, beamOutputValueType.coder) + val kvMapFn = { x: KV -> KV.of(x.getKey(), 
mapValuesFn(x.getKey(), x.getValue())) } +
return BeamTable( + data + .apply( + stageName, + MapElements.into(outputCoder.encodedTypeDescriptor).via(SerializableFunction(kvMapFn)), + ) + .setCoder(outputCoder) + ) + } + + override fun mapToTable( + stageName: String, + outputKeyType: Encoder, + outputValueType: Encoder, + mapFn: (K, V) -> Pair, + ): BeamTable { + val keyBeamEncoder: BeamEncoder = outputKeyType as BeamEncoder + val valueBeamEncoder: BeamEncoder = outputValueType as BeamEncoder + val outputCoder = KvCoder.of(keyBeamEncoder.coder, valueBeamEncoder.coder) + val kvMapFn = { x: KV -> mapFn(x.getKey(), x.getValue()).toKV() } + return BeamTable( + data + .apply( + stageName, + MapElements.into(outputCoder.encodedTypeDescriptor).via(SerializableFunction(kvMapFn)), + ) + .setCoder(outputCoder) + ) + } + + override fun flatMapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (K, V) -> Sequence>, + ): BeamTable { + val keyBeamEncoder: BeamEncoder = keyType as BeamEncoder + val valueBeamEncoder: BeamEncoder = valueType as BeamEncoder + return BeamTable( + data + .apply( + stageName, + ParDo.of( + object : DoFn, KV>() { + @ProcessElement + fun processElement(c: ProcessContext) { + val kv = c.element() + val results = mapFn(kv.getKey(), kv.getValue()) + for (result in results) { + c.output(KV.of(result.first, result.second)) + } + } + } + ), + ) + .setCoder(KvCoder.of(keyBeamEncoder.coder, valueBeamEncoder.coder)) + ) + } + + override fun filterValues(stageName: String, predicate: (V) -> Boolean): BeamTable { + val kvPredicate = { x: KV -> predicate(x.getValue()) } + return BeamTable(data.apply(stageName, Filter.by(SerializableFunction(kvPredicate)))) + } + + override fun filterKeys(stageName: String, predicate: (K) -> Boolean): BeamTable { + val kvPredicate = { x: KV -> predicate(x.getKey()) } + return BeamTable(data.apply(stageName, Filter.by(SerializableFunction(kvPredicate)))) + } + + override fun filterKeys( + stageName: String, + allowedKeys: FrameworkCollection, + 
unbalancedKeys: Boolean, + ): BeamTable { + return when (allowedKeys) { + is BeamCollection -> { + // There is no special optimized implementation for unbalanced keys in Beam. + filterKeysStoredInBeamCollection(stageName, allowedKeys) + } + is LocalCollection -> { + filterKeysStoredInLocalCollection(stageName, allowedKeys) + } + else -> + throw IllegalArgumentException( + "Collection is of unsupported backend. Only Beam and local backends are supported, " + + "the type of the given collection is ${allowedKeys.javaClass}" + ) + } + } + + override fun flattenWith(stageName: String, other: FrameworkTable): BeamTable { + val otherBeamTable = other as BeamTable + val collectionsList = PCollectionList.of(this.data).and(otherBeamTable.data) + return BeamTable(collectionsList.apply(stageName, Flatten.pCollections())) + } + + /** + * Keeps only those table entries whose keys are in [allowedKeys] Beam collection. + * + * Filtering is done by joining the table with [allowedKeys] and keeping only those entries that + * matched with some key in [allowedKeys]. The data that belongs to one key is processed in a + * single worker, however it does not have to fit in memory. The algorithm does not handle + * unbalanced keys (i.e. hot partitions) in any specific way. 
+ */ + private fun filterKeysStoredInBeamCollection( + stageName: String, + allowedKeys: BeamCollection, + ): BeamTable { + val allowedKeysAsTable = + allowedKeys + .map( + stageName.append("ConvertAllowedKeysToTable"), + BeamEncoder(KvCoder.of(allowedKeys.elementsEncoder.coder, VoidCoder.of())), + { k -> KV.of(k, null) }, + ) + .data + val dataTag = "DataTag" + val allowedKeysTag = "AllowedKeysTag" + val pCollectionTuple = + KeyedPCollectionTuple.of(dataTag, data).and(allowedKeysTag, allowedKeysAsTable) + val joinResult = pCollectionTuple.apply(CoGroupByKey.create()) + + val filteredTable = + joinResult + .apply( + stageName.append("FilterForAllowedKeys"), + ParDo.of( + object : DoFn, KV>() { + @ProcessElement + fun processElement(c: ProcessContext) { + val kv = c.element() + val tableValues = kv.value.getAll(dataTag) + val keyIsAllowed = kv.value.getAll(allowedKeysTag).any() + if (keyIsAllowed) { + for (value in tableValues) { + c.output(KV.of(kv.key, value)) + } + } + } + } + ), + ) + .setCoder(KvCoder.of(keysEncoder.coder, valuesEncoder.coder)) + + return BeamTable(filteredTable) + } + + /** + * Keeps only those table entries whose keys are in [allowedKeys] local collection. + * + * Filtering is done by converting [allowedKeys] to a HashSet and checking if the key of the entry + * is in that set. + * + * The [HashSet] is sent to workers over the network. It might be a bottleneck in the future if + * there are clients whose set is large. If such a situation occurs, it might be worth thinking about how + * we can send [List] to workers because [List] is smaller in size. However, at the time of + * writing it seemed not possible to send list and create one hashset per worker. The only + * solution we found was to send list and in the lambda create hashset from this list but it means + * that for each key we will create a hashset which is too inefficient and consumes too much + * memory. 
There might be a solution with side inputs if we could convert the [allowedKeys] + * sequence to a PCollection; however, to do that we need a Beam pipeline instance, which we don't have + * access to. + */ + private fun filterKeysStoredInLocalCollection( + stageName: String, + allowedKeys: LocalCollection, + ): BeamTable { + // TODO: add end2end where public partitions are stored in local collection. + val allowedKeysHashSet = allowedKeys.data.toHashSet() + return filterKeys(stageName) { k -> k in allowedKeysHashSet } + } + + override fun samplePerKey(stageName: String, count: Int): BeamTable> { + // TODO: implement. + throw UnsupportedOperationException("Not implemented yet") + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel new file mode 100644 index 00000000..da1df01d --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel @@ -0,0 +1,119 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "encoders", + srcs = ["Encoders.kt"], + deps = [ + "@maven//:com_google_protobuf_protobuf_java", + ], +) + +kt_jvm_library( + name = "framework_collections", + srcs = [ + "FrameworkCollection.kt", + "FrameworkTable.kt", + ], + deps = [ + ":encoders", + ], +) + +kt_jvm_library( + name = "dp_functions_params", + srcs = [ + "DpFunctionsParams.kt", + ], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + "@maven//:com_google_errorprone_error_prone_annotations", + "@maven//:com_google_guava_guava", + ], +) + +kt_jvm_library( + name = "core_types", + srcs = [ + "CoreTypes.kt", + ], + deps = [ + ":encoders", + ], +) + +kt_jvm_library( + name = "data_extractors", + srcs = [ + "DataExtractors.kt", + ], + deps = [ + ":core_types", + ":encoders", + ], +) + +kt_jvm_library( + name = "contribution_sampler", + srcs = [ + "ContributionSampler.kt", + "NoPrivacySampler.kt", + "PartitionAndPerPartitionSampler.kt", + "PartitionSampler.kt", + "PerPartitionContributionsSampler.kt", + ], + deps = [ + ":core_types", + ":encoders", + ":framework_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_kt_proto", + ], +) + +kt_jvm_library( + name = "dp_engine", + srcs = [ + "Combiners.kt", + "ComputationalGraph.kt", + "DpEngine.kt", + "PrivatePartitions.kt", + "PublicPartitions.kt", + ], + deps = [ + ":contribution_sampler", + ":core_types", + ":data_extractors", + ":dp_functions_params", + ":encoders", + ":framework_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:allocated_budget", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_accountant", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + 
"//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:noise_factories", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:pre_aggregation_partition_selection_factory", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:accumulators_kt_proto", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_kt_proto", + "@maven//:com_google_errorprone_error_prone_annotations", + "@maven//:com_google_privacy_differentialprivacy_differentialprivacy", + "@maven//:com_google_protobuf_protobuf_java", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Combiners.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Combiners.kt new file mode 100644 index 00000000..80e84df4 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Combiners.kt @@ -0,0 +1,1021 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.errorprone.annotations.Immutable +import com.google.privacy.differentialprivacy.BoundedQuantiles +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CompoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CountAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.MeanAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdCountAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.QuantilesAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.SumAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.VarianceAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.compoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.countAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.dpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.meanAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdCountAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.quantilesAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.sumAccumulator +import 
com.google.privacy.differentialprivacy.pipelinedp4j.proto.varianceAccumulator +import com.google.protobuf.ByteString +import java.io.Serializable +import kotlin.math.abs +import kotlin.math.max + +/** + * An entity that aggregates input values and adds noise for differential privacy (DP). + * + * Concrete implementations handle specific metric types (e.g., counts, sums, means, quantiles). + * + * @param AccumulatorT type of accumulator used to store intermediate aggregation results. + * @param OutputT type of output produced by the combiner, typically a noisy metric. + */ +sealed interface Combiner : Serializable { + /** + * Whether per-partition contribution bounding has to be performed before calling + * [createAccumulator]. + * + * Privacy levels without contribution bounding should not affect this value, just assume a + * privacy level with contribution bounding. + */ + val requiresPerPartitionBoundedInput: Boolean + + /** + * Creates a new accumulator instance from the given privacy id contributions. + * + * @param contributions the privacy id contributions to initialize the accumulator with. + * @return a new accumulator instance representing the aggregated contributions. + */ + fun createAccumulator(contributions: PrivacyIdContributions): AccumulatorT + + /** + * Merges two accumulators into one. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new accumulator containing the combined results of the two input accumulators. + */ + fun mergeAccumulators(accumulator1: AccumulatorT, accumulator2: AccumulatorT): AccumulatorT + + /** + * Returns an anonymized metric computed on the given accumulator. + * + * @param accumulator the accumulator containing the aggregated data. + * @return the computed anonymized metrics of type `OutputT`. + */ + fun computeMetrics(accumulator: AccumulatorT): OutputT + + /** + * Creates an empty accumulator that holds no value. 
+ * + * @return an empty accumulator of type `AccumulatorT`. + */ + fun emptyAccumulator() = createAccumulator(PrivacyIdContributions.getDefaultInstance()) +} + +/** + * A [Combiner] for the [MetricType.COUNT]. It returns a noisy count of input items. + * + * @property aggregationParams parameters that control the aggregation behavior. + * @property budget the amount of privacy budget that can be used by the combiner. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class CountCombiner( + private val aggregationParams: AggregationParams, + private val budget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner { + override val requiresPerPartitionBoundedInput = false + + /** + * Creates a new [CountAccumulator] initialized with the size of the input contributions, + * potentially bounded based on the privacy level and aggregation parameters. + * + * @param contributions the privacy id contributions to initialize the accumulator with. + * @return a new [CountAccumulator] instance. + */ + override fun createAccumulator(contributions: PrivacyIdContributions): CountAccumulator = + countAccumulator { + count = + contributions.valuesList.size + .toLong() + .coerceInIfContributionBoundingEnabled( + 0, + aggregationParams.maxContributionsPerPartition!!.toLong(), + aggregationParams.privacyLevel, + ) + } + + /** + * Merges two [CountAccumulator] instances by summing their counts. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new [CountAccumulator] instance with the combined count. + */ + override fun mergeAccumulators( + accumulator1: CountAccumulator, + accumulator2: CountAccumulator, + ): CountAccumulator = countAccumulator { count = accumulator1.count + accumulator2.count } + + /** + * Computes a noisy count from the given [CountAccumulator]. 
+ * + * @param accumulator the accumulator containing the aggregated count. + * @return a noisy count with added differential privacy guarantees. + */ + override fun computeMetrics(accumulator: CountAccumulator): Double { + val noise = noiseFactory(aggregationParams.noiseKind) + return noise.addNoise( + accumulator.count.toDouble(), + aggregationParams.maxPartitionsContributed!!, + aggregationParams.maxContributionsPerPartition!!.toDouble(), + budget.epsilon(), + budget.delta(), + ) + } +} + +/** + * A [Combiner] for the [MetricType.PRIVACY_ID_COUNT]. + * + * It returns a noisy count of privacy ids. + * + * @property aggregationParams parameters controlling the aggregation process. + * @property budget the amount of privacy budget that can be used by the combiner. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class PrivacyIdCountCombiner( + private val aggregationParams: AggregationParams, + private val budget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner { + override val requiresPerPartitionBoundedInput = false + + /** + * Creates a [PrivacyIdCountAccumulator] initialized with the count of unique privacy ids in the + * contributions. If the contributions are empty (representing an empty public partition), the + * count is 0. Otherwise, the count is 1, as the contributions correspond to a single privacy id. + * + * @param contributions privacy id contributions to initialize the accumulator with. + * @return a new [PrivacyIdCountAccumulator] instance. + */ + override fun createAccumulator(contributions: PrivacyIdContributions): PrivacyIdCountAccumulator { + val privacyIdCount = + if (contributions.valuesList.isEmpty()) { + // Empty public partition, no privacy ids. + 0 + } else { + // `contributions` corresponds to 1 privacy id. 
+ 1 + } + + return privacyIdCountAccumulator { count = privacyIdCount.toLong() } + } + + /** + * Merges two [PrivacyIdCountAccumulator] instances by summing their counts. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new [PrivacyIdCountAccumulator] instance with the combined count. + */ + override fun mergeAccumulators( + accumulator1: PrivacyIdCountAccumulator, + accumulator2: PrivacyIdCountAccumulator, + ): PrivacyIdCountAccumulator = privacyIdCountAccumulator { + count = accumulator1.count + accumulator2.count + } + + /** + * Computes a noisy count of unique privacy ids from the given [PrivacyIdCountAccumulator]. + * + * Noise is added using the specified noise mechanism (`noiseFactory`), privacy budget (`budget`), + * and the maximum number of partitions contributed + * (`aggregationParams.maxPartitionsContributed`). The lInfSensitivity is set to 1.0, as each + * privacy id contributes at most 1 to the count. + * + * @param accumulator the accumulator containing the aggregated privacy id count. + * @return a noisy count of unique privacy ids. + */ + override fun computeMetrics(accumulator: PrivacyIdCountAccumulator): Double = + noiseFactory(aggregationParams.noiseKind) + .addNoise( + accumulator.count.toDouble(), + aggregationParams.maxPartitionsContributed!!, + /* lInfSensitivity = */ 1.0, + budget.epsilon(), + budget.delta(), + ) +} + +/** + * A [Combiner] for computing the exact (i.e. not anonymized) privacy id count. + * + * The exact privacy id count is used for pre-aggregation partition selection. The exact privacy id + * count is not anonymized and hence it cannot be returned in the anonymized output. This + * combiner should not be used together with PrivacyIdCountCombiner. 
+ */ +class ExactPrivacyIdCountCombiner : Combiner { + override val requiresPerPartitionBoundedInput = false + + /** + * Creates a [PrivacyIdCountAccumulator] initialized with a count of 1, assuming the contributions + * represent a single privacy id. + * + * @param contributions the privacy id contributions to initialize the accumulator with. Must NOT + * be empty. + * @return a new [PrivacyIdCountAccumulator] instance with a count of 1. + */ + override fun createAccumulator(contributions: PrivacyIdContributions): PrivacyIdCountAccumulator { + return privacyIdCountAccumulator { count = 1 } + } + + /** + * Merges two [PrivacyIdCountAccumulator] instances by summing their counts. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new [PrivacyIdCountAccumulator] instance with the combined count. + */ + override fun mergeAccumulators( + accumulator1: PrivacyIdCountAccumulator, + accumulator2: PrivacyIdCountAccumulator, + ): PrivacyIdCountAccumulator = privacyIdCountAccumulator { + count = accumulator1.count + accumulator2.count + } + + /** + * This method is not supported for `ExactPrivacyIdCountCombiner`. + * + * Since the count is not anonymized, computing metrics would violate differential privacy. + * + * @throws UnsupportedOperationException always, as this method is not meant to be used. + */ + override fun computeMetrics(accumulator: PrivacyIdCountAccumulator): Double = + throw UnsupportedOperationException( + "ExactPrivacyIdCountCombiner does not support compute_metrics." + ) +} + +/** + * A [Combiner] which computes [MetricType.PRIVACY_ID_COUNT] and performs post-aggregation partition + * selection. When the noised privacy id count is smaller than the threshold, it returns null, + * otherwise the noised privacy id count, which is anonymized. The threshold is computed from input + * parameters to ensure Differential Privacy. 
This combiner should not be used together with + * PrivacyIdCountCombiner. + * + * @property aggregationParams parameters controlling the aggregation process. + * @property noiseBudget the amount of privacy budget that can be used by the combiner for the + * noise. + * @property thresholdingBudget the amount of privacy budget that can be used by the combiner for + * the thresholding. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class PostAggregationPartitionSelectionCombiner( + private val aggregationParams: AggregationParams, + private val noiseBudget: AllocatedBudget, + private val thresholdingBudget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner { + override val requiresPerPartitionBoundedInput = false + + /** + * Creates a [PrivacyIdCountAccumulator] initialized with a count of 1, assuming there is at least + * one contribution (representing one privacy id). + * + * @param contributions the privacy id contributions to initialize the accumulator with. Must NOT + * be empty. + * @return a new [PrivacyIdCountAccumulator] instance with a count of 1. + * @throws IllegalArgumentException if the contributions are empty. + */ + override fun createAccumulator(contributions: PrivacyIdContributions): PrivacyIdCountAccumulator { + require(contributions.valuesList.size > 0) { + "There must be contributions for PostAggregationPartitionSelectionCombiner." + } + return privacyIdCountAccumulator { count = 1 } + } + + /** + * Merges two accumulators by summing their counts. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new accumulator with the combined count. 
+ */ + override fun mergeAccumulators( + accumulator1: PrivacyIdCountAccumulator, + accumulator2: PrivacyIdCountAccumulator, + ): PrivacyIdCountAccumulator = privacyIdCountAccumulator { + count = accumulator1.count + accumulator2.count + } + + /** + * Computes the noisy count of privacy ids and applies the post-aggregation partition selection + * mechanism. + * + * @param accumulator the accumulator containing the aggregated privacy id count. + * @return the noisy count of privacy ids if the partition is kept, or null if it is discarded. + */ + override fun computeMetrics(accumulator: PrivacyIdCountAccumulator): Double? = + getPartitionSelector().addNoiseIfShouldKeep(accumulator.count) + + /** + * Returns a [PostAggregationPartitionSelector] which is used for post-aggregation partition + * selection. + * + * @return a [PostAggregationPartitionSelector] instance. + */ + internal fun getPartitionSelector(): PostAggregationPartitionSelector = + PostAggregationPartitionSelectorImpl( + aggregationParams.maxPartitionsContributed!!, + aggregationParams.noiseKind, + aggregationParams.preThreshold, + noiseBudget, + thresholdingBudget, + noiseFactory, + ) +} + +/** + * A [Combiner] for the [MetricType.SUM]. + * + * It returns a noisy sum of input items. + * + * @property aggregationParams parameters controlling the aggregation process. + * @property budget the amount of privacy budget that can be used by the combiner. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class SumCombiner( + private val aggregationParams: AggregationParams, + // The amount of privacy budget that can be used by the combiner. + private val budget: AllocatedBudget, + // We allow for passing the noise generator as a parameter in order to be able to mock it in + // tests. 
+ private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner, Serializable { + override val requiresPerPartitionBoundedInput = false + + /** + * Creates a [SumAccumulator] from the given privacy id contributions. + * + * **Important Note:** The `contributions` must contain all contributions by a single privacy id + * to a single partition. + * + * @param contributions the contributions of a single privacy id to a single partition. + * @return a new `SumAccumulator` initialized with the sum of the contributions, potentially + * bounded based on the privacy level. + */ + override fun createAccumulator(contributions: PrivacyIdContributions): SumAccumulator = + sumAccumulator { + sum = + if (contributions.valuesList.isEmpty()) { + 0.0 + } else { + contributions.valuesList + .sum() + .coerceInIfContributionBoundingEnabled( + aggregationParams.minTotalValue!!, + aggregationParams.maxTotalValue!!, + aggregationParams.privacyLevel, + ) + } + } + + /** + * Merges two [SumAccumulator] instances by adding their sums. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new `SumAccumulator` with the combined sum. + */ + override fun mergeAccumulators( + accumulator1: SumAccumulator, + accumulator2: SumAccumulator, + ): SumAccumulator = sumAccumulator { sum = accumulator1.sum + accumulator2.sum } + + /** + * Computes a noisy sum from the given [SumAccumulator]. + * + * @param accumulator the accumulator containing the aggregated sum. + * @return a noisy sum with added differential privacy guarantees. 
+ */ + override fun computeMetrics(accumulator: SumAccumulator): Double { + val noise = noiseFactory(aggregationParams.noiseKind) + val lInfSensitivity = + max(abs(aggregationParams.minTotalValue!!), abs(aggregationParams.maxTotalValue!!)) + + return noise.addNoise( + accumulator.sum, + aggregationParams.maxPartitionsContributed!!, + lInfSensitivity, + budget.epsilon(), + budget.delta(), + ) + } +} + +/** + * A [Combiner] for the [MetricType.MEAN]. + * + * It returns a noisy mean of input items. It can also return count and sum if requested by the + * user. + * + * @property aggregationParams parameters controlling the aggregation process. + * @property countBudget the allocated privacy budget for the count calculation. + * @property sumBudget the allocated privacy budget for the sum calculation. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class MeanCombiner( + private val aggregationParams: AggregationParams, + private val countBudget: AllocatedBudget, + private val sumBudget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner, Serializable { + private val midValue = (aggregationParams.minValue!! + aggregationParams.maxValue!!) / 2 + private val returnCount = aggregationParams.metrics.any { it.type == COUNT } + private val returnSum = aggregationParams.metrics.any { it.type == SUM } + + override val requiresPerPartitionBoundedInput = true + + /** + * **Important Note:** the [contributions] passed to this function must all be contributions of a + * particular privacy id into a particular partition **sub-sampled** to + * **aggregationParams.maxContributionsPerPartition** + * + * @param contributions privacy id contributions for a specific privacy id and partition. + * @return a new [MeanAccumulator] with the count and normalized sum of the contributions. 
+ */ + override fun createAccumulator(contributions: PrivacyIdContributions): MeanAccumulator = + // All input values are normalized to be their difference from the middle of the + // input range. That allows us to calculate the sum of all input values with + // half the sensitivity it would otherwise take for better accuracy (as compared + // to doing noisy sum / noisy count). + meanAccumulator { + count = contributions.valuesList.size.toLong() + normalizedSum = + contributions.valuesList + .map { + it.coerceInIfContributionBoundingEnabled( + aggregationParams.minValue!!, + aggregationParams.maxValue!!, + aggregationParams.privacyLevel, + ) - midValue + } + .sum() + } + + /** + * Merges two [MeanAccumulator] instances by summing their counts and normalized sums. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new [MeanAccumulator] with the combined counts and normalized sums. + */ + override fun mergeAccumulators( + accumulator1: MeanAccumulator, + accumulator2: MeanAccumulator, + ): MeanAccumulator = meanAccumulator { + count = accumulator1.count + accumulator2.count + normalizedSum = accumulator1.normalizedSum + accumulator2.normalizedSum + } + + /** + * Computes the DP mean from the given [MeanAccumulator]. + * + * @param accumulator the accumulator containing aggregated count and normalized sum. + * @return a [MeanCombinerResult] containing the DP mean, and optionally DP sum and count. + */ + override fun computeMetrics(accumulator: MeanAccumulator): MeanCombinerResult { + val dpCount = getNoisedCount(accumulator.count, aggregationParams, countBudget, noiseFactory) + val dpNormalizedSum = + getNoisedNormalizedSum( + accumulator.normalizedSum, + midValue, + aggregationParams, + sumBudget, + noiseFactory, + ) + // Adding midValue denormalize mean to [minValue, maxValue] range. 
+ val dpMean = dpNormalizedSum / dpCount + midValue + val outSum = if (returnSum) dpMean * dpCount else null + val outCount = if (returnCount) dpCount else null + return MeanCombinerResult(dpMean, outSum, outCount) + } +} + +/** + * Represents the result of the [MeanCombiner]. + * + * @property mean the differentially private mean. + * @property sum the differentially private sum (if requested). + * @property count the differentially private count (if requested). + */ +@Immutable +data class MeanCombinerResult(val mean: Double, val sum: Double?, val count: Double?) : + Serializable + +/** + * A [Combiner] for the [MetricType.QUANTILES]. + * + * It returns noisy quantiles for the requested ranks. The output quantiles are sorted by ranks. + * + * @property ranks a list of ranks for which quantiles will be computed. The ranks must be between 0 + * (inclusive) and 1 (inclusive). + * @property aggregationParams parameters controlling the aggregation process. + * @property budget the amount of privacy budget that can be used by the combiner. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class QuantilesCombiner( + private val ranks: List, + private val aggregationParams: AggregationParams, + private val budget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner>, Serializable { + override val requiresPerPartitionBoundedInput = true + + /** + * Creates a `QuantilesAccumulator` from privacy id contributions. + * + * **Important Note:** the [contributions] passed to this function must all be contributions of a + * particular privacy id into a particular partition **sub-sampled** to + * **aggregationParams.maxContributionsPerPartition** + * + * @param contributions privacy id contributions. + * @return a new `QuantilesAccumulator` containing a serialized summary of the quantiles. 
+ */ + override fun createAccumulator(contributions: PrivacyIdContributions): QuantilesAccumulator = + quantilesAccumulator { + val boundedQuantiles = emptyBoundedQuantiles() + boundedQuantiles.addEntries(contributions.valuesList) + serializedQuantilesSummary = ByteString.copyFrom(boundedQuantiles.serializableSummary) + } + + /** + * Merges two [QuantilesAccumulator] instances. + * + * @param accumulator1 the first accumulator to merge. + * @param accumulator2 the second accumulator to merge. + * @return a new `QuantilesAccumulator` containing the merged quantile summary. + */ + override fun mergeAccumulators( + accumulator1: QuantilesAccumulator, + accumulator2: QuantilesAccumulator, + ): QuantilesAccumulator = quantilesAccumulator { + val boundedQuantiles = emptyBoundedQuantiles() + boundedQuantiles.mergeWith(accumulator1.serializedQuantilesSummary.toByteArray()) + boundedQuantiles.mergeWith(accumulator2.serializedQuantilesSummary.toByteArray()) + serializedQuantilesSummary = ByteString.copyFrom(boundedQuantiles.serializableSummary) + } + + /** + * Computes and returns a list of noisy quantiles for the specified ranks. + * + * The output quantiles are sorted in ascending order based on their ranks. + * + * @param accumulator the accumulator containing the aggregated data. + * @return a list of noisy quantiles corresponding to the specified ranks. + */ + override fun computeMetrics(accumulator: QuantilesAccumulator): List { + val boundedQuantiles = emptyBoundedQuantiles() + boundedQuantiles.mergeWith(accumulator.serializedQuantilesSummary.toByteArray()) + return ranks.sorted().map { boundedQuantiles.computeResult(it) } + } + + /** + * Creates an empty `BoundedQuantiles` builder initialized with the necessary parameters. + * + * @return an empty `BoundedQuantiles.Builder` instance. 
+ */ + private fun emptyBoundedQuantiles() = + BoundedQuantiles.builder() + .epsilon(budget.epsilon()) + .delta(budget.delta()) + .noise(noiseFactory(aggregationParams.noiseKind)) + .maxPartitionsContributed( + if (aggregationParams.privacyLevel.withPartitionsContributedBounding) { + aggregationParams.maxPartitionsContributed!! + } else { + 1 + } + ) + .maxContributionsPerPartition( + if (aggregationParams.privacyLevel.withContributionsPerPartitionBounding) { + aggregationParams.maxContributionsPerPartition!! + } else { + Int.MAX_VALUE + } + ) + // Min and max values aren't changed if there is no contribution bounding because the extreme + // values aren't supported by the DP library. + .lower(aggregationParams.minValue!!) + .upper(aggregationParams.maxValue!!) + .build() +} + +/** + * A [Combiner] for the [MetricType.VARIANCE]. + * + * It returns a noisy variance of input items. It can also return count, sum, and mean if requested + * by the user. + * + * @property aggregationParams parameters controlling the aggregation process (including whether to + * return count, sum, and mean). + * @property countBudget the privacy budget for the count calculation. + * @property sumBudget the privacy budget for the sum calculation. + * @property sumSquaresBudget the privacy budget for the sum of squares calculation. + * @property noiseFactory allows for passing the noise generator as a parameter in order to be able + * to mock it in tests. + */ +class VarianceCombiner( + private val aggregationParams: AggregationParams, + private val countBudget: AllocatedBudget, + private val sumBudget: AllocatedBudget, + private val sumSquaresBudget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : Combiner, Serializable { + private val midValue = (aggregationParams.minValue!! + aggregationParams.maxValue!!) 
/**
 * Builds a [VarianceAccumulator] from the contributions of one privacy id to one partition.
 *
 * **Important Note:** the [contributions] passed to this function must already be **sub-sampled**
 * to **aggregationParams.maxContributionsPerPartition**.
 *
 * @param contributions sub-sampled privacy id contributions.
 * @return a new `VarianceAccumulator` holding the count, the sum and the sum of squares of the
 *   values after they were (optionally) clamped and shifted by `midValue`.
 */
override fun createAccumulator(contributions: PrivacyIdContributions): VarianceAccumulator {
  // Both bounds are already dereferenced when `midValue` is initialized, so reading them once
  // up front cannot fail for a constructed combiner.
  val lowerBound = aggregationParams.minValue!!
  val upperBound = aggregationParams.maxValue!!
  val level = aggregationParams.privacyLevel
  // Values are re-centered around the middle of the input range. That allows the sum of all
  // input values to be calculated with half the sensitivity it would otherwise take, for better
  // accuracy (as compared to doing noisy sum / noisy count).
  val centered =
    contributions.valuesList.map { value ->
      value.coerceInIfContributionBoundingEnabled(lowerBound, upperBound, level) - midValue
    }
  return varianceAccumulator {
    count = centered.size.toLong()
    normalizedSum = centered.sum()
    normalizedSumSquares = centered.sumOf { it * it }
  }
}
/**
 * Computes the DP variance, and optionally the DP count, sum and mean, from [accumulator].
 *
 * Noise is added to the count, the normalized sum and the normalized sum of squares (in that
 * order); mean and variance are derived from those three noisy values.
 *
 * @param accumulator the accumulator containing the aggregated data.
 * @return a [VarianceCombinerResult] with the DP variance; count, sum and mean are `null` unless
 *   the corresponding metric was requested in `aggregationParams.metrics`.
 */
override fun computeMetrics(accumulator: VarianceAccumulator): VarianceCombinerResult {
  val noisyCount = getNoisedCount(accumulator.count, aggregationParams, countBudget, noiseFactory)
  val noisyCenteredSum =
    getNoisedNormalizedSum(
      accumulator.normalizedSum,
      midValue,
      aggregationParams,
      sumBudget,
      noiseFactory,
    )
  val noisyCenteredSumSquares =
    getNoisedNormalizedSumOfSquares(
      accumulator.normalizedSumSquares,
      midValue,
      aggregationParams,
      sumSquaresBudget,
      noiseFactory,
    )

  val noisyCenteredMean = noisyCenteredSum / noisyCount
  val noisyVariance =
    noisyCenteredSumSquares / noisyCount - noisyCenteredMean * noisyCenteredMean
  // Mean is post-processing of the noisy COUNT and SUM, so it consumes no extra budget.
  val noisyMean = noisyCenteredMean + midValue

  return VarianceCombinerResult(
    variance = noisyVariance,
    count = noisyCount.takeIf { returnCount },
    sum = (noisyMean * noisyCount).takeIf { returnSum },
    mean = noisyMean.takeIf { returnMean },
  )
}
/**
 * Represents the result of the [VarianceCombiner].
 *
 * The optional fields are `null` when the corresponding metric was not requested.
 *
 * @property variance the differentially private variance.
 * @property count the differentially private count of values, or `null` if not requested.
 * @property sum the differentially private sum of values, or `null` if not requested.
 * @property mean the differentially private mean of values, or `null` if not requested.
 */
@Immutable
data class VarianceCombinerResult(
  val variance: Double,
  val count: Double?,
  val sum: Double?,
  val mean: Double?,
) : Serializable
/**
 * Creates a [CompoundAccumulator] by asking every underlying combiner to build its own
 * accumulator from [contributions] and storing it in the matching field.
 *
 * The three privacy-id-count style combiners all write to the same `privacyIdCountAccumulator`
 * field.
 *
 * @param contributions the privacy id contributions to initialize the accumulators with.
 * @return a new [CompoundAccumulator] containing accumulators for each underlying combiner.
 */
override fun createAccumulator(contributions: PrivacyIdContributions) = compoundAccumulator {
  combiners.forEach { member ->
    when (member) {
      is PrivacyIdCountCombiner ->
        privacyIdCountAccumulator = member.createAccumulator(contributions)
      is ExactPrivacyIdCountCombiner ->
        privacyIdCountAccumulator = member.createAccumulator(contributions)
      is PostAggregationPartitionSelectionCombiner ->
        privacyIdCountAccumulator = member.createAccumulator(contributions)
      is CountCombiner -> countAccumulator = member.createAccumulator(contributions)
      is SumCombiner -> sumAccumulator = member.createAccumulator(contributions)
      is MeanCombiner -> meanAccumulator = member.createAccumulator(contributions)
      is VarianceCombiner -> varianceAccumulator = member.createAccumulator(contributions)
      is QuantilesCombiner -> quantilesAccumulator = member.createAccumulator(contributions)
      is CompoundCombiner -> throwIfCompoundCombiner()
    }
  }
}
/**
 * Merges two [CompoundAccumulator] instances field by field, delegating each field to the
 * `mergeAccumulators` method of the combiner that owns it.
 *
 * @param accumulator1 the first accumulator to merge.
 * @param accumulator2 the second accumulator to merge.
 * @return a new [CompoundAccumulator] with the merged results for each metric.
 */
override fun mergeAccumulators(
  accumulator1: CompoundAccumulator,
  accumulator2: CompoundAccumulator,
) = compoundAccumulator {
  combiners.forEach { member ->
    when (member) {
      is PrivacyIdCountCombiner ->
        privacyIdCountAccumulator =
          member.mergeAccumulators(
            accumulator1.privacyIdCountAccumulator,
            accumulator2.privacyIdCountAccumulator,
          )
      is ExactPrivacyIdCountCombiner ->
        privacyIdCountAccumulator =
          member.mergeAccumulators(
            accumulator1.privacyIdCountAccumulator,
            accumulator2.privacyIdCountAccumulator,
          )
      is PostAggregationPartitionSelectionCombiner ->
        privacyIdCountAccumulator =
          member.mergeAccumulators(
            accumulator1.privacyIdCountAccumulator,
            accumulator2.privacyIdCountAccumulator,
          )
      is CountCombiner ->
        countAccumulator =
          member.mergeAccumulators(accumulator1.countAccumulator, accumulator2.countAccumulator)
      is SumCombiner ->
        sumAccumulator =
          member.mergeAccumulators(accumulator1.sumAccumulator, accumulator2.sumAccumulator)
      is MeanCombiner ->
        meanAccumulator =
          member.mergeAccumulators(accumulator1.meanAccumulator, accumulator2.meanAccumulator)
      is QuantilesCombiner ->
        quantilesAccumulator =
          member.mergeAccumulators(
            accumulator1.quantilesAccumulator,
            accumulator2.quantilesAccumulator,
          )
      is VarianceCombiner ->
        varianceAccumulator =
          member.mergeAccumulators(
            accumulator1.varianceAccumulator,
            accumulator2.varianceAccumulator,
          )
      is CompoundCombiner -> throwIfCompoundCombiner()
    }
  }
}
/**
 * Computes the DP aggregates by invoking `computeMetrics` on each underlying combiner and
 * copying the results into a [DpAggregates] message.
 *
 * Optional results (`null` sum/count/mean, or a `null` post-aggregation privacy id count) leave
 * the corresponding output field untouched. [ExactPrivacyIdCountCombiner] contributes nothing to
 * the output.
 *
 * @param accumulator the [CompoundAccumulator] containing the aggregated data.
 * @return a [DpAggregates] object containing the computed results for all metrics.
 */
override fun computeMetrics(accumulator: CompoundAccumulator) = dpAggregates {
  combiners.forEach { member ->
    when (member) {
      is PrivacyIdCountCombiner ->
        privacyIdCount = member.computeMetrics(accumulator.privacyIdCountAccumulator)
      is ExactPrivacyIdCountCombiner -> {} // no anonymized output
      is PostAggregationPartitionSelectionCombiner ->
        member.computeMetrics(accumulator.privacyIdCountAccumulator)?.let { privacyIdCount = it }
      is CountCombiner -> count = member.computeMetrics(accumulator.countAccumulator)
      is SumCombiner -> sum = member.computeMetrics(accumulator.sumAccumulator)
      is MeanCombiner -> {
        val meanResult = member.computeMetrics(accumulator.meanAccumulator)
        mean = meanResult.mean
        meanResult.sum?.let { sum = it }
        meanResult.count?.let { count = it }
      }
      is QuantilesCombiner ->
        quantiles += member.computeMetrics(accumulator.quantilesAccumulator)
      is VarianceCombiner -> {
        val varianceResult = member.computeMetrics(accumulator.varianceAccumulator)
        variance = varianceResult.variance
        varianceResult.count?.let { count = it }
        varianceResult.sum?.let { sum = it }
        varianceResult.mean?.let { mean = it }
      }
      is CompoundCombiner -> throwIfCompoundCombiner()
    }
  }
}
/**
 * Clamps the receiver to the range `[minimumValue, maximumValue]` when [privacyLevel] enables
 * per-partition contribution bounding; otherwise returns the receiver unchanged.
 */
private fun <T : Comparable<T>> T.coerceInIfContributionBoundingEnabled(
  minimumValue: T,
  maximumValue: T,
  privacyLevel: PrivacyLevel,
): T =
  when {
    // Per-partition bounding implies clamping.
    privacyLevel.withContributionsPerPartitionBounding -> coerceIn(minimumValue, maximumValue)
    else -> this
  }
/**
 * Adds noise to a sum of squares of values that were normalized around [midValue].
 *
 * The noise is calibrated with `maxPartitionsContributed` as the number of partitions a privacy
 * id may touch and `(maxValue - midValue)^2 * maxContributionsPerPartition` as the per-partition
 * sensitivity, using the epsilon and delta of [sumOfSquaresBudget].
 */
private fun getNoisedNormalizedSumOfSquares(
  normalizedSumOfSquares: Double,
  midValue: Double,
  aggregationParams: AggregationParams,
  sumOfSquaresBudget: AllocatedBudget,
  noiseFactory: (NoiseKind) -> Noise,
): Double =
  noiseFactory(aggregationParams.noiseKind).let { mechanism ->
    // All values were normalized to the symmetric range
    // [minValue-midValue, maxValue-midValue], then squared and summed up. So one record
    // contributes at most (maxValue-midValue)^2, and a privacy id contributes at most
    // maxContributionsPerPartition such records to a partition.
    val halfRange = aggregationParams.maxValue!! - midValue
    val perPartitionSensitivity =
      halfRange * halfRange * aggregationParams.maxContributionsPerPartition!!
    mechanism.addNoise(
      normalizedSumOfSquares,
      aggregationParams.maxPartitionsContributed!!,
      perPartitionSensitivity,
      sumOfSquaresBudget.epsilon(),
      sumOfSquaresBudget.delta(),
    )
  }
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CompoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdCountAccumulator + +/** + * An assembly of transformations applied to the input data in order to produce anonymized metrics. + * + * [T] is the type of the elements stored in the collection whose data is being anonymized. It must + * be possible to extract the privacy ID, the partition key and optionally the value being + * aggregated from an element of type [T]. + */ +internal interface ComputationalGraph { + fun aggregate(collection: FrameworkCollection): FrameworkTable +} + +/** + * Creates an instance of [ComputationalGraph]. The class is "open" because we spy on it in the + * tests in order to check that the arguments passed to [DpEngine.aggregate] are correctly mapped to + * the computational graph components. 
+ */ +internal open class ComputationalGraphFactory { + open fun createForPublicPartitions( + contributionSampler: ContributionSampler, + combiner: CompoundCombiner, + extractors: DataExtractors, + encodersFactory: EncoderFactory, + publicPartitions: FrameworkCollection, + partitionsBalance: PartitionsBalance, + ) = + PublicPartitionsComputationalGraph( + contributionSampler, + publicPartitions, + partitionsBalance, + combiner, + extractors, + encodersFactory, + ) + + open fun createForPrivatePartitions( + contributionSampler: ContributionSampler, + preAggregationPartitionSelector: PreAggregationPartitionSelector?, + combiner: CompoundCombiner, + extractors: DataExtractors, + encodersFactory: EncoderFactory, + ) = + PrivatePartitionsComputationalGraph( + contributionSampler, + preAggregationPartitionSelector, + combiner, + extractors, + encodersFactory, + ) + + open fun createForSelectPartitions( + contributionSampler: ContributionSampler, + partitionSelector: PreAggregationPartitionSelector, + extractors: DataExtractors, + encodersFactory: EncoderFactory, + ) = + SelectPartitionsComputationalGraph( + contributionSampler, + partitionSelector, + extractors, + encodersFactory, + ) +} + +/** An implementation of a [ComputationalGraph] that computes metrics with public partitions. 
*/ +internal class PublicPartitionsComputationalGraph +internal constructor( + private val contributionSampler: ContributionSampler, + private val publicPartitions: FrameworkCollection, + private val partitionsBalance: PartitionsBalance, + private val combiner: CompoundCombiner, + private val extractors: DataExtractors, + private val encodersFactory: EncoderFactory, +) : ComputationalGraph { + + override fun aggregate( + collection: FrameworkCollection + ): FrameworkTable { + val extractedCol: FrameworkCollection> = + collection.map( + "ExtractContributions", + encoderOfContributionWithPrivacyId( + extractors.privacyIdEncoder, + extractors.partitionKeyEncoder, + encodersFactory, + ), + extractors.contributionExtractor, + ) + val filteredCol = + extractedCol.dropNonPublicPartitions( + publicPartitions, + extractors.partitionKeyEncoder, + partitionsBalance, + ) + val boundedCollection: FrameworkTable = + contributionSampler.sampleContributions(filteredCol) + // We cannot refer to the combiner of the PublicPartitionsComputationalGraph from the DoFn + // below because it breaks serialization. 
+ val combinerCopy = combiner + val accumulatorsPerPrivacyIdContributions: FrameworkTable = + boundedCollection.mapValues( + "CreateAccumulatorFromPrivacyIdContributions", + encodersFactory.protos(CompoundAccumulator::class), + { _, privacyIdContributions -> combinerCopy.createAccumulator(privacyIdContributions) }, + ) + val collectionWithPublicPartitions: FrameworkTable = + accumulatorsPerPrivacyIdContributions.insertPublicPartitions( + publicPartitions, + combinerCopy, + extractors.partitionKeyEncoder, + encodersFactory, + ) + val perPartitionAccumulators: FrameworkTable = + collectionWithPublicPartitions.groupAndCombineValues( + "CombinePerPartitionKey", + combinerCopy::mergeAccumulators, + ) + val dpAggregates: FrameworkTable = + perPartitionAccumulators.mapValues( + "ComputeDpAggregates", + encodersFactory.protos(DpAggregates::class), + ) { _, acc: CompoundAccumulator -> + combinerCopy.computeMetrics(acc) + } + return dpAggregates + } +} + +/** + * An implementation of a [ComputationalGraph] that computes metrics with private partition + * selection. + */ +internal class PrivatePartitionsComputationalGraph +internal constructor( + private val contributionSampler: ContributionSampler, + private val preAggregationPartitionSelector: PreAggregationPartitionSelector?, + private val combiner: CompoundCombiner, + private val extractors: DataExtractors, + private val encodersFactory: EncoderFactory, +) : ComputationalGraph { + + init { + val hasPostAggregationCombiner = combiner.hasPostAggregationCombiner() + val hasPreAggregationCombiner = preAggregationPartitionSelector != null + require(hasPreAggregationCombiner != hasPostAggregationCombiner) { + "Computational graph must have either PreAggregationPartitionSelector or " + + "PostAggregationPartitionSelectionCombiner." 
+ } + } + + override fun aggregate( + collection: FrameworkCollection + ): FrameworkTable { + val extractedCol: FrameworkCollection> = + collection.map( + "ExtractContributions", + encoderOfContributionWithPrivacyId( + extractors.privacyIdEncoder, + extractors.partitionKeyEncoder, + encodersFactory, + ), + extractors.contributionExtractor, + ) + val boundedCollection: FrameworkTable = + contributionSampler.sampleContributions(extractedCol) + // We cannot refer to the combiner of the PrivatePartitionsComputationalGraph from the DoFn + // below because it breaks serialization. + val combinerCopy = combiner + val accumulatorsPerPrivacyIdContributions: FrameworkTable = + boundedCollection.mapValues( + "CreateAccumulatorFromPrivacyIdContributions", + encodersFactory.protos(CompoundAccumulator::class), + { _, privacyIdContributions -> combinerCopy.createAccumulator(privacyIdContributions) }, + ) + var perPartitionAccumulators: FrameworkTable = + accumulatorsPerPrivacyIdContributions.groupAndCombineValues( + "CombinePerPartitionKey", + combinerCopy::mergeAccumulators, + ) + + // Fields of this can not be referred from DoFn below because it breaks serialization. + if (preAggregationPartitionSelector != null) { + val partitionSelectorCopy = preAggregationPartitionSelector + perPartitionAccumulators = + perPartitionAccumulators.filterValues( + "ApplyPreAggregationPartitionSelector", + { v -> partitionSelectorCopy.shouldKeep(v.privacyIdCountAccumulator.count) }, + ) + } + + val dpAggregates: FrameworkTable = + perPartitionAccumulators.mapValues( + "ComputeDpAggregates", + encodersFactory.protos(DpAggregates::class), + ) { _, acc: CompoundAccumulator -> + combinerCopy.computeMetrics(acc) + } + if (combiner.hasPostAggregationCombiner()) { + return dpAggregates.filterValues( + "PostAggregationPartitionSelection", + { it.privacyIdCount != 0.0 }, + ) + } + return dpAggregates + } +} + +/** An implementation of a [ComputationalGraph] for selecting partitions. 
*/ +internal class SelectPartitionsComputationalGraph( + private val contributionSampler: ContributionSampler, + private val partitionSelector: PreAggregationPartitionSelector, + private val extractors: DataExtractors, + private val encodersFactory: EncoderFactory, +) { + + fun selectPartitions(collection: FrameworkCollection): FrameworkCollection { + val extractedCol: FrameworkCollection> = + collection.map( + "ExtractContributions", + encoderOfContributionWithPrivacyId( + extractors.privacyIdEncoder, + extractors.partitionKeyEncoder, + encodersFactory, + ), + extractors.contributionExtractor, + ) + val boundedCollection: FrameworkTable = + contributionSampler.sampleContributions(extractedCol) + val combiner = ExactPrivacyIdCountCombiner() + val accumulatorsPerPrivacyIdContributions: + FrameworkTable = + boundedCollection.mapValues( + "CreateAccumulatorFromPrivacyIdContributions", + encodersFactory.protos(PrivacyIdCountAccumulator::class), + { _, privacyIdContributions -> combiner.createAccumulator(privacyIdContributions) }, + ) + val perPartitionAccumulators: FrameworkTable = + accumulatorsPerPrivacyIdContributions.groupAndCombineValues( + "CombinePerPartitionKey", + combiner::mergeAccumulators, + ) + + // We cannot refer to the this.partitionSelector from the DoFn because it breaks serialization. 
+ val partitionSelectorCopy = partitionSelector + + return perPartitionAccumulators + .filterValues( + "ApplyPreAggregationPartitionSelector", + { partitionSelectorCopy.shouldKeep(it.count) }, + ) + .keys("KeepPartitionKeys") + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/ContributionSampler.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/ContributionSampler.kt new file mode 100644 index 00000000..2d8255d8 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/ContributionSampler.kt @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions + +/** Bounds contributions to the entire non-aggregated data collection. */ +sealed interface ContributionSampler { + /** + * Samples the contributions of the privacy IDs. + * + * Returns a [FrameworkTable] where each entry contains a [PartitionKey] and the contributions of + * a privacy ID to that [PartitionKey] after sampling. For each privacy ID contributing to a given + * [PartitionKey], all its contributions are grouped inside the same entry. 
/**
 * Keeps at most [N] of the given [elements].
 *
 * When [elements] already holds no more than [N] items it is returned as is (same instance);
 * otherwise the first [N] items of a randomly shuffled copy are returned.
 */
private fun <T> sampleNElements(elements: Collection<T>, N: Int): Collection<T> =
  when {
    elements.size <= N -> elements
    else -> elements.shuffled().take(N)
  }
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +/** + * Holds the contributing privacy ID of PrivacyIdT type, the contributed partition of PartitionKeyT + * type and value of Double type. + */ +typealias ContributionWithPrivacyId = + Pair, Double> + +/** Helper function to create a [ContributionWithPrivacyId] from the given arguments. */ +fun contributionWithPrivacyId( + privacyId: PrivacyIdT, + partitionKey: PartitionKeyT, + value: Double, +): ContributionWithPrivacyId = Pair(Pair(privacyId, partitionKey), value) + +/** Encoder of [ContributionWithPrivacyId]. */ +fun encoderOfContributionWithPrivacyId( + privacyIdEncoder: Encoder, + partitionKeyEncoder: Encoder, + encodersFactory: EncoderFactory, +) = + encodersFactory.tuple2sOf( + encodersFactory.tuple2sOf(privacyIdEncoder, partitionKeyEncoder), + encodersFactory.doubles(), + ) + +/** Helper function to get the privacy ID of the given [ContributionWithPrivacyId]. */ +fun ContributionWithPrivacyId + .privacyId() = first.first + +/** Helper function to get the partition key of the given [ContributionWithPrivacyId]. */ +fun ContributionWithPrivacyId + .partitionKey() = first.second + +/** Helper function to get the value of the given [ContributionWithPrivacyId]. 
*/ +fun ContributionWithPrivacyId + .value() = second diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractors.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractors.kt new file mode 100644 index 00000000..2599d7c5 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractors.kt @@ -0,0 +1,82 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import java.io.Serializable + +/** An interface of a function that is also serializable. */ +fun interface SerializableFunction : (S) -> T, Serializable {} + +/** An extractor of [ContributionWithPrivacyId] from the row of the input data being anonymized. */ +class DataExtractors +@PublishedApi +internal constructor( + val contributionExtractor: + SerializableFunction>, + val privacyIdEncoder: Encoder, + val partitionKeyEncoder: Encoder, + val hasValueExtractor: Boolean, +) { + companion object { + /** + * Constructs a [DataExtractors] that uses the provided functions to extract a + * [ContributionWithPrivacyId] from the input data row. 
+ */ + inline fun from( + crossinline privacyIdExtractor: (T) -> PrivacyIdT, + privacyIdEncoder: Encoder, + crossinline partitionKeyExtractor: (T) -> PartitionKeyT, + partitionKeyEncoder: Encoder, + crossinline valueExtractor: (T) -> Double, + ) = + DataExtractors( + { + contributionWithPrivacyId( + privacyId = privacyIdExtractor(it), + partitionKey = partitionKeyExtractor(it), + value = valueExtractor(it), + ) + }, + privacyIdEncoder = privacyIdEncoder, + partitionKeyEncoder = partitionKeyEncoder, + hasValueExtractor = true, + ) + + /** + * Constructs a [DataExtractors] that uses the provided functions to extract a privacy id and a + * partition key into [ContributionWithPrivacyId] from the input data row. + */ + inline fun from( + crossinline privacyIdExtractor: (T) -> PrivacyIdT, + privacyIdEncoder: Encoder, + crossinline partitionKeyExtractor: (T) -> PartitionKeyT, + partitionKeyEncoder: Encoder, + ) = + DataExtractors( + { + contributionWithPrivacyId( + privacyId = privacyIdExtractor(it), + partitionKey = partitionKeyExtractor(it), + value = .0, + ) + }, + privacyIdEncoder = privacyIdEncoder, + partitionKeyEncoder = partitionKeyEncoder, + hasValueExtractor = false, + ) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngine.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngine.kt new file mode 100644 index 00000000..02b7c39e --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngine.kt @@ -0,0 +1,477 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.QUANTILES +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.DATASET_LEVEL +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITH_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.GAUSSIAN_NOISE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.LAPLACE_NOISE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.POSTAGGREGATED_PARTITION_SELECTION +import 
com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.PREAGGREGATED_PARTITION_SELECTION +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountant +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountantFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountingStrategy +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountingStrategy.NAIVE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetRequest +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.RelativeBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.TotalBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.PreAggregationPartitionSelectionFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates + +/** + * An engine that computes and returns the DP metrics. + * + * In order to request computation of DP metrics, call [aggregate]. You can call it multiple times + * on the same or different input data collections. After calling [aggregate] once or multiple + * times, call [done]. Once [done] has been called, the [DpEngine] instance cannot be used. + * + * The result of [aggregate] cannot be used immediately. Internally, [aggregate] builds the + * computational graph that implements the anonymization logic but does not immediately know the + * values of the privacy budget parameters (epsilon and delta) that should be used to compute the + * metrics. 
The actual budget values get allocated when [done] is called. After calling [done], you + * should be able to access the results of all [aggregate] calls on a given instance of [DpEngine] + * unless the computational framework that you are using introduces additional constraints. + * + * The "lazy" privacy budget evaluation logic described above introduces limitations on the + * framework that can be used to perform computations as it should allow for "lazy" evaluation of + * the content of the [aggregate] result. For example, to address this limitation when running + * [aggregate] locally, we use Kotlin [Sequence] as its elements don't get computed until they are + * accessed. + */ +open class DpEngine +internal constructor( + private val encoderFactory: EncoderFactory, + private val budgetAccountant: BudgetAccountant, + private val defaultNoiseFactory: (NoiseKind) -> Noise = NoiseFactory(), + private val computationalGraphFactory: ComputationalGraphFactory = ComputationalGraphFactory(), +) { + companion object Factory { + fun create(encoderFactory: EncoderFactory, budgetSpec: DpEngineBudgetSpec) = + DpEngine( + encoderFactory, + BudgetAccountantFactory.forStrategy(budgetSpec.accountingStrategy, budgetSpec.budget), + NoiseFactory(), + ComputationalGraphFactory(), + ) + } + + private var doneCalled = false + + /** + * Creates a pipeline that computes the DP metrics defined in [aggregationParams]. The metrics are + * computed on [collection]. [dataExtractors] define how the privacy ID, partition key and the + * aggregated value should be extracted from each data element in the [collection]. If + * [publicPartitions] aren't provided, private partition selection is performed. + * + * The actual computation of the DP metrics cannot happen until [done] is called. [done] triggers + * the privacy budget allocation. Hence, any access to the result of this method before [done] has + * been called will throw an exception. 
+ */ + open fun aggregate( + collection: FrameworkCollection, + aggregationParams: AggregationParams, + dataExtractors: DataExtractors, + publicPartitions: FrameworkCollection? = null, + ): FrameworkTable { + throwIfDoneWasCalled() + // Ensure that public partitions are unique, it is important for correctness. + val uniquePublicPartitions = publicPartitions?.distinct("MakeSuppliedPartitionsUnique") + validateAggregationParams( + aggregationParams, + uniquePublicPartitions != null, + dataExtractors.hasValueExtractor, + ) + // Beyond this point we can assume aggregation parameters are correct. + val noiseFactory = + if (aggregationParams.privacyLevel.privacyDisabled) { + ZeroNoiseFactory() + } else { + defaultNoiseFactory + } + val compoundCombiner = + createCompoundCombiner(aggregationParams, uniquePublicPartitions != null, noiseFactory) + val contributionSampler = + createContributionsSampler( + aggregationParams, + compoundCombiner, + dataExtractors.privacyIdEncoder, + dataExtractors.partitionKeyEncoder, + ) + + val graph: ComputationalGraph = + if (uniquePublicPartitions == null) { + val partitionSelector = createPartitionSelectorIfPreaggregationIsUsed(aggregationParams) + computationalGraphFactory.createForPrivatePartitions( + contributionSampler, + partitionSelector, + compoundCombiner, + dataExtractors, + encoderFactory, + ) + } else { + computationalGraphFactory.createForPublicPartitions( + contributionSampler, + compoundCombiner, + dataExtractors, + encoderFactory, + uniquePublicPartitions, + aggregationParams.partitionsBalance, + ) + } + return graph.aggregate(collection) + } + + /** + * Creates a pipeline that computes partition keys from [collection] in a differentially-private + * manner. [dataExtractors] define how the privacy ID and partition key should be extracted from + * each data element in the [collection]. + * + * The actual computation of the DP metrics cannot happen until [done] is called. [done] triggers + * the privacy budget allocation. 
Hence, any access to the result of this method before [done] has + * been called will throw an exception. + */ + open fun selectPartitions( + collection: FrameworkCollection, + params: SelectPartitionsParams, + dataExtractors: DataExtractors, + ): FrameworkCollection { + throwIfDoneWasCalled() + validateSelectPartitionsParams(params) + + val (contributionSampler, partitionSelector) = + when (params.privacyLevel) { + NONE_WITHOUT_CONTRIBUTION_BOUNDING -> + NoPrivacySampler( + dataExtractors.privacyIdEncoder, + dataExtractors.partitionKeyEncoder, + encoderFactory, + ) to NoPrivacyPartitionSelector() + NONE_WITH_CONTRIBUTION_BOUNDING -> + PartitionSampler( + params.maxPartitionsContributed, + dataExtractors.privacyIdEncoder, + dataExtractors.partitionKeyEncoder, + encoderFactory, + ) to NoPrivacyPartitionSelector() + DATASET_LEVEL -> + PartitionSamplerWithoutValues( + params.maxPartitionsContributed, + dataExtractors.privacyIdEncoder, + dataExtractors.partitionKeyEncoder, + encoderFactory, + ) to createPartitionSelector(params) + } + + val graph: SelectPartitionsComputationalGraph = + computationalGraphFactory.createForSelectPartitions( + contributionSampler, + partitionSelector, + dataExtractors, + encoderFactory, + ) + return graph.selectPartitions(collection) + } + + /** + * Allocates privacy budgets to the DP operations requested by calling [aggregate] and + * [selectPartitions]. Must be called exactly once; later calls throw [IllegalStateException]. + */ + fun done() { + throwIfDoneWasCalled() + doneCalled = true + budgetAccountant.allocateBudgets() + } + + private fun throwIfDoneWasCalled() { + if (doneCalled) { + throw IllegalStateException( + "done() has already been called on this instance. The instance cannot be used anymore."
+ ) + } + } + + private fun createContributionsSampler( + params: AggregationParams, + combiner: CompoundCombiner, + privacyIdEncoder: Encoder, + partitionKeyEncoder: Encoder, + ): ContributionSampler { + val privacyLevel = params.privacyLevel + return if ( + privacyLevel.withPartitionsContributedBounding && + privacyLevel.withContributionsPerPartitionBounding + ) { + if (combiner.requiresPerPartitionBoundedInput) { + PartitionAndPerPartitionSampler( + params.maxPartitionsContributed!!, + params.maxContributionsPerPartition!!, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) + } else { + PartitionSampler( + params.maxPartitionsContributed!!, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) + } + } else if (privacyLevel.withPartitionsContributedBounding) { + // && !withContributionsPerPartitionBounding + PartitionSampler( + params.maxPartitionsContributed!!, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) + } else if (privacyLevel.withContributionsPerPartitionBounding) { + // && !withPartitionsContributedBounding + if (combiner.requiresPerPartitionBoundedInput) { + PerPartitionContributionsSampler( + params.maxContributionsPerPartition!!, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) + } else { + NoPrivacySampler(privacyIdEncoder, partitionKeyEncoder, encoderFactory) + } + } else { + // !withPartitionsContributedBounding && !withContributionsPerPartitionBounding + NoPrivacySampler(privacyIdEncoder, partitionKeyEncoder, encoderFactory) + } + } + + private fun createCompoundCombiner( + params: AggregationParams, + usePublicPartitions: Boolean, + noiseFactory: (NoiseKind) -> Noise, + ): CompoundCombiner { + val meanInMetrics = params.metrics.any { it.type == MEAN } + val metricCombiners = + params.metrics + .mapNotNull { metric -> + when (metric.type) { + PRIVACY_ID_COUNT -> { + if (usePostAggregationPartitionSelection(params, usePublicPartitions)) { + PostAggregationPartitionSelectionCombiner( + 
params, + getBudgetForMetric(metric, params), + getBudgetForPostAggregationPartitionSelection(params.partitionSelectionBudget), + noiseFactory, + ) + } else { + PrivacyIdCountCombiner(params, getBudgetForMetric(metric, params), noiseFactory) + } + } + COUNT -> { + if (!meanInMetrics) { + CountCombiner(params, getBudgetForMetric(metric, params), noiseFactory) + } else { + null + } + } + SUM -> { + if (!meanInMetrics) { + SumCombiner(params, getBudgetForMetric(metric, params), noiseFactory) + } else { + null + } + } + MEAN -> { + val (countBudget, sumBudget) = calculateCountSumBudgetsForMean(params) + MeanCombiner(params, countBudget, sumBudget, noiseFactory) + } + VARIANCE -> { + val (countBudget, sumBudget, sumSquaresBudget) = calculateBudgetsForVariance(params) + VarianceCombiner(params, countBudget, sumBudget, sumSquaresBudget, noiseFactory) + } + + is QUANTILES -> { + QuantilesCombiner( + (metric.type as QUANTILES).ranks, + params, + getBudgetForMetric(metric, params), + noiseFactory, + ) + } + } + } + .toMutableList() + if (!usePublicPartitions && !params.metrics.any { it.type == PRIVACY_ID_COUNT }) { + // For private partitions, we need to compute the privacy ID count, even if PRIVACY_ID_COUNT + // is not requested in metrics. + metricCombiners.add(ExactPrivacyIdCountCombiner()) + } + return CompoundCombiner(metricCombiners.toList()) + } + + private fun createPartitionSelectorIfPreaggregationIsUsed( + params: AggregationParams + ): PreAggregationPartitionSelector? { + if (params.privacyLevel.privacyDisabled) { + return NoPrivacyPartitionSelector() + } + if (usePostAggregationPartitionSelection(params, usePublicPartitions = false)) return null + val budget = getBudgetForPreAggregationPartitionSelection(params.partitionSelectionBudget) + // If maxPartitionsContributed unset for partition selection, NullPointerException is expected. 
+ return DpLibPreAggregationPartitionSelector( + params.maxPartitionsContributed!!, + params.preThreshold, + budget, + PreAggregationPartitionSelectionFactory(), + ) + } + + private fun createPartitionSelector( + params: SelectPartitionsParams + ): PreAggregationPartitionSelector { + val budget = getBudgetForPreAggregationPartitionSelection(params.budget) + // SelectPartitionsParams.maxPartitionsContributed is non-nullable, so it is used directly. + // TODO: Support MaxContributions contribution bounding parameter. + return DpLibPreAggregationPartitionSelector( + params.maxPartitionsContributed, + params.preThreshold, + budget, + PreAggregationPartitionSelectionFactory(), + ) + } + + private fun getBudgetForMetric( + metric: MetricDefinition, + params: AggregationParams, + ): AllocatedBudget { + val budgetSpec = metric.budgetSpec ?: RelativeBudgetPerOpSpec(weight = 1.0) + return budgetAccountant.requestBudget( + BudgetRequest(budgetSpec, getNoiseAccountedMechanism(params.noiseKind)) + ) + } + + private fun getBudgetForPreAggregationPartitionSelection( + partitionSelectionBudget: AbsoluteBudgetPerOpSpec? + ): AllocatedBudget { + val budgetSpec = partitionSelectionBudget ?: RelativeBudgetPerOpSpec(weight = 1.0) + return budgetAccountant.requestBudget( + BudgetRequest(budgetSpec, PREAGGREGATED_PARTITION_SELECTION) + ) + } + + private fun getBudgetForPostAggregationPartitionSelection( + partitionSelectionBudget: AbsoluteBudgetPerOpSpec? + ): AllocatedBudget { + val budgetSpec = partitionSelectionBudget ?: RelativeBudgetPerOpSpec(weight = 1.0) + return budgetAccountant.requestBudget( + BudgetRequest(budgetSpec, POSTAGGREGATED_PARTITION_SELECTION) + ) + } + + private fun calculateCountSumBudgetsForMean( + params: AggregationParams + ): Pair { + fun getMetricDefinition(metricType: MetricType) = params.metrics.find { it.type == metricType } + + // meanDefinition is not null, because this function is called only when MEAN is in metrics.
+ val meanDefinition = getMetricDefinition(MEAN)!! + + // Budget spec for COUNT. + val countBudgetSpec: BudgetPerOpSpec = + if (meanDefinition.budgetSpec != null) { + // It is 50% of MEAN spec, if MEAN spec is provided. + meanDefinition.budgetSpec!!.times(0.5) + } else { + // Or COUNT spec or the default budget spec. + getMetricDefinition(COUNT)?.budgetSpec ?: RelativeBudgetPerOpSpec(weight = 1.0) + } + + // Budget spec for SUM. + val sumBudgetSpec: BudgetPerOpSpec = + if (meanDefinition.budgetSpec != null) { + // It is 50% of MEAN spec, if MEAN spec is provided. + meanDefinition.budgetSpec!!.times(0.5) + } else { + // Or SUM spec or the default budget spec. + getMetricDefinition(SUM)?.budgetSpec ?: RelativeBudgetPerOpSpec(weight = 1.0) + } + + return budgetAccountant.requestBudget( + BudgetRequest(countBudgetSpec, getNoiseAccountedMechanism(params.noiseKind)) + ) to + budgetAccountant.requestBudget( + BudgetRequest(sumBudgetSpec, getNoiseAccountedMechanism(params.noiseKind)) + ) + } + + private fun calculateBudgetsForVariance( + params: AggregationParams + ): Triple { + // Variance is not null because this function is called only when it is in metrics. + val varianceDefinition = params.metrics.find { it.type == VARIANCE }!! + // Budget is split equally between COUNT, SUM and SUM_SQUARES. + val budgetSplit = 1.0 / 3.0 + // If varianceDefinition.budgetSpec is null, the default budget spec is used. 
+ val defaultBudgetSpec = RelativeBudgetPerOpSpec(weight = 1.0) + val noiseAccountedMechanism = getNoiseAccountedMechanism(params.noiseKind) + + val budgetSpec = varianceDefinition.budgetSpec?.times(budgetSplit) ?: defaultBudgetSpec + val budgetRequest = BudgetRequest(budgetSpec, noiseAccountedMechanism) + + return Triple( + budgetAccountant.requestBudget(budgetRequest), // COUNT + budgetAccountant.requestBudget(budgetRequest), // SUM + budgetAccountant.requestBudget(budgetRequest), // SUM_SQUARES + ) + } + + private fun getNoiseAccountedMechanism(noiseKind: NoiseKind) = + when (noiseKind) { + LAPLACE -> LAPLACE_NOISE + GAUSSIAN -> GAUSSIAN_NOISE + } +} + +/** + * The total amount of budget that can be consumed by the partition selection and aggregations + * computed by the [DpEngine] instance. + */ +data class DpEngineBudgetSpec( + val budget: TotalBudget, + val accountingStrategy: BudgetAccountingStrategy = NAIVE, +) + +private fun usePostAggregationPartitionSelection( + params: AggregationParams, + usePublicPartitions: Boolean, +): Boolean = + !usePublicPartitions && + params.metrics.any { it.type == PRIVACY_ID_COUNT } && + params.privacyLevel == DATASET_LEVEL diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParams.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParams.kt new file mode 100644 index 00000000..9a9a6696 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParams.kt @@ -0,0 +1,437 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.errorprone.annotations.Immutable +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.QUANTILES +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.DATASET_LEVEL +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetPerOpSpec +import java.io.Serializable +import kotlin.reflect.KClass + +// Constant to limit the number of contributions per privacy unit to avoid OOM and getting stuck +// on privacy units with too many contributions. Usually such privacy units are not actual +// privacy units, but rather a set of privacy units, e.g. all users w/o privacy id. +// Now it is implemented only for DpEngine.selectPartitions(). +// TODO: Implement this for DpEngine.aggregate(). +const val MAX_PROCESSED_CONTRIBUTIONS_PER_PRIVACY_ID: Int = 100_000_000 + +/** Contains shared parameters for validation.
*/ +sealed interface Params { + /** The privacy level that determines the kind of bounding. */ + val privacyLevel: PrivacyLevel + /** The maximum number of partitions that can be contributed by a privacy unit. */ + val maxPartitionsContributed: Int? + /** + * The pre-threshold to use for partition selection. + * + * Pre-threshold is the minimum number of unique contributors (privacy units) a partition must + * have. Partitions with fewer contributors will be dropped. If set to 1, no pre-thresholding is + * applied. + */ + val preThreshold: Int +} + +/** + * The parameters of the metrics being anonymized: the metric types, the contribution bounds, etc. + * This data-class contains a "bag" of all possible parameters that can be used for any combination + * of the metrics being computed. + */ +@Immutable +data class AggregationParams( + /** The metrics being anonymized. */ + val metrics: ImmutableList, + val noiseKind: NoiseKind, + /** + * The maximum number of partitions that can be contributed by a privacy unit. Used by all + * metrics. Note this is mutually exclusive with maxContributions. + */ + override val maxPartitionsContributed: Int? = null, + /** + * The maximum number of times a privacy unit can contribute to a partition. Used for COUNT, MEAN, + * VARIANCE and QUANTILES. Note this is mutually exclusive with maxContributions. + */ + val maxContributionsPerPartition: Int? = null, + /** + * The maximum number of times a privacy unit can contribute to a dataset. Used by all metrics. + * Mutually exclusive with maxPartitionsContributed and maxContributionsPerPartition. + */ + val maxContributions: Int? = null, + /** + * The minimum bound on the individual value that can be contributed by a user to a partition. + * Used for MEAN, VARIANCE and QUANTILES. + */ + val minValue: Double? = null, + /** + * The maximum bound on the individual value that can be contributed by a user to a partition. + * Used for MEAN, VARIANCE and QUANTILES. + */ + val maxValue: Double?
= null, + /** + * The minimum bound on the sum of the values that can be contributed by a user to a partition. + * Used for SUM. + */ + val minTotalValue: Double? = null, + /** + * The maximum bound on the sum of the values that can be contributed by a user to a partition. + * Used for SUM. + */ + val maxTotalValue: Double? = null, + /** + * The amount of budget used for partition selection. + * + * If [partitionSelectionBudget] is null, [RelativeBudgetPerOpSpec] with weight = 1 is used, i.e. + * the budget is split evenly among all DP operations (metrics and partition selection). + */ + val partitionSelectionBudget: AbsoluteBudgetPerOpSpec? = null, + /** The pre-threshold to use for partition selection. */ + override val preThreshold: Int = 1, + /** The privacy level that determines the kind of contribution bounding in aggregations. */ + override val privacyLevel: PrivacyLevel = DATASET_LEVEL, + /** + * The balance of partitions. + * + * Optional parameter that influences only public partitions processing and will be used as a hint + * for the execution to make it more optimized. + * + * Processing unbalanced partitions might lead to not enough parallelisation and long processing + * time. If this happens for public partitions processing, set to [PartitionsBalance.UNBALANCED] + * so that special processing for better parallelisation is performed. See [PartitionsBalance] + * for definition of balanced/unbalanced partitions. + */ + val partitionsBalance: PartitionsBalance = PartitionsBalance.UNKNOWN, +) : Params, Serializable + +/** + * Validates [AggregationParams]. + * + * @param usePublicPartitions indicates whether [DpEngine.aggregate] was called with public + * partitions. + * @param hasValueExtractor indicates whether [DpEngine.aggregate] was called with a DataExtractor + * which contains a value extractor.
+ */ +fun validateAggregationParams( + params: AggregationParams, + usePublicPartitions: Boolean, + hasValueExtractor: Boolean, +) { + // Validate params shared between AggregationParams and SelectPartitionsParams. + validateBaseParams(params) + + // Privacy level and maxPartitionsContributed are in sync. + if (params.privacyLevel.withPartitionsContributedBounding) { + require(params.maxPartitionsContributed != null || params.maxContributions != null) { + "maxPartitionsContributed or maxContributions must be set because specified ${params.privacyLevel} privacy level requires cross partition bounding." + } + } + + // Metrics. + require(!params.metrics.isEmpty()) { "metrics must not be empty." } + require(params.metrics.map { it.type }.distinct().size == params.metrics.size) { + "metrics must not contain duplicate metric types. Provided ${params.metrics.map { it.type }}." + } + // Max contributions per partition. + require(isGreaterThanZeroIfSet(params.maxContributionsPerPartition)) { + "maxContributionsPerPartition must be positive. Provided value: " + + "${params.maxContributionsPerPartition}." + } + if (params.privacyLevel.withContributionsPerPartitionBounding) { + require( + params.maxContributionsPerPartition != null || + params.maxContributions != null || + (params.minTotalValue != null && params.maxTotalValue != null) + ) { + "maxContributionsPerPartition or maxContributions or (minTotalValue, maxTotalValue) must be set because specified ${params.privacyLevel} privacy level requires per partition bounding." + } + } + // Max contributions. + require(isGreaterThanZeroIfSet(params.maxContributions)) { + "maxContributions must be positive. Provided value: " + "${params.maxContributions}." + } + // Mutually exclusive partition bounds + require(params.maxContributions == null || params.maxPartitionsContributed == null) { + "maxContributions and maxPartitionsContributed are mutually exclusive. 
" + + "Provided values: maxContributions=${params.maxContributions}, " + + "maxPartitionsContributed=${params.maxPartitionsContributed}." + } + require(params.maxContributions == null || params.maxContributionsPerPartition == null) { + "maxContributions and maxContributionsPerPartition are mutually exclusive. " + + "Provided values: maxContributions=${params.maxContributions}, " + + "maxContributionsPerPartition=${params.maxContributionsPerPartition}." + } + // Min/Max bounds + require(sameNullability(params.minValue, params.maxValue)) { + "minValue and maxValue must be simultaneously equal or not equal to null. Provided values: " + + "minValue=${params.minValue}, maxValue=${params.maxValue}." + } + var areMinMaxValuesSet = false + if (params.minValue != null && params.maxValue != null) { + areMinMaxValuesSet = true + require(params.minValue < params.maxValue) { + "minValue must be less than maxValue. Provided values: " + + "minValue=${params.minValue}, maxValue=${params.maxValue}." + } + } + require(sameNullability(params.minTotalValue, params.maxTotalValue)) { + "minTotalValue and maxTotalValue must be simultaneously equal or not equal to null. " + + "Provided values: minTotalValue=${params.minTotalValue}, " + + "maxTotalValue=${params.maxTotalValue}." + } + var areMinMaxTotalValuesSet = false + if (params.minTotalValue != null && params.maxTotalValue != null) { + areMinMaxTotalValuesSet = true + require(params.minTotalValue <= params.maxTotalValue) { + "minTotalValue must be less or equal to maxTotalValue. Provided values: " + + "minTotalValue=${params.minTotalValue}, maxTotalValue=${params.maxTotalValue}." + } + } + + // Required parameters per each metric. + if (metricIsRequested(COUNT::class, params)) { + require(params.maxContributionsPerPartition != null || params.maxContributions != null) { + "maxContributionsPerPartition or maxContributions must be set for COUNT metric." 
+ } + } + // When MEAN and SUM are set together, then contribution bounding with (minValue, maxValue) + // is used. SUM and VARIANCE should not be set together. + if ( + metricIsRequested(SUM::class, params) && + !metricIsRequested(MEAN::class, params) && + !metricIsRequested(VARIANCE::class, params) + ) { + require(areMinMaxTotalValuesSet) { + "(minTotalValue, maxTotalValue) must be set for SUM metrics." + } + } + + if (metricIsRequested(MEAN::class, params)) { + require(params.maxContributionsPerPartition != null || params.maxContributions != null) { + "maxContributionsPerPartition or maxContributions must be set for MEAN metric." + } + require(areMinMaxValuesSet) { "(minValue, maxValue) must be set for MEAN metric." } + require(!areMinMaxTotalValuesSet) { + "(minTotalValue, maxTotalValue) should not be set if MEAN metric is requested." + } + } + require( + params.metrics.find { it.type == COUNT }?.budgetSpec == null || + params.metrics.find { it.type == MEAN }?.budgetSpec == null + ) { + "BudgetPerOpSpec can not be set for both COUNT and MEAN metrics." + } + require( + params.metrics.find { it.type == SUM }?.budgetSpec == null || + params.metrics.find { it.type == MEAN }?.budgetSpec === null + ) { + "BudgetPerOpSpec can not be set for both SUM and MEAN metrics." + } + require( + params.metrics.find { it.type == MEAN }?.budgetSpec == null || + params.metrics.find { it.type == VARIANCE }?.budgetSpec == null + ) { + "BudgetPerOpSpec can not be set for both MEAN and VARIANCE metrics." + } + // Validation for VARIANCE metric. + if (metricIsRequested(VARIANCE::class, params)) { + require(params.maxContributionsPerPartition != null || params.maxContributions != null) { + "maxContributionsPerPartition or maxContributions must be set for VARIANCE metric." + } + require(areMinMaxValuesSet) { "(minValue, maxValue) must be set for VARIANCE metric." } + require(!areMinMaxTotalValuesSet) { + "(minTotalValue, maxTotalValue) should not be set if VARIANCE metric is requested." 
+ } + } + require( + params.metrics.find { it.type == SUM }?.budgetSpec == null || + params.metrics.find { it.type == VARIANCE }?.budgetSpec == null + ) { + "BudgetPerOpSpec can not be set for both SUM and VARIANCE metrics." + } + + require( + params.metrics.find { it.type == COUNT }?.budgetSpec == null || + params.metrics.find { it.type == VARIANCE }?.budgetSpec == null + ) { + "BudgetPerOpSpec can not be set for both COUNT and VARIANCE metrics." + } + // Validation for QUANTILES metric. + if (metricIsRequested(QUANTILES::class, params)) { + require(params.maxContributionsPerPartition != null) { + "maxContributionsPerPartition must be set for QUANTILES metric." + } + require(areMinMaxValuesSet) { "(minValue, maxValue) must be set for QUANTILES metric." } + } + + // Partition selection + if (usePublicPartitions) { + require(params.partitionSelectionBudget == null) { + "partitionSelectionBudget can not be set for public partitions." + } + } + + // ValueExtractor: only COUNT and PRIVACY_ID_COUNT can be computed w/o a value extractor. + if (!hasValueExtractor) { + val metricsWhichRequireValueExtractor = + params.metrics.map { it.type }.filter { it != COUNT && it != PRIVACY_ID_COUNT } + require(metricsWhichRequireValueExtractor.isEmpty()) { + "Metrics $metricsWhichRequireValueExtractor require a value extractor." + } + } +} + +/** The parameters of [DpEngine.selectPartitions]. */ +@Immutable +data class SelectPartitionsParams( + /** The maximum number of partitions that can be contributed by a privacy unit. */ + override val maxPartitionsContributed: Int, + /** + * The amount of budget that should be used for partition selection. + * + * If [budget] is null, [RelativeBudgetPerOpSpec] with weight = 1 is used, i.e. + * the budget is split evenly among all DP operations (metrics and partition selection). + */ + val budget: AbsoluteBudgetPerOpSpec? = null, + /** The pre-threshold to use for partition selection.
*/ + override val preThreshold: Int = 1, + /** The privacy level that determines the kind of contribution bounding in partition selection. */ + override val privacyLevel: PrivacyLevel = DATASET_LEVEL, +) : Params, Serializable + +/** Validates [SelectPartitionsParams]. */ +fun validateSelectPartitionsParams(params: SelectPartitionsParams) { + // Validate params shared between AggregationParams and SelectPartitionsParams. + validateBaseParams(params) +} + +/** + * The balance of the partitions in the input dataset. + * + * Partitions are balanced if no single partition contributes more than 1% of the data. Otherwise, + * the partitions are unbalanced. + */ +enum class PartitionsBalance { + /** Use if you don't know the answer. */ + UNKNOWN, + /** Use if you know that the partitions are balanced. */ + BALANCED, + /** Use if you know that the partitions are unbalanced. */ + UNBALANCED, +} + +/** The type of privacy level that determines the kind of contribution bounding. */ +enum class PrivacyLevel( + val privacyDisabled: Boolean, + val withPartitionsContributedBounding: Boolean, + val withContributionsPerPartitionBounding: Boolean, +) { + DATASET_LEVEL( + privacyDisabled = false, + withPartitionsContributedBounding = true, + withContributionsPerPartitionBounding = true, + ), // Enables contribution bounding across dataset. + NONE_WITHOUT_CONTRIBUTION_BOUNDING( + privacyDisabled = true, + withPartitionsContributedBounding = false, + withContributionsPerPartitionBounding = false, + ), // No privacy, use only for testing and utility analysis. + NONE_WITH_CONTRIBUTION_BOUNDING( + privacyDisabled = true, + withPartitionsContributedBounding = true, + withContributionsPerPartitionBounding = true, + ), // No privacy with cross and per partition bounding, use only for testing + // and utility analysis. +} + +/** The definition of the DP metric to compute.
*/ +@Immutable +data class MetricDefinition( + val type: MetricType, + /** + * The amount of privacy budget used to anonymize this metric. + * + * If [budgetSpec] is null, [RelativeBudgetPerOpSpec] with weight = 1 is used, i.e. the budget is + * split evenly among all DP calculations (metrics and partition selection). + */ + val budgetSpec: BudgetPerOpSpec? = null, +) : Serializable + +/** The types of metrics that can be anonymized. */ +@Immutable +sealed class MetricType : Serializable { + data object PRIVACY_ID_COUNT : MetricType() + + data object COUNT : MetricType() + + data object SUM : MetricType() + + data object MEAN : MetricType() + + data class QUANTILES(val ranks: ImmutableList) : MetricType() { + init { + require(ranks.all { it in 0.0..1.0 }) { "Ranks for quantiles must be all in [0, 1]." } + } + } + + data object VARIANCE : MetricType() +} + +/** The kind of noise that can be applied to the data. */ +enum class NoiseKind { + LAPLACE, + GAUSSIAN, +} + +private fun sameNullability(a: Double?, b: Double?): Boolean { + return (a == null) == (b == null) +} + +private fun metricIsRequested(metricTypeClass: KClass, params: AggregationParams) = + params.metrics.any { metricTypeClass.isInstance(it.type) } + +private fun isGreaterThanZeroIfSet(value: Int?): Boolean = value == null || value > 0 + +private fun isLessOrEqualToIfSet(value: Int?, upperBound: Int): Boolean = + value == null || value <= upperBound + +private fun validateBaseParams(params: Params) { + // Cross partition bounds. + require(isGreaterThanZeroIfSet(params.maxPartitionsContributed)) { + "maxPartitionsContributed must be positive. Provided value: ${params.maxPartitionsContributed}." + } + + require( + isLessOrEqualToIfSet( + params.maxPartitionsContributed, + MAX_PROCESSED_CONTRIBUTIONS_PER_PRIVACY_ID, + ) + ) { + "maxPartitionsContributed must be less than ${MAX_PROCESSED_CONTRIBUTIONS_PER_PRIVACY_ID} " + + "Provided values: maxPartitionsContributed=${params.maxPartitionsContributed}." 
+ } + + // Pre-threshold. + require(params.preThreshold > 0) { + "preThreshold must be positive. Provided value: ${params.preThreshold}." + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Encoders.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Encoders.kt new file mode 100644 index 00000000..d37234a0 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/Encoders.kt @@ -0,0 +1,55 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.protobuf.Message +import kotlin.reflect.KClass + +/** + * A serializer and a deserializer for the data types processed by PipelineDP4j. + * + * An [Encoder] converts between regular Kotlin values and encoded byte-string representations + * stored in a [FrameworkCollection], which are automatically invoked by the rest of the + * PipelineDP4j system whenever it needs to convert between an in-memory Kotlin object and an + * externalizable byte-string representation. + */ +interface Encoder {} + +/** A factory for [Encoder]s */ +interface EncoderFactory { + /** Returns an [Encoder] for a [String] value, which can be stored in a [FrameworkCollection]. */ + fun strings(): Encoder + + /** Returns an [Encoder] for a double value, which can be stored in a [FrameworkCollection]. 
*/ + fun doubles(): Encoder + + /** Returns an [Encoder] for an integer value, which can be stored in a [FrameworkCollection]. */ + fun ints(): Encoder + + /** Encoder for data classes. */ + fun records(recordClass: KClass): Encoder + + /** Returns an [Encoder] for a protobuf value, which can be stored in a [FrameworkCollection]. */ + fun protos(protoClass: KClass): Encoder + + /** Returns an [Encoder] for a pair of tuples, which can be stored in a [FrameworkCollection]. */ + fun tuple2sOf(first: Encoder, second: Encoder): Encoder> +} + +inline fun EncoderFactory.records() = this.records(T::class) + +inline fun EncoderFactory.protos() = this.protos(T::class) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkCollection.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkCollection.kt new file mode 100644 index 00000000..e8a22408 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkCollection.kt @@ -0,0 +1,60 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +/** + * An abstraction for a framework-specific collection. The internal PipelineDP4j logic is + * framework-agnostic and operates on this abstraction. + */ +interface FrameworkCollection { + /** Encoder of elements type in this collection. 
*/ + val elementsEncoder: Encoder + + /** Removes duplicates from the collection, i.e. makes it a set. */ + fun distinct(stageName: String): FrameworkCollection + + /** + * Returns a [FrameworkCollection] consisting of the results of applying the [mapFn] to the + * elements of this [FrameworkCollection]. + */ + fun map(stageName: String, outputType: Encoder, mapFn: (T) -> R): FrameworkCollection + + /** + * Returns a [FrameworkTable] consisting of one key-value pair for each element in this + * [FrameworkCollection], where the value is the original element from the [FrameworkCollection], + * and the key is the result of applying the [keyFn] to that element. + */ + fun keyBy(stageName: String, outputType: Encoder, keyFn: (T) -> K): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of the results of applying [mapFn] to the elements of + * this collection. + */ + fun mapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (T) -> Pair, + ): FrameworkTable +} + +// TODO: verify for stage name collisions (especially for Beam) (add tests and ability +// to add suffixes to avoid collisions). +object StageNameUtils { + /** Appends name of the next stage to the current stage name. */ + fun String.append(nextStageName: String) = "$this/$nextStageName" +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkTable.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkTable.kt new file mode 100644 index 00000000..ab3ee2a4 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/FrameworkTable.kt @@ -0,0 +1,122 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +/** + * An abstraction for a framework-specific table. The internal PipelineDP4j logic is + * framework-agnostic and operates on this abstraction. + */ +interface FrameworkTable { + /** Encoder of keys type in this table. */ + val keysEncoder: Encoder + + /** Encoder of values type in this table. */ + val valuesEncoder: Encoder + + /** + * Returns a [FrameworkCollection] consisting of the results of applying the given [mapFn] to + * every key-value pair in this table. + */ + fun map(stageName: String, outputType: Encoder, mapFn: (K, V) -> R): FrameworkCollection + + /** + * Returns a [FrameworkTable] mapping each distinct key of this table to the result of combining + * (using the [combFn]) all the values of this table with that key. + * + * [combFn] should be a reducer, i.e. it accumulates value starting with the first element and + * applying operation from left to right to current accumulator value and each element. + */ + fun groupAndCombineValues(stageName: String, combFn: (V, V) -> V): FrameworkTable + + /** + * Returns a new table mapping each distinct key of this table to a collection of all the values + * associated with that key in this table. + */ + fun groupByKey(stageName: String): FrameworkTable> + + /** Returns a [FrameworkCollection] containing all keys in this table. */ + fun keys(stageName: String): FrameworkCollection + + /** Returns a [FrameworkCollection] containing all values in this table. 
*/ + fun values(stageName: String): FrameworkCollection + + /** + * Returns a [FrameworkTable] consisting of the results of applying the [mapValuesFn] to every + * value in this table, leaving the keys unchanged. + */ + fun mapValues( + stageName: String, + outputType: Encoder, + mapValuesFn: (K, V) -> VO, + ): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of the results of each item of the output [Sequence] + * produced by applying the [mapFn] to each key-value pair of this table. + */ + fun flatMapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (K, V) -> Sequence>, + ): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of the results of applying the [mapFn] to every key-value + * pair in this table. + */ + fun mapToTable( + stageName: String, + outputKeyType: Encoder, + outputValueType: Encoder, + mapFn: (K, V) -> Pair, + ): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of only the key-value pairs in this table for which the + * value matches the [predicate]. + */ + fun filterValues(stageName: String, predicate: (V) -> Boolean): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of only the key-value pairs in this table for which the + * key matches the predicate. + */ + fun filterKeys(stageName: String, predicate: (K) -> Boolean): FrameworkTable + + /** + * Returns a [FrameworkTable] consisting of only the key-value pairs in this table for which the + * keys are in [allowedKeys]. + * + * @param unbalancedKeys whether the number of values per keys are very different. If true, the + * implementation may use a more efficient algorithm. 
+ */ + fun filterKeys( + stageName: String, + allowedKeys: FrameworkCollection, + unbalancedKeys: Boolean = false, + ): FrameworkTable + + /** Returns a [FrameworkTable] consisting of the key-value pairs in this table and in [other] */ + fun flattenWith(stageName: String, other: FrameworkTable): FrameworkTable + + /** + * Samples values without replacement per key. The output table will contain the same keys as this + * table, each key will appear only once. The number of values per key will be at most [count]. + */ + fun samplePerKey(stageName: String, count: Int): FrameworkTable> +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySampler.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySampler.kt new file mode 100644 index 00000000..faf0dcf5 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySampler.kt @@ -0,0 +1,38 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +/** + * Performs no sampling and thus has no privacy. Use for testing and utility analysis only. + * + * Returns a map from [PartitionKey] to [PrivacyIdContributions] where [PrivacyIdContributions] is a + * representation of all contributions of a [PrivacyId] to the corresponding [PartitionKey]. 
+ */ +class NoPrivacySampler( + privacyIdEncoder: Encoder, + partitionKeyEncoder: Encoder, + encoderFactory: EncoderFactory, +) : + ContributionSampler by OnlyPerPartitionContributionSampler< + PrivacyIdT, + PartitionKeyT, + >( + maxContributionsPerPartition = Int.MAX_VALUE, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSampler.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSampler.kt new file mode 100644 index 00000000..83dce1a4 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSampler.kt @@ -0,0 +1,69 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions + +/** + * Samples partitions contributed by each [PrivacyId] and per-partition contributions. + * + * Returns a map from [PartitionKey] to [PrivacyIdContributions] where [PrivacyIdContributions] is a + * representation of all contributions of a [PrivacyId] to the corresponding [PartitionKey]. + * + * Note: this class does not perform any checks on the consistency of [AggregationParams]. We expect + * this to be done earlier in the call. 
+ * + * Note: the bounder assumes that all contributions that belong to the same privacy ID can fit in + * memory. + */ +class PartitionAndPerPartitionSampler( + private val maxPartitionsContributed: Int, + private val maxContributionsPerPartition: Int, + private val privacyIdEncoder: Encoder, + private val partitionKeyEncoder: Encoder, + private val encoderFactory: EncoderFactory, +) : ContributionSampler { + override fun sampleContributions( + data: FrameworkCollection> + ): FrameworkTable { + val inputByPid: + FrameworkTable>> = + data + .keyBy("KeyByPrivacyId", privacyIdEncoder) { it.privacyId() } + .groupByKey("GroupByPrivacyId") + // TODO: Cover with tests (i.e. test should fail if this is not copied). + // Necessary for DoFn to be serializable. + val maxPartitionsContributedCopy = maxPartitionsContributed + val maxContributionsPerPartitionCopy = maxContributionsPerPartition + return inputByPid.flatMapToTable( + "SamplePartitionsAndPerPartitionContributions", + partitionKeyEncoder, + encoderFactory.protos(PrivacyIdContributions::class), + ) { _, contributions -> + val l0BoundedData = samplePartitions(contributions, maxPartitionsContributedCopy) + val groupedByPartitionKey: + Map>> = + l0BoundedData.groupBy { it.partitionKey() } + groupedByPartitionKey + .mapValues { (_, partitionContributions) -> + sampleContributionsPerPartition(partitionContributions, maxContributionsPerPartitionCopy) + } + .map { it.toPair() } + .asSequence() + } + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSampler.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSampler.kt new file mode 100644 index 00000000..021797d4 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSampler.kt @@ -0,0 +1,110 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions + +/** + * Samples partitions contributed by each [PrivacyId]. + * + * Returns a map from [PartitionKey] to [PrivacyIdContributions] where [PrivacyIdContributions] is a + * representation of all contributions of a [PrivacyId] to the corresponding [PartitionKey]. + * + * Note: this class does not perform any checks on the consistency of [AggregationParams]. We expect + * this to be done earlier in the call. 
+ */ +class PartitionSampler( + private val maxPartitionsContributed: Int, + private val privacyIdEncoder: Encoder, + private val partitionKeyEncoder: Encoder, + private val encoderFactory: EncoderFactory, +) : ContributionSampler { + override fun sampleContributions( + data: FrameworkCollection> + ): FrameworkTable { + val inputByPid: + FrameworkTable>> = + data + .keyBy("KeyByPrivacyId", privacyIdEncoder) { it.privacyId() } + .groupByKey("GroupByPrivacyId") + val maxPartitionsContributed = maxPartitionsContributed + return inputByPid.flatMapToTable( + "SamplePartitions", + partitionKeyEncoder, + encoderFactory.protos(PrivacyIdContributions::class), + ) { _, contributions -> + val l0BoundedData = samplePartitions(contributions, maxPartitionsContributed) + val groupedByPartitionKey: + Map>> = + l0BoundedData.groupBy { it.partitionKey() } + groupedByPartitionKey + .mapValues { (_, partitionContributions) -> + privacyIdContributions { values += partitionContributions.map { it.value() } } + } + .map { it.toPair() } + .asSequence() + } + } +} + +/** + * Samples partitions contributed by each [PrivacyId]. Drops the values of the contributions. + * + * Returns a map from [PartitionKey] to [PrivacyIdContributions], where [PrivacyIdContributions] has + * empty [values]. This class is more scalable than PartitionSampler because: + * 1. It does not keep values (which are not needed for SelectPartitions). + * 2. It samples per key twice: first by (privacy_id, partition) and then by privacy_id. As a result records + * corresponding to one partition are processed on different machines. + * + * The algorithm which performs one grouping per key (per privacy id) is faster on an average + * dataset, but it is less scalable when one privacy id has a lot of contributions. + * + * Note: this class does not perform any checks on the consistency of [AggregationParams]. We expect + * this to be done earlier in the call. 
+ */ +class PartitionSamplerWithoutValues( + private val maxPartitionsContributed: Int, + private val privacyIdEncoder: Encoder, + private val partitionKeyEncoder: Encoder, + private val encoderFactory: EncoderFactory, +) : ContributionSampler { + override fun sampleContributions( + data: FrameworkCollection> + ): FrameworkTable { + val maxPartitionsContributed = maxPartitionsContributed + return data + .keyBy("KeyByPrivacyId", encoderFactory.tuple2sOf(privacyIdEncoder, partitionKeyEncoder)) { + it.privacyId() to it.partitionKey() + } + .samplePerKey("LinfSampling", 1) + .mapToTable("DropPartitionKeyFromKey", privacyIdEncoder, partitionKeyEncoder) { + _, + contributions -> + // Contributions is a list of size 1, since we sampled 1 element per key. + contributions[0].privacyId() to contributions[0].partitionKey() + } + .samplePerKey("L0Sampling", maxPartitionsContributed) + .flatMapToTable( + "ConvertToPrivacyIdContributions", + partitionKeyEncoder, + encoderFactory.protos(PrivacyIdContributions::class), + ) { _, partitionKeys -> + partitionKeys.asSequence().map { it to privacyIdContributions {} } + } + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSampler.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSampler.kt new file mode 100644 index 00000000..fb294bd9 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSampler.kt @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions + +/** + * Samples per-partition contributions by each [PrivacyId]. + * + * Returns a map from [PartitionKey] to [PrivacyIdContributions] where [PrivacyIdContributions] is a + * representation of all contributions of a [PrivacyId] to the corresponding [PartitionKey]. + * + * Note: this class does not perform any checks on the consistency of [AggregationParams]. We expect + * this to be done earlier in the call. + */ +class PerPartitionContributionsSampler( + maxContributionsPerPartition: Int, + privacyIdEncoder: Encoder, + partitionKeyEncoder: Encoder, + encoderFactory: EncoderFactory, +) : + ContributionSampler by OnlyPerPartitionContributionSampler( + maxContributionsPerPartition, + privacyIdEncoder, + partitionKeyEncoder, + encoderFactory, + ) + +/** + * A generalized implementation of per-partition contribution sampling. + * + * [PerPartitionContributionsSampler] and [NoPrivacySampler] delegate their implementations to this + * class. It is introduced to avoid code duplication. 
+ */ +internal class OnlyPerPartitionContributionSampler( + private val maxContributionsPerPartition: Int, + private val privacyIdEncoder: Encoder, + private val partitionKeyEncoder: Encoder, + private val encoderFactory: EncoderFactory, +) : ContributionSampler { + override fun sampleContributions( + data: FrameworkCollection> + ): FrameworkTable { + val maxContributionsPerPartitionLocalCopy = maxContributionsPerPartition + return data + .keyBy( + "KeyByPartitionKeyAndPrivacyId", + encoderFactory.tuple2sOf(partitionKeyEncoder, privacyIdEncoder), + ) { + it.partitionKey() to it.privacyId() + } + .groupByKey("GroupByPartitionKeyAndPrivacyId") + .mapToTable( + "DropPrivacyIdAndSampleContributions", + partitionKeyEncoder, + encoderFactory.protos(PrivacyIdContributions::class), + ) { (partitionKey, _), contributions -> + partitionKey to + sampleContributionsPerPartition(contributions, maxContributionsPerPartitionLocalCopy) + } + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitions.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitions.kt new file mode 100644 index 00000000..980b0a23 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitions.kt @@ -0,0 +1,132 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.PreAggregationPartitionSelectionFactory +import java.io.Serializable + +/** Interface for pre-aggregation partition selectors. */ +interface PreAggregationPartitionSelector : Serializable { + fun shouldKeep(privacyIdCount: Long): Boolean +} + +/** + * An implementation of [PreAggregationPartitionSelector] that uses the DP library. + * + * @property maxPartitionsContributed performs contribution bounding. + * @property preThreshold thresholds contributions before partition selection occurs. + * @property budget is the amount of privacy budget that can be used by partition selection. + * @property factory is used to create DP building block library objects. It is also used for + * dependency injection in tests. + */ +class DpLibPreAggregationPartitionSelector( + private val maxPartitionsContributed: Int, + val preThreshold: Int, + private val budget: AllocatedBudget, + private val factory: PreAggregationPartitionSelectionFactory, +) : PreAggregationPartitionSelector, Serializable { + + override fun shouldKeep(privacyIdCount: Long): Boolean { + val preAggregationPartitionSelection = + factory.create( + epsilon = budget.epsilon(), + delta = budget.delta(), + maxPartitionsContributed = maxPartitionsContributed, + preThreshold = preThreshold, + ) + preAggregationPartitionSelection.incrementBy(privacyIdCount) + return preAggregationPartitionSelection.shouldKeepPartition() + } +} + +/** Interface for post-aggregation partition selectors. */ +interface PostAggregationPartitionSelector : Serializable { + /** + * @param privacyIdCount is the true number of privacy units that contributed to this partition. 
+ * @return the noise value if the partition should be kept or returns null if the partition should + * be dropped. + */ + fun addNoiseIfShouldKeep(privacyIdCount: Long): Double? + + /** + * The [threshold] for the partition selection. As an essential part of the computational graph, + * the [threshold] will be reported to clients. + */ + val threshold: Double +} + +/** + * An implementation of [PostAggregationPartitionSelector] that uses the DP library. + * + * @property maxPartitionsContributed performs contribution bounding. + * @property preThreshold thresholds contributions before partition selection occurs. + * @property noiseBudget is the amount of privacy budget that can be used by partition selection. + * @property thresholdingBudget is the amount of privacy budget that can be used by thresholding. + * @property noiseFactory is used to generate noise in partition selection. + */ +class PostAggregationPartitionSelectorImpl( + private val maxPartitionsContributed: Int, + private val noiseKind: NoiseKind, + val preThreshold: Int, + private val noiseBudget: AllocatedBudget, + private val thresholdingBudget: AllocatedBudget, + private val noiseFactory: (NoiseKind) -> Noise, +) : PostAggregationPartitionSelector, Serializable { + + /** + * @param privacyIdCount is the true number of privacy units that contributed to this partition. + * @return the noise value if the partition should be kept or returns null if the partition should + * be dropped. + */ + override fun addNoiseIfShouldKeep(privacyIdCount: Long): Double? 
{ + if (privacyIdCount < preThreshold) return null + + val noisePrivacyIdCount = + noiseFactory(noiseKind) + .addNoise( + privacyIdCount.toDouble(), + maxPartitionsContributed, + /* lInfSensitivity = */ 1.0, + noiseBudget.epsilon(), + noiseBudget.delta(), + ) + return if (noisePrivacyIdCount >= threshold + (preThreshold - 1)) noisePrivacyIdCount else null + } + + override val threshold: Double by lazy { + 1 - + noiseFactory(noiseKind) + .computeQuantile( + /* rank= */ thresholdingBudget.delta() / maxPartitionsContributed, + /* x= */ 0.0, + maxPartitionsContributed, + /* lInfSensitivity=*/ 1.0, + noiseBudget.epsilon(), + noiseBudget.delta(), + ) + } +} + +class NoPrivacyPartitionSelector : PreAggregationPartitionSelector, Serializable { + /** + * @param privacyIdCount is the true number of privacy units that contributed to this partition. + * @return always returns true because no privacy is applied. + */ + override fun shouldKeep(privacyIdCount: Long): Boolean = true +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitions.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitions.kt new file mode 100644 index 00000000..5b9b10ce --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitions.kt @@ -0,0 +1,86 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PartitionsBalance.UNBALANCED +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CompoundAccumulator + +/* + * Filters out all [ContributionWithPrivacyId]s whose [PartitionKey]s are not present in + * the [publicPartitions] collection. + */ +internal fun FrameworkCollection< + ContributionWithPrivacyId +> + .dropNonPublicPartitions( + publicPartitions: FrameworkCollection, + partitionKeyEncoder: Encoder, + partitionsBalance: PartitionsBalance, +): FrameworkCollection> { + return keyBy("KeyByPartitionKey", partitionKeyEncoder) { it.partitionKey() } + .filterKeys( + "DropNonPublicPartition", + publicPartitions, + unbalancedKeys = partitionsBalance == UNBALANCED, + ) + .values("DropPartitionKey") +} + +/* + * Adds all public partitions to the [FrameworkTable] with values equal to + * [CompoundCombiner#emptyAccumulator]. + * + * Note that it just extends the table with this data, i.e. even if a public partition is already + * present in the table it will be added anyway with an empty value. You can think of it as just + * concatenation of two tables (collections of pairs) where the first collection is the initial data + * and the second one is a collection of (public_partition, empty_accumulator_value) pairs which + * includes all public partitions that are passed into this function. + */ +internal fun FrameworkTable + .insertPublicPartitions( + publicPartitions: FrameworkCollection, + combiner: CompoundCombiner, + partitionKeyEncoder: Encoder, + encoderFactory: EncoderFactory, +): FrameworkTable { + return insertAllKeysWithValues( + publicPartitions, + // Some accumulators might require budget to be allocated, therefore we should create empty + // accumulator only in the function and not outside, because function will be called only when + // budget is allocated. 
+ { combiner.emptyAccumulator() }, + partitionKeyEncoder, + encoderFactory.protos(CompoundAccumulator::class), + ) +} + +/* + * Inserts provided [keys] into the table associated with values produced by + * [insertionElementProducer]. + */ +private fun FrameworkTable.insertAllKeysWithValues( + keys: FrameworkCollection, + insertionElementProducer: (K) -> V, + keyEncoder: Encoder, + valueEncoder: Encoder, +): FrameworkTable { + val tableWithInsertedElements = + keys.mapToTable("AddValuesToKeys", keyEncoder, valueEncoder) { + it to insertionElementProducer(it) + } + return flattenWith("UniteCollectionWithTableWithInsertedElements", tableWithInsertedElements) +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AllocatedBudget.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AllocatedBudget.kt new file mode 100644 index 00000000..b8877300 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AllocatedBudget.kt @@ -0,0 +1,79 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget + +import java.io.Serializable + +/** + * Epsilon and delta values allocated for a DP mechanism. + * + * [AllocatedBudget] is created by a [BudgetAccountant] as a response to a budget consumption + * request ([BudgetRequest]). 
The exact epsilon and delta values are not populated when + * [AllocatedBudget] is created. Instead, they are populated by the [initialize] method called by + * the [BudgetAccountant] when budgets are getting allocated. Once the epsilon and delta have been + * populated, the mechanism who requested the budget can use them. + * + * @property epsilon the allocated epsilon value (ε). Initially -1.0. + * @property delta the allocated delta value (δ). Initially -1.0. + * @property initialized indicates whether the budget has been initialized with valid values. + */ +class AllocatedBudget : Serializable { + private var epsilon = -1.0 + private var delta = -1.0 + private var initialized = false + + companion object Factory { + fun create() = AllocatedBudget() + } + + /** + * Initializes the allocated budget with the given epsilon and delta values. + * + * This method should only be called once to set the values. Attempts to re-initialize will result + * in an exception that reports the already stored values. + * + * @param epsilon the epsilon (ε) value for the allocated budget. + * @param delta the delta (δ) value for the allocated budget. + * @throws IllegalStateException if the budget has already been initialized. + */ + fun initialize(epsilon: Double, delta: Double) { + if (initialized) { + throw IllegalStateException( + "The budget has already been initialized with epsilon = ${this.epsilon} and delta = ${this.delta}. It can't be initialized second time." + ) + } + this.epsilon = epsilon + this.delta = delta + initialized = true + } + + /** + * Returns the allocated epsilon (ε) value. + * + * @throws IllegalStateException if the budget has not been initialized. + */ + fun epsilon() = + if (initialized) epsilon else throw IllegalStateException("The budget hasn't been initialized.") + + /** + * Returns the allocated delta (δ) value. + * + * @throws IllegalStateException if the budget has not been initialized.
+ */ + fun delta() = + if (initialized) delta else throw IllegalStateException("The budget hasn't been initialized.") +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel new file mode 100644 index 00000000..31072245 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "budget_spec", + srcs = ["BudgetSpec.kt"], + deps = [ + "@maven//:com_google_errorprone_error_prone_annotations", + ], +) + +kt_jvm_library( + name = "allocated_budget", + srcs = ["AllocatedBudget.kt"], +) + +kt_jvm_library( + name = "budget_accountant", + srcs = ["BudgetAccountant.kt"], + deps = [ + ":allocated_budget", + ":budget_spec", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetAccountant.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetAccountant.kt new file mode 100644 index 00000000..02c242e5 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetAccountant.kt @@ -0,0 +1,266 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountingStrategy.NAIVE +import java.lang.IllegalArgumentException +import java.lang.IllegalStateException + +/** + * An accountant who keeps track of the available amount of budget and allocates budget. + * + * Mechanisms can request budget via [requestBudget].
As a response, [BudgetAccountant] returns an + * instance of [AllocatedBudget], which is not initialized initially. In order to initialize the + * [AllocatedBudget]s, call [allocateBudgets]. The [BudgetAccountant] instance cannot be used once + * budgets have been allocated. + */ +interface BudgetAccountant { + /** + * Records a [BudgetRequest] and returns the corresponding uninitialized [AllocatedBudget]. The + * actual budget will be allocated when [allocateBudgets] is called. + * + * @param budgetRequest the request for a privacy budget. + * @return an uninitialized [AllocatedBudget] that will be updated later. + * @throws IllegalStateException if [allocateBudgets] has already been called on this instance. + */ + fun requestBudget(budgetRequest: BudgetRequest): AllocatedBudget + + /** + * Allocates budgets to all previously recorded [BudgetRequest]s. This method should only be + * called once. + * + * @throws IllegalStateException if budgets have already been allocated. + */ + fun allocateBudgets() +} + +/** + * A [BudgetAccountant] who uses naive budget composition to compose budgets (i.e., it sums-up + * epsilon and delta values). It is initialized with the [TotalBudget] available. + * + * A [NaiveBudgetAccountant] accepts budget requests with [AbsoluteBudgetPerOpSpec] and + * [RelativeBudgetPerOpSpec]. If there is enough budget to serve all requests, + * [NaiveBudgetAccountant] allocates the exact epsilon and delta values requested by the + * corresponding [AbsoluteBudgetPerOpSpec]s. Then it allocates the remaining budget according to the + * [RelativeBudgetPerOpSpec]s. 
For example, if the total available epsilon is 10.0 and the following + * requests have been sent: + * - Request1 with [AbsoluteBudgetPerOpSpec] and epsilon = 4.0, + * - Request2 with [RelativeBudgetPerOpSpec] and weight = 1.0, + * - Request3 with [RelativeBudgetPerOpSpec] and weight = 2.0, + * + * Then [NaiveBudgetAccountant] will allocate: + * - epsilon = 4.0 for Request1 + * - epsilon = remainingEpsilon * (weight / totalEpsilonWeight) = (10.0 - 4.0) * (1.0 / + * 3.0) = 2.0 for Request2 + * - epsilon = remainingEpsilon * (weight / totalEpsilonWeight) = (10.0 - 4.0) * (2.0 / + * 3.0) = 4.0 for Request3 + * + * If there is not enough budget to serve all requests, [allocateBudgets] throws an exception and + * none of the [AllocatedBudget]s created by this instance gets updated. + * + * @property totalBudget the total budget available for allocation. + * @constructor Creates a new [NaiveBudgetAccountant] with the specified total budget. + */ +class NaiveBudgetAccountant(private val totalBudget: TotalBudget) : BudgetAccountant { + private val absoluteBudgets: MutableList<RequestedAndAllocatedBudget> = mutableListOf() + private val relativeBudgets: MutableList<RequestedAndAllocatedBudget> = mutableListOf() + private var budgetsAllocated = false + + companion object { + const val FLOATING_POINT_ARITHMETICS_TOLERANCE = 1e9 + } + + override fun requestBudget(budgetRequest: BudgetRequest): AllocatedBudget { + if (budgetsAllocated) { + throw IllegalStateException( + "Budget cannot be requested because allocateBudgets() has already been called on this instance."
+ ) + } + val allocatedBudget = AllocatedBudget.create() + when (budgetRequest.budgetSpec) { + is AbsoluteBudgetPerOpSpec -> + absoluteBudgets.add(RequestedAndAllocatedBudget(budgetRequest, allocatedBudget)) + is RelativeBudgetPerOpSpec -> + relativeBudgets.add(RequestedAndAllocatedBudget(budgetRequest, allocatedBudget)) + } + return allocatedBudget + } + + override fun allocateBudgets() { + if (budgetsAllocated) { + throw IllegalStateException("Budgets have already been allocated.") + } + budgetsAllocated = true + + var totalRequestedEpsilon = 0.0 + var totalRequestedDelta = 0.0 + for (requestedAndAllocated in absoluteBudgets) { + val budgetSpec = requestedAndAllocated.requested.budgetSpec as AbsoluteBudgetPerOpSpec + totalRequestedEpsilon += budgetSpec.epsilon + totalRequestedDelta += budgetSpec.delta + } + val remainingEpsilon = totalBudget.epsilon - totalRequestedEpsilon + val remainingDelta = totalBudget.delta - totalRequestedDelta + + checkEnoughAbsoluteBudget(totalRequestedEpsilon, totalRequestedDelta) + checkEnoughRelativeBudget(remainingEpsilon, remainingDelta) + + allocateAbsoluteBudgets() + allocateRelativeBudgets(remainingEpsilon, remainingDelta) + } + + private fun checkEnoughAbsoluteBudget(requestedEpsilon: Double, requestedDelta: Double) { + if ( + notEnoughBudget(requestedEpsilon, totalBudget.epsilon) || + notEnoughBudget(requestedDelta, totalBudget.delta) + ) { + throw IllegalArgumentException( + "Can't allocate absolute budget. The total requested budget is higher " + + "than the available budget. " + + "Total requested epsilon = $requestedEpsilon, " + + "available epsilon = ${totalBudget.epsilon}, " + + "total requested delta = $requestedDelta, " + + "available delta = ${totalBudget.delta}." 
+ ) + } + } + + fun notEnoughBudget(requested: Double, remaining: Double): Boolean { + val diff = remaining - requested + if (diff >= 0.0) { + return false + } + return Math.abs(diff) > remaining / FLOATING_POINT_ARITHMETICS_TOLERANCE + } + + private fun allocateAbsoluteBudgets() { + for (requestedAndAllocated in absoluteBudgets) { + val budgetSpec = requestedAndAllocated.requested.budgetSpec as AbsoluteBudgetPerOpSpec + requestedAndAllocated.allocated.initialize( + epsilon = budgetSpec.epsilon, + delta = budgetSpec.delta, + ) + } + } + + private fun allocateRelativeBudgets(remainingEpsilon: Double, remainingDelta: Double) { + var totalEpsilonWeight = 0.0 + var totalDeltaWeight = 0.0 + for (requestedAndAllocated in relativeBudgets) { + val budgetSpec = requestedAndAllocated.requested.budgetSpec as RelativeBudgetPerOpSpec + if (requestedAndAllocated.requested.mechanism.usesEpsilon) { + totalEpsilonWeight += budgetSpec.weight + } + if (requestedAndAllocated.requested.mechanism.usesDelta) { + totalDeltaWeight += budgetSpec.weight + } + } + for (requestedAndAllocated in relativeBudgets) { + val budgetSpec = requestedAndAllocated.requested.budgetSpec as RelativeBudgetPerOpSpec + val allocatedEpsilon = + if (requestedAndAllocated.requested.mechanism.usesEpsilon) { + budgetSpec.weight / totalEpsilonWeight * remainingEpsilon + } else { + 0.0 + } + val allocatedDelta = + if (requestedAndAllocated.requested.mechanism.usesDelta) { + budgetSpec.weight / totalDeltaWeight * remainingDelta + } else { + 0.0 + } + requestedAndAllocated.allocated.initialize(allocatedEpsilon, allocatedDelta) + } + } + + private fun checkEnoughRelativeBudget(remainingEpsilon: Double, remainingDelta: Double) { + if (relativeEpsilonRequested() && remainingEpsilon <= 0.0) { + throw IllegalArgumentException( + "Can't allocate relative budget. There is no epsilon available after allocation of the absolute budget." 
+ ) + } + if (relativeDeltaRequested() && remainingDelta <= 0.0) { + throw IllegalArgumentException( + "Can't allocate relative budget. There is no delta available after allocation of the absolute budget." + ) + } + } + + private fun relativeEpsilonRequested(): Boolean = + relativeBudgets.any { it.requested.mechanism.usesEpsilon } + + private fun relativeDeltaRequested(): Boolean = + relativeBudgets.any { it.requested.mechanism.usesDelta } +} + +/** + * A request to [BudgetAccountant] to allocate the cost to the budget consumed by an operation. + * + * The requested consumption can be expressed in terms of relative weights or absolute values. The + * weights are relative to the other [BudgetRequest]s sent to the same instance of + * [BudgetAccountant]. See the documentation of a specific [BudgetAccountant] implementation in + * order to learn how it composes relative and absolute budgets. + * + * @property budgetSpec the privacy budget specification for the operation. + * @property mechanism the type of mechanism for which the budget is requested. + */ +data class BudgetRequest(val budgetSpec: BudgetPerOpSpec, val mechanism: AccountedMechanism) + +/** + * Represents the type of mechanism that consumes the privacy budget. + * + * @property usesEpsilon whether the mechanism consumes epsilon. + * @property usesDelta whether the mechanism consumes delta. + */ +enum class AccountedMechanism(val usesEpsilon: Boolean, val usesDelta: Boolean) { + GAUSSIAN_NOISE(true, true), + LAPLACE_NOISE(true, false), + PREAGGREGATED_PARTITION_SELECTION(true, true), + POSTAGGREGATED_PARTITION_SELECTION(false, true), +} + +/** + * A pair of [BudgetRequest] and the corresponding [AllocatedBudget]. + * + * When [BudgetAccountant] receives a [BudgetRequest], it doesn't allocate budgets immediately but + * returns an [AllocatedBudget] instance, which is initialized later. We use this data structure to + * remember which [AllocatedBudget] corresponds to which [BudgetRequest]. 
+ */ +internal data class RequestedAndAllocatedBudget( + val requested: BudgetRequest, + val allocated: AllocatedBudget, +) + +enum class BudgetAccountingStrategy { + NAIVE +} + +/** A factory for creating [BudgetAccountant] instances based on a given strategy. */ +object BudgetAccountantFactory { + /** + * Creates a [BudgetAccountant] instance for the specified [BudgetAccountingStrategy] and + * [TotalBudget]. + * + * @param accountingStrategy the budgeting strategy to use. + * @param totalBudget the total budget available for allocation. + * @return a new [BudgetAccountant] instance. + */ + fun forStrategy(accountingStrategy: BudgetAccountingStrategy, totalBudget: TotalBudget) = + when (accountingStrategy) { + NAIVE -> NaiveBudgetAccountant(totalBudget) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetSpec.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetSpec.kt new file mode 100644 index 00000000..d518af4e --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetSpec.kt @@ -0,0 +1,122 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget + +import com.google.errorprone.annotations.Immutable +import java.io.Serializable + +// TODO: remove code redundancy with the API budget spec.
+ +/** + * Represents the budget allocated for anonymizing a metric or partition selection. + * + * This is a sealed interface with two implementations: [AbsoluteBudgetPerOpSpec] for absolute + * budget values and [RelativeBudgetPerOpSpec] for relative weights. + */ +@Immutable +sealed interface BudgetPerOpSpec { + /** + * Multiplies this budget specification by a given factor. + * + * @param factor the factor to multiply the budget by. + * @return a new budget specification with the multiplied values or weights. + */ + operator fun times(factor: Double): BudgetPerOpSpec +} + +/** + * Represents an absolute budget (epsilon and delta) for anonymizing a metric or partition + * selection. + * + * @property epsilon the epsilon (ε) privacy budget value. Must be non-negative. + * @property delta the delta (δ) privacy budget value. Must be non-negative. + */ +@Immutable +data class AbsoluteBudgetPerOpSpec(val epsilon: Double, val delta: Double) : + BudgetPerOpSpec, Serializable { + init { + BudgetValidationUtils.validateEpsilon(epsilon) + BudgetValidationUtils.validateDelta(delta) + } + + override fun times(factor: Double) = AbsoluteBudgetPerOpSpec(factor * epsilon, factor * delta) +} + +/** + * Represents a relative weight for anonymizing a metric or partition selection. + * + * The weight is relative to the weights of other metrics computed by the same `DpEngine`. + * + * @property weight the relative weight. Must be strictly positive. + */ +@Immutable +data class RelativeBudgetPerOpSpec(val weight: Double) : BudgetPerOpSpec, Serializable { + init { + BudgetValidationUtils.validateWeight(weight) + } + + override fun times(factor: Double) = RelativeBudgetPerOpSpec(factor * weight) +} + +/** + * The total amount of budget given to DpEngine for accounting both relative and absolute operation + * costs. + * + * @property epsilon the total epsilon (ε) privacy budget value. Must be non-negative. + * @property delta the total delta (δ) privacy budget value. 
Must be non-negative. Defaults to 0.0. + */ +@Immutable +data class TotalBudget(val epsilon: Double, val delta: Double = 0.0) : Serializable { + init { + BudgetValidationUtils.validateEpsilon(epsilon) + BudgetValidationUtils.validateDelta(delta) + } +} + +/** Utility object for validating budget parameters. */ +private object BudgetValidationUtils { + + /** + * Validates that epsilon is non-negative. + * + * @param epsilon the epsilon value to validate. + * @throws IllegalArgumentException if epsilon is negative. + */ + fun validateEpsilon(epsilon: Double) { + require(epsilon >= 0.0) { "Epsilon must be >= 0.0. Provided epsilon: $epsilon." } + } + + /** + * Validates that delta is non-negative. + * + * @param delta the delta value to validate. + * @throws IllegalArgumentException if delta is negative. + */ + fun validateDelta(delta: Double) { + require(delta >= 0.0) { "Delta must be >= 0.0. Provided delta: $delta." } + } + + /** + * Validates that a weight is strictly positive. + * + * @param weight the weight value to validate. + * @throws IllegalArgumentException if weight is not strictly positive. + */ + fun validateWeight(weight: Double) { + require(weight > 0.0) { "Weight must be > 0. Provided weight: $weight." } + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/BUILD.bazel new file mode 100644 index 00000000..16649be1 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/BUILD.bazel @@ -0,0 +1,38 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "noise_factories", + srcs = ["NoiseFactories.kt"], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_functions_params", + "@maven//:com_google_privacy_differentialprivacy_differentialprivacy", + ], +) + +kt_jvm_library( + name = "pre_aggregation_partition_selection_factory", + srcs = ["PreAggregationPartitionSelectionFactory.kt"], + deps = [ + "@maven//:com_google_privacy_differentialprivacy_differentialprivacy", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/NoiseFactories.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/NoiseFactories.kt new file mode 100644 index 00000000..a1d415f5 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/NoiseFactories.kt @@ -0,0 +1,40 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary + +import com.google.privacy.differentialprivacy.GaussianNoise +import com.google.privacy.differentialprivacy.LaplaceNoise +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.ZeroNoise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import java.io.Serializable + +/** Generates a [Noise] instance with the given [NoiseKind]. */ +class NoiseFactory : (NoiseKind) -> Noise, Serializable { + override fun invoke(noiseKind: NoiseKind) = + when (noiseKind) { + LAPLACE -> LaplaceNoise() + GAUSSIAN -> GaussianNoise() + } +} + +/** For any [NoiseKind] returns ZeroNoise (i.e. no noise at all). */ +class ZeroNoiseFactory : (NoiseKind) -> Noise, Serializable { + override fun invoke(noiseKind: NoiseKind) = ZeroNoise() +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/PreAggregationPartitionSelectionFactory.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/PreAggregationPartitionSelectionFactory.kt new file mode 100644 index 00000000..b70d82f5 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary/PreAggregationPartitionSelectionFactory.kt @@ -0,0 +1,35 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary + +import com.google.privacy.differentialprivacy.PreAggSelectPartition +import java.io.Serializable + +open class PreAggregationPartitionSelectionFactory : Serializable { + open fun create( + epsilon: Double, + delta: Double, + maxPartitionsContributed: Int, + preThreshold: Int, + ) = + PreAggSelectPartition.builder() + .epsilon(epsilon) + .delta(delta) + .maxPartitionsContributed(maxPartitionsContributed) + .preThreshold(preThreshold) + .build() +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel new file mode 100644 index 00000000..bef169d9 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel @@ -0,0 +1,51 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +kt_jvm_library( + name = "local_encoders", + srcs = ["LocalEncoderFactory.kt"], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "@maven//:com_google_protobuf_protobuf_java", + ], +) + +kt_jvm_library( + name = "local_collections", + srcs = [ + "LocalCollection.kt", + "LocalTable.kt", + ], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:framework_collections", + ], +) + +kt_jvm_library( + name = "local_dp_engine_factory", + srcs = ["LocalDpEngineFactory.kt"], + deps = [ + ":local_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_engine", + ], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollection.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollection.kt new file mode 100644 index 00000000..e6e5976f --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollection.kt @@ -0,0 +1,54 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.local + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkTable + +/** An implementation of [FrameworkCollection], which runs all operations locally. */ +class LocalCollection(val data: Sequence) : FrameworkCollection { + override val elementsEncoder = object : Encoder {} + + override fun distinct(stageName: String) = LocalCollection(data.distinct()) + + override fun map( + stageName: String, + outputType: Encoder, + mapFn: (T) -> R, + ): FrameworkCollection { + return LocalCollection(data.map(mapFn)) + } + + override fun keyBy( + stageName: String, + outputType: Encoder, + keyFn: (T) -> K, + ): FrameworkTable { + val tableData: Sequence> = data.map { keyFn(it) to it } + return LocalTable(tableData) + } + + override fun mapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (T) -> Pair, + ): FrameworkTable { + return LocalTable(data.map { mapFn(it) }) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalDpEngineFactory.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalDpEngineFactory.kt new file mode 100644 index 00000000..d4363fb1 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalDpEngineFactory.kt @@ -0,0 +1,24 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.local + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngine +import com.google.privacy.differentialprivacy.pipelinedp4j.core.DpEngineBudgetSpec + +/** Creates a [DpEngine] that runs DP aggregations locally. */ +fun DpEngine.Factory.createLocalEngine(budgetSpec: DpEngineBudgetSpec) = + create(LocalEncoderFactory(), budgetSpec) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalEncoderFactory.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalEncoderFactory.kt new file mode 100644 index 00000000..76489cdd --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalEncoderFactory.kt @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.local + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.EncoderFactory +import com.google.protobuf.Message +import kotlin.reflect.KClass + +class LocalEncoderFactory() : EncoderFactory { + // The implementation of local encoders is empty because when the data is being processed + // locally (in-process), it doesn't need to be serialized. + override fun strings(): Encoder { + return object : Encoder {} + } + + override fun doubles(): Encoder { + return object : Encoder {} + } + + override fun ints(): Encoder { + return object : Encoder {} + } + + override fun records(recordClass: KClass): Encoder = object : Encoder {} + + override fun protos(protoClass: KClass): Encoder = object : Encoder {} + + override fun tuple2sOf(first: Encoder, second: Encoder) = + object : Encoder> {} +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTable.kt b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTable.kt new file mode 100644 index 00000000..10fcd134 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTable.kt @@ -0,0 +1,114 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.local + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.Encoder +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.core.FrameworkTable + +/** An implementation of [FrameworkTable], which runs all operations locally. */ +class LocalTable(val data: Sequence>) : FrameworkTable { + override val keysEncoder = object : Encoder {} + + override val valuesEncoder = object : Encoder {} + + override fun map( + stageName: String, + outputType: Encoder, + mapFn: (K, V) -> R, + ): FrameworkCollection = LocalCollection(data.map { mapFn(it.first, it.second) }) + + override fun groupAndCombineValues(stageName: String, combFn: (V, V) -> V): FrameworkTable { + return LocalTable( + groupByKey(stageName).data.map { (key, values) -> key to values.reduce(combFn) } + ) + } + + override fun groupByKey(stageName: String): LocalTable> { + return LocalTable( + sequence { + yieldAll( + data + .groupBy(keySelector = { kv -> kv.first }, valueTransform = { kv -> kv.second }) + .toList() + ) + } + ) + } + + override fun keys(stageName: String): FrameworkCollection = + LocalCollection(data.map { it.first }) + + override fun values(stageName: String): FrameworkCollection = + LocalCollection(data.map { it.second }) + + override fun mapValues( + stageName: String, + outputType: Encoder, + mapValuesFn: (K, V) -> VO, + ): FrameworkTable { + return LocalTable(data.map { Pair(it.first, mapValuesFn(it.first, it.second)) }) + } + + override fun mapToTable( + stageName: String, + outputKeyType: Encoder, + outputValueType: Encoder, + mapFn: (K, V) -> Pair, + ): FrameworkTable { + return LocalTable(data.map { mapFn(it.first, it.second) }) + } + + override fun flatMapToTable( + stageName: String, + keyType: Encoder, + valueType: Encoder, + mapFn: (K, V) -> Sequence>, + ): FrameworkTable = LocalTable(data.flatMap { (k, v) -> mapFn(k, v) 
}) + + override fun filterValues(stageName: String, predicate: (V) -> Boolean): FrameworkTable = + LocalTable(data.filter { (_, value) -> predicate(value) }) + + override fun filterKeys(stageName: String, predicate: (K) -> Boolean) = + LocalTable(data.filter { (key, _) -> predicate(key) }) + + override fun filterKeys( + stageName: String, + allowedKeys: FrameworkCollection, + unbalancedKeys: Boolean, + ): FrameworkTable { + val allowedKeysHashSet = (allowedKeys as LocalCollection).data.toCollection(HashSet()) + return filterKeys(stageName) { k -> k in allowedKeysHashSet } + } + + override fun flattenWith(stageName: String, other: FrameworkTable): LocalTable { + val localOther = other as LocalTable + return LocalTable(sequenceOf(data, localOther.data).flatten()) + } + + override fun samplePerKey(stageName: String, count: Int): LocalTable> { + return LocalTable( + groupByKey(stageName).data.map { (k, v) -> + val elements = v.toList() + val sampledElements = + if (elements.size <= count) elements else elements.shuffled().take(count) + + k to sampledElements.toList() + } + ) + } +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/BUILD.bazel b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/BUILD.bazel new file mode 100644 index 00000000..13f8ebff --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/BUILD.bazel @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +load("@com_github_grpc_grpc_kotlin//:kt_jvm_grpc.bzl", "kt_jvm_proto_library") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +proto_library( + name = "accumulators_proto", + srcs = ["accumulators.proto"], +) + +kt_jvm_proto_library( + name = "accumulators_kt_proto", + deps = [":accumulators_proto"], +) + +proto_library( + name = "dpaggregates_proto", + srcs = ["dpaggregates.proto"], +) + +kt_jvm_proto_library( + name = "dpaggregates_kt_proto", + deps = [":dpaggregates_proto"], +) diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/accumulators.proto b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/accumulators.proto new file mode 100644 index 00000000..472cdd68 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/accumulators.proto @@ -0,0 +1,59 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package privacy.differentialprivacy.pipelinedp4j.proto; + +option java_package = "com.google.privacy.differentialprivacy.pipelinedp4j.proto"; +option java_multiple_files = true; + +message PrivacyIdCountAccumulator { + int64 count = 1; +} + +message CountAccumulator { + int64 count = 1; +} + +message SumAccumulator { + double sum = 1; +} + +message MeanAccumulator { + int64 count = 1; + double normalized_sum = 2; +} + +message QuantilesAccumulator { + bytes serialized_quantiles_summary = 1; +} + +message VarianceAccumulator { + int64 count = 1; + double normalized_sum = 2; + double normalized_sum_squares = 3; +} + +// A collection of accumulators for all metrics that can possibly be computed. +// Only accumulators that correspond to the aggregations, which are actually +// being computed, are populated. +message CompoundAccumulator { + PrivacyIdCountAccumulator privacy_id_count_accumulator = 3; + CountAccumulator count_accumulator = 1; + SumAccumulator sum_accumulator = 2; + MeanAccumulator mean_accumulator = 4; + QuantilesAccumulator quantiles_accumulator = 5; + VarianceAccumulator variance_accumulator = 6; +} diff --git a/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/dpaggregates.proto b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/dpaggregates.proto new file mode 100644 index 00000000..6bcfc593 --- /dev/null +++ b/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/proto/dpaggregates.proto @@ -0,0 +1,40 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package privacy.differentialprivacy.pipelinedp4j.proto; + +option java_package = "com.google.privacy.differentialprivacy.pipelinedp4j.proto"; +option java_outer_classname = "DpAggregatesProto"; +option java_multiple_files = true; + +// The anonymized aggregated result returned by PipelineDP4j. The proto has +// fields for all possible metrics that can be computed. Only metrics requested +// by the user are populated. +message DpAggregates { + double privacy_id_count = 3; + double count = 1; + double sum = 2; + double mean = 4; + + // Sorted by rank. + repeated double quantiles = 5; + double variance = 6; +} + +// Contributions that belong to the same privacy ID and partition key. +message PrivacyIdContributions { + repeated double values = 1; +} diff --git a/pipelinedp4j/pom.template b/pipelinedp4j/pom.template new file mode 100644 index 00000000..ec403805 --- /dev/null +++ b/pipelinedp4j/pom.template @@ -0,0 +1,42 @@ + + +4.0.0 + +com.google.privacy.differentialprivacy.pipelinedp4j +pipelinedp4j +RELEASE_VERSION +jar + +A framework for Differential Privacy + + Solution for generating differentially private statistics using large-scale data + processing, such as Apache Beam or Apache Spark.
+ +https://github.com/google/differential-privacy/tree/main/pipelinedp4j + + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + + + Differential Privacy Team + dp-open-source@google.com + Google + https://google.com + + + + + scm:git:git://github.com/google/differential-privacy.git + https://github.com/google/differential-privacy/tree/main/pipelinedp4j + + + + {generated_bzl_deps} + + diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/ApiTests.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/ApiTests.kt new file mode 100644 index 00000000..b019639f --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/ApiTests.kt @@ -0,0 +1,23 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import org.junit.runner.RunWith +import org.junit.runners.Suite + +/** Provides a list of JUnit test classes to Bazel. When creating a new test class, add it here. 
Test classes that are compiled into the target but not listed here are never executed. */ +@RunWith(Suite::class) @Suite.SuiteClasses(BeamQueryBuilderTest::class, BeamQueryTest::class) class ApiTests {} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel new file mode 100644 index 00000000..e96015fa --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BUILD.bazel @@ -0,0 +1,37 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_test") + +kt_jvm_test( + name = "api_tests", + srcs = [ + "ApiTests.kt", + "BeamQueryBuilderTest.kt", + "BeamQueryTest.kt", + ], + test_class = "com.google.privacy.differentialprivacy.pipelinedp4j.api.ApiTests", + runtime_deps = [ + "@maven//:org_apache_beam_beam_runners_direct_java", + ], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/api", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_functions_params", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + "@maven//:com_google_truth_truth", + "@maven//:junit_junit", + "@maven//:org_apache_beam_beam_sdks_java_core", + "@maven//:org_jetbrains_kotlin_kotlin_test", + ], +) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilderTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilderTest.kt new file mode 100644 index 00000000..5ccf2147 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryBuilderTest.kt @@ -0,0 +1,98 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.common.truth.Truth.assertThat +import kotlin.test.assertFailsWith +import org.apache.beam.sdk.coders.DoubleCoder +import org.apache.beam.sdk.coders.KvCoder +import org.apache.beam.sdk.coders.StringUtf8Coder +import org.apache.beam.sdk.testing.TestPipeline +import org.apache.beam.sdk.transforms.Create +import org.apache.beam.sdk.values.KV +import org.junit.Rule +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +@RunWith(JUnit4::class) +class BeamQueryBuilderTest { + @get:Rule val testPipeline: TestPipeline = TestPipeline.create() + + @Test + fun build_sameOutputColumnNames_throwsException() { + val pCollection = + testPipeline.apply( + "Create input data", + Create.of(listOf, Double>>()) + .withCoder( + KvCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), DoubleCoder.of()) + ), + ) + + val queryBuilder = + QueryBuilder.from(pCollection, { it.key.value }) + .groupBy({ it.key.key }, maxGroupsContributed = 1, maxContributionsPerGroup = 1) + .sum( + { it.value }, + minTotalValuePerPrivacyUnitInGroup = 1.0, + maxTotalValuePerPrivacyUnitInGroup = 2.0, + outputColumnName = "sameColumnName", + ) + .count("sameColumnName") + + val e = assertFailsWith { queryBuilder.build() } + assertThat(e) + .hasMessageThat() + .contains("There aggregations with duplicate output column names: [sameColumnName]") + + testPipeline.run().waitUntilFinish() + } + + @Test + fun build_differentValues_throwsException() { + val pCollection = + testPipeline.apply( + "Create input data", + Create.of(listOf, Double>>()) + .withCoder( + KvCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), DoubleCoder.of()) + ), + ) + + val queryBuilder = + QueryBuilder.from(pCollection, { it.key.value }) + .groupBy({ it.key.key }, maxGroupsContributed = 1, maxContributionsPerGroup = 1) + .sum( + { it.value }, + minTotalValuePerPrivacyUnitInGroup = 1.0, + 
maxTotalValuePerPrivacyUnitInGroup = 2.0, + outputColumnName = "sameColumnName", + ) + .sum( + { it.value * 2.0 }, + minTotalValuePerPrivacyUnitInGroup = 1.0, + maxTotalValuePerPrivacyUnitInGroup = 2.0, + outputColumnName = "otherColumnName", + ) + + val e = assertFailsWith { queryBuilder.build() } + assertThat(e).hasMessageThat().contains("Aggregation of different values is not supported yet.") + + testPipeline.run().waitUntilFinish() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryTest.kt new file mode 100644 index 00000000..f9f80b9f --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/api/BeamQueryTest.kt @@ -0,0 +1,175 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.api + +import com.google.common.truth.Truth.assertThat +import org.apache.beam.sdk.coders.DoubleCoder +import org.apache.beam.sdk.coders.KvCoder +import org.apache.beam.sdk.coders.StringUtf8Coder +import org.apache.beam.sdk.testing.PAssert +import org.apache.beam.sdk.testing.TestPipeline +import org.apache.beam.sdk.transforms.Create +import org.apache.beam.sdk.values.KV +import org.apache.beam.sdk.values.PCollection +import org.junit.Rule +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +@RunWith(JUnit4::class) +class BeamQueryTest { + @get:Rule val testPipeline: TestPipeline = TestPipeline.create() + + @Test + fun run_onePublicGroupTwoDifferentContributions_allPossibleAggregations_calculatesStatisticsCorrectly() { + val pCollection = + testPipeline.apply( + "Create input data", + Create.of( + listOf( + KV.of(KV.of("group1", "pid1"), 1.0), + KV.of(KV.of("group1", "pid1"), 1.5), + KV.of(KV.of("group1", "pid2"), 2.0), + ) + ) + .withCoder( + KvCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), DoubleCoder.of()) + ), + ) + val publicGroups = + testPipeline.apply( + "Create public groups", + Create.of(listOf("group1")).withCoder(StringUtf8Coder.of()), + ) + val valueExtractor = { it: KV, Double> -> it.value } + + val result: PCollection = + QueryBuilder.from(pCollection, { it.key.value }) + .groupBy( + { it.key.key }, + maxGroupsContributed = 1, + maxContributionsPerGroup = 2, + publicGroups, + ) + .countDistinctPrivacyUnits("pid_cnt") + .count("cnt") + .sum(valueExtractor, outputColumnName = "sumResult") + .mean(valueExtractor, minValue = 1.0, maxValue = 2.0, "meanResult") + .variance(valueExtractor, minValue = 1.0, maxValue = 2.0, "varianceResult") + .quantiles( + valueExtractor, + ranks = listOf(0.5), + minValue = 1.0, + maxValue = 2.0, + "quantilesResult", + ) + .build() + .run(TotalBudget(epsilon = 1000.0), NoiseKind.LAPLACE) + + 
PAssert.that(result).satisfies { + assertThat(it).hasSize(1) + val queryPerGroupResult = it.iterator().next() + assertThat(queryPerGroupResult.groupKey).isEqualTo("group1") + assertThat(queryPerGroupResult.aggregationResults).hasSize(6) + assertThat(queryPerGroupResult.aggregationResults.keys) + .containsExactly( + "pid_cnt", + "cnt", + "sumResult", + "meanResult", + "varianceResult", + "quantilesResult_0.5", + ) + assertThat(queryPerGroupResult.aggregationResults["pid_cnt"]).isWithin(0.5).of(2.0) + assertThat(queryPerGroupResult.aggregationResults["cnt"]).isWithin(0.5).of(3.0) + assertThat(queryPerGroupResult.aggregationResults["sumResult"]).isWithin(0.5).of(4.5) + assertThat(queryPerGroupResult.aggregationResults["meanResult"]).isWithin(0.5).of(1.5) + // (1^2+(1.5)^2+2^2)/3-((1.0+1.5+2)/3)^2 = 0.1(6) + assertThat(queryPerGroupResult.aggregationResults["varianceResult"]).isWithin(0.05).of(0.16) + assertThat(queryPerGroupResult.aggregationResults["quantilesResult_0.5"]) + .isWithin(0.5) + .of(1.5) + null + } + + testPipeline.run().waitUntilFinish() + } + + @Test + fun run_sumAndQuantiles_calculatesCorrectly() { + val pCollection = + testPipeline.apply( + "Create input data", + Create.of( + listOf( + KV.of(KV.of("group1", "pid1"), 1.0), + KV.of(KV.of("group1", "pid1"), 1.5), + KV.of(KV.of("group1", "pid2"), 2.0), + ) + ) + .withCoder( + KvCoder.of(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), DoubleCoder.of()) + ), + ) + val publicGroups = + testPipeline.apply( + "Create public groups", + Create.of(listOf("group1")).withCoder(StringUtf8Coder.of()), + ) + val valueExtractor = { it: KV, Double> -> it.value } + + val result: PCollection = + QueryBuilder.from(pCollection, { it.key.value }) + .groupBy( + { it.key.key }, + maxGroupsContributed = 1, + maxContributionsPerGroup = 2, + publicGroups, + ) + .sum( + valueExtractor, + minTotalValuePerPrivacyUnitInGroup = 2.0, + maxTotalValuePerPrivacyUnitInGroup = 2.5, + outputColumnName = "sumResult", + ) + .quantiles( 
+ valueExtractor, + ranks = listOf(0.5), + minValue = 1.0, + maxValue = 2.0, + "quantilesResult", + ) + .build() + .run(TotalBudget(epsilon = 1000.0), NoiseKind.LAPLACE) + + PAssert.that(result).satisfies { + assertThat(it).hasSize(1) + val queryPerGroupResult = it.iterator().next() + assertThat(queryPerGroupResult.groupKey).isEqualTo("group1") + assertThat(queryPerGroupResult.aggregationResults).hasSize(2) + assertThat(queryPerGroupResult.aggregationResults.keys) + .containsExactly("sumResult", "quantilesResult_0.5") + assertThat(queryPerGroupResult.aggregationResults["sumResult"]).isWithin(0.5).of(4.5) + assertThat(queryPerGroupResult.aggregationResults["quantilesResult_0.5"]) + .isWithin(0.5) + .of(1.5) + null + } + + testPipeline.run().waitUntilFinish() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel new file mode 100644 index 00000000..b3734a9b --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BUILD.bazel @@ -0,0 +1,42 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_test") + +kt_jvm_test( + name = "beam_tests", + srcs = [ + "BeamCollectionTest.kt", + "BeamEncodersTest.kt", + "BeamTableTest.kt", + "BeamTests.kt", + ], + test_class = "com.google.privacy.differentialprivacy.pipelinedp4j.beam.BeamTests", + runtime_deps = [ + "@maven//:org_apache_beam_beam_runners_direct_java", + ], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:core_types", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:accumulators_kt_proto", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_testparameterinjector_test_parameter_injector", + "@maven//:com_google_truth_truth", + "@maven//:junit_junit", + "@maven//:org_apache_beam_beam_sdks_java_core", + "@maven//:org_hamcrest_hamcrest", + ], +) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollectionTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollectionTest.kt new file mode 100644 index 00000000..ad4f95c6 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamCollectionTest.kt @@ -0,0 +1,106 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.common.truth.Truth.assertThat +import org.apache.beam.sdk.coders.VarIntCoder +import org.apache.beam.sdk.testing.PAssert +import org.apache.beam.sdk.testing.TestPipeline +import org.apache.beam.sdk.transforms.Create +import org.apache.beam.sdk.values.KV +import org.junit.Rule +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +@RunWith(JUnit4::class) +class BeamCollectionTest { + @get:Rule val testPipeline: TestPipeline = TestPipeline.create() + + @Test + fun elementsEncoder_returnsCorrectEncoder() { + val pCollection = testPipeline.apply(Create.of(listOf()).withCoder(VarIntCoder.of())) + val beamCollection = BeamCollection(pCollection) + + val result = beamCollection.elementsEncoder + + testPipeline.run().waitUntilFinish() + + assertThat(result).isInstanceOf(BeamEncoder::class.java) + assertThat(result.coder).isEqualTo(VarIntCoder.of()) + } + + @Test + fun distinct_removesDuplicates() { + val pCollection = testPipeline.apply(Create.of(listOf(1, 2, 1)).withCoder(VarIntCoder.of())) + val beamCollection = BeamCollection(pCollection) + + val result: BeamCollection = beamCollection.distinct("stageName") + + PAssert.that(result.data).containsInAnyOrder(1, 2) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun map_appliesMapFn() { + val pCollection = testPipeline.apply(Create.of(listOf(1)).withCoder(VarIntCoder.of())) + val beamCollection = BeamCollection(pCollection) + + val result: BeamCollection = + beamCollection.map("Test", beamEncoderFactory.strings(), { v -> v.toString() }) + + PAssert.that(result.data).containsInAnyOrder("1") + + testPipeline.run().waitUntilFinish() + } + + @Test + fun keyBy_keysCollection() { + val pCollection = testPipeline.apply(Create.of(listOf(1)).withCoder(VarIntCoder.of())) + val beamCollection 
= BeamCollection(pCollection) + + val result: BeamTable = + beamCollection.keyBy("Test", beamEncoderFactory.strings(), { v -> v.toString() }) + + PAssert.that(result.data).containsInAnyOrder(KV.of("1", 1)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun mapToTable_appliesMapFn() { + val pCollection = testPipeline.apply(Create.of(listOf(1)).withCoder(VarIntCoder.of())) + val beamCollection = BeamCollection(pCollection) + + val result: BeamTable = + beamCollection.mapToTable( + "Test", + beamEncoderFactory.strings(), + beamEncoderFactory.ints(), + { v -> Pair(v.toString(), v) }, + ) + + PAssert.that(result.data).containsInAnyOrder(KV.of("1", 1)) + + testPipeline.run().waitUntilFinish() + } + + companion object { + private val beamEncoderFactory = BeamEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncodersTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncodersTest.kt new file mode 100644 index 00000000..c76201d6 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamEncodersTest.kt @@ -0,0 +1,143 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.privacy.differentialprivacy.pipelinedp4j.core.ContributionWithPrivacyId +import com.google.privacy.differentialprivacy.pipelinedp4j.core.contributionWithPrivacyId +import com.google.privacy.differentialprivacy.pipelinedp4j.core.encoderOfContributionWithPrivacyId +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CompoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.compoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.meanAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.quantilesAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.sumAccumulator +import com.google.protobuf.ByteString +import org.apache.beam.sdk.testing.PAssert +import org.apache.beam.sdk.testing.TestPipeline +import org.apache.beam.sdk.transforms.Create +import org.junit.Rule +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +@RunWith(JUnit4::class) +class BeamEncodersTest { + @get:Rule val testPipeline: TestPipeline = TestPipeline.create() + + @Test + fun strings_isPossibleToCreateBeamPCollectionOfThatType() { + val input = listOf("a", "b", "c") + val inputCoder = beamEncoderFactory.strings().coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun doubles_isPossibleToCreateBeamPCollectionOfThatType() { + val input = listOf(-1.2, 0.0, 2.1) + val inputCoder = beamEncoderFactory.doubles().coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun ints_isPossibleToCreateBeamPCollectionOfThatType() { + val input = listOf(-1, 0, 1) + val inputCoder = 
beamEncoderFactory.ints().coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun records_isPossibleToCreateBeamPCollectionOfThatType() { + val input = + listOf( + contributionWithPrivacyId("privacyId1", "partitionKey1", -1.0), + contributionWithPrivacyId("privacyId2", "partitionKey1", 0.0), + contributionWithPrivacyId("privacyId1", "partitionKey2", 1.0), + contributionWithPrivacyId("privacyId3", "partitionKey3", 1.2345), + ) + val inputCoder = + (encoderOfContributionWithPrivacyId( + beamEncoderFactory.strings(), + beamEncoderFactory.strings(), + beamEncoderFactory, + ) + as BeamEncoder>) + .coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun protos_isPossibleToCreateBeamPCollectionOfThatType() { + val input = + listOf( + compoundAccumulator { + sumAccumulator = sumAccumulator { sum = -123.0 } + meanAccumulator = meanAccumulator { + count = 12 + normalizedSum = -1.543 + } + quantilesAccumulator = quantilesAccumulator { + serializedQuantilesSummary = + ByteString.copyFrom(byteArrayOf(0x48, 0x65, 0x6c, 0x6c, 0x6f)) + } + }, + compoundAccumulator {}, + ) + val inputCoder = beamEncoderFactory.protos(CompoundAccumulator::class).coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun tuple2sOf_isPossibleToCreateBeamPCollectionOfThatType() { + val input = listOf("pid1" to 1, "pid1" to 1, "pid1" to -2, "pid2" to 3) + val inputCoder = + beamEncoderFactory.tuple2sOf(beamEncoderFactory.strings(), beamEncoderFactory.ints()).coder + + val pCollection = testPipeline.apply(Create.of(input).withCoder(inputCoder)) + + 
PAssert.that(pCollection).containsInAnyOrder(input) + + testPipeline.run().waitUntilFinish() + } + + companion object { + private val beamEncoderFactory = BeamEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTableTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTableTest.kt new file mode 100644 index 00000000..122cf1f0 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTableTest.kt @@ -0,0 +1,344 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.apache.beam.sdk.coders.KvCoder +import org.apache.beam.sdk.coders.StringUtf8Coder +import org.apache.beam.sdk.coders.VarIntCoder +import org.apache.beam.sdk.testing.PAssert +import org.apache.beam.sdk.testing.TestPipeline +import org.apache.beam.sdk.transforms.Create +import org.apache.beam.sdk.values.KV +import org.junit.Rule +import org.junit.Test +import org.junit.runner.RunWith + +@RunWith(TestParameterInjector::class) +class BeamTableTest { + @get:Rule val testPipeline: TestPipeline = TestPipeline.create() + + @Test + fun keysEncoder_returnsCorrectEncoder() { + val pCollection = + testPipeline.apply( + Create.of>(listOf()) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + + val result = beamCollection.keysEncoder + + testPipeline.run().waitUntilFinish() + + assertThat(result).isInstanceOf(BeamEncoder::class.java) + assertThat(result.coder).isEqualTo(StringUtf8Coder.of()) + } + + @Test + fun valuesEncoder_returnsCorrectEncoder() { + val pCollection = + testPipeline.apply( + Create.of>(listOf()) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + + val result = beamCollection.valuesEncoder + + testPipeline.run().waitUntilFinish() + + assertThat(result).isInstanceOf(BeamEncoder::class.java) + assertThat(result.coder).isEqualTo(VarIntCoder.of()) + } + + @Test + fun map_appliesMapFn() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of(1, 10))).withCoder(KvCoder.of(VarIntCoder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val 
mapFn: (Int, Int) -> String = { k, v -> "${k}_$v" } + + val result: BeamCollection = + beamCollection.map("Test", beamEncoderFactory.strings(), mapFn) + + PAssert.that(result.data).containsInAnyOrder("1_10") + + testPipeline.run().waitUntilFinish() + } + + @Test + fun groupAndCombineValues_appliesCombiner() { + val pCollection = + testPipeline.apply( + Create.of( + listOf( + KV.of("positive", 1), + KV.of("positive", 10), + KV.of("negative", -1), + KV.of("negative", -10), + ) + ) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val combineFn: (Int, Int) -> Int = { v1, v2 -> v1 + v2 } + + val result: BeamTable = beamCollection.groupAndCombineValues("Test", combineFn) + + PAssert.that(result.data).containsInAnyOrder(KV.of("positive", 11), KV.of("negative", -11)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun groupByKey_groupsValues() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("positive", 1), KV.of("positive", 10), KV.of("negative", -1))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + + val result: BeamTable> = beamCollection.groupByKey("stageName") + + val expected = mapOf("positive" to setOf(1, 10), "negative" to setOf(-1)) + // We can't use PAssert.containsInAnyOrder because order in Iterable is not deterministic. + PAssert.that(result.data).satisfies { output: Iterable>> -> + val kotlinMap = output.associate { it.key to it.value.toSet() } + // The actual value is the Truth subject, so a failure labels expected/actual the right way round. + assertThat(kotlinMap).isEqualTo(expected) + // we need to return something to satisfy the function signature. 
+ null + } + + testPipeline.run().waitUntilFinish() + } + + @Test + fun keys_returnsKeys() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("key", "value"))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())) + ) + val beamCollection = BeamTable(pCollection) + + val result: BeamCollection = beamCollection.keys("stageName") + + PAssert.that(result.data).containsInAnyOrder("key") + + testPipeline.run().waitUntilFinish() + } + + // Covers values(): only the value of each key-value pair is expected in the output. + @Test + fun values_returnsValues() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("key", "value"))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())) + ) + val beamCollection = BeamTable(pCollection) + + val result: BeamCollection = beamCollection.values("stageName") + + PAssert.that(result.data).containsInAnyOrder("value") + + testPipeline.run().waitUntilFinish() + } + + @Test + fun mapValues_appliesMapFn() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("one", 1))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val mapFn: (String, Int) -> String = { k, v -> "${k}_$v" } + + val result: BeamTable = + beamCollection.mapValues("stageName", beamEncoderFactory.strings(), mapFn) + + PAssert.that(result.data).containsInAnyOrder(KV.of("one", "one_1")) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun mapToTable_appliesMapFn() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("one", 1))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val mapFn: (String, Int) -> Pair = { k, v -> Pair(v, k) } + + val result: BeamTable = + beamCollection.mapToTable( + "Test", + beamEncoderFactory.ints(), + beamEncoderFactory.strings(), + mapFn, + ) + + PAssert.that(result.data).containsInAnyOrder(KV.of(1, "one")) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun flatMapToTable_appliesMapFn() { + 
val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("one", 1))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val mapFn: (String, Int) -> Sequence> = { k, v -> + sequenceOf(Pair(v, k), Pair(v, k)) + } + + val result: BeamTable = + beamCollection.flatMapToTable( + "Test", + beamEncoderFactory.ints(), + beamEncoderFactory.strings(), + mapFn, + ) + + PAssert.that(result.data).containsInAnyOrder(KV.of(1, "one"), KV.of(1, "one")) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun filterValues_appliesPredicate() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("one", 1), KV.of("two", 2))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val predicate: (Int) -> Boolean = { v -> v == 1 } + + val result: BeamTable = beamCollection.filterValues("Test", predicate) + + PAssert.that(result.data).containsInAnyOrder(KV.of("one", 1)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun filterKeys_appliesPredicate() { + val pCollection = + testPipeline.apply( + Create.of(listOf(KV.of("one", 1), KV.of("two", 2), KV.of("three", 3), KV.of("two", -2))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())) + ) + val beamCollection = BeamTable(pCollection) + val predicate: (String) -> Boolean = { k -> k == "two" } + + val result: BeamTable = beamCollection.filterKeys("Test", predicate) + + PAssert.that(result.data).containsInAnyOrder(KV.of("two", 2), KV.of("two", -2)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun filterKeys_allowedKeysStoredInBeamCollection_keepsOnlyAllowedKeys( + @TestParameter unbalancedKeys: Boolean + ) { + val pCollection = + testPipeline.apply( + "CreateInputData", + Create.of(listOf(KV.of("one", 1), KV.of("two", 2), KV.of("three", 3), KV.of("two", -2))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())), + ) + val beamCollection = 
BeamTable(pCollection) + val allowedKeysPCollection = + testPipeline.apply( + "CreateAllowedKeys", + Create.of(listOf("three", "two", "four")).withCoder(StringUtf8Coder.of()), + ) + val allowedKeysBeamCollection = BeamCollection(allowedKeysPCollection) + + val result: BeamTable = + beamCollection.filterKeys("stageName", allowedKeysBeamCollection, unbalancedKeys) + + PAssert.that(result.data) + .containsInAnyOrder(KV.of("two", 2), KV.of("three", 3), KV.of("two", -2)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun filterKeys_allowedKeysStoredInLocalCollection_keepsOnlyAllowedKeys() { + val pCollection = + testPipeline.apply( + "CreateInputData", + Create.of(listOf(KV.of("one", 1), KV.of("two", 2), KV.of("three", 3), KV.of("two", -2))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())), + ) + val beamCollection = BeamTable(pCollection) + val allowedKeys = sequenceOf("three", "two", "four") + val allowedKeysLocalCollection = LocalCollection(allowedKeys) + + val result: BeamTable = + beamCollection.filterKeys("stageName", allowedKeysLocalCollection) + + PAssert.that(result.data) + .containsInAnyOrder(KV.of("two", 2), KV.of("three", 3), KV.of("two", -2)) + + testPipeline.run().waitUntilFinish() + } + + @Test + fun flattenWith_flattensCollections() { + val pCollection = + testPipeline.apply( + "Create1", + Create.of(listOf(KV.of("one", 1))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())), + ) + val beamCollection = BeamTable(pCollection) + val otherPCollection = + testPipeline.apply( + "Create2", + Create.of(listOf(KV.of("two", 2))) + .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())), + ) + val otherBeamCollection = BeamTable(otherPCollection) + + val result: BeamTable = + beamCollection.flattenWith("stageName", otherBeamCollection) + + PAssert.that(result.data).containsInAnyOrder(KV.of("one", 1), KV.of("two", 2)) + + testPipeline.run().waitUntilFinish() + } + + companion object { + private val beamEncoderFactory = 
BeamEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTests.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTests.kt new file mode 100644 index 00000000..e2b88ac5 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/beam/BeamTests.kt @@ -0,0 +1,25 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.beam + +import org.junit.runner.RunWith +import org.junit.runners.Suite + +/** Provides a list of JUnit test classes to Bazel. When creating a new test class, add it here. */ +@RunWith(Suite::class) +@Suite.SuiteClasses(BeamCollectionTest::class, BeamEncodersTest::class, BeamTableTest::class) +class BeamTests {} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel new file mode 100644 index 00000000..66e0578b --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/BUILD.bazel @@ -0,0 +1,89 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_library", "kt_jvm_test") + +kt_jvm_test( + name = "core_tests", + srcs = [ + "CompoundCombinerTest.kt", + "CoreTests.kt", + "CountCombinerTest.kt", + "DataExtractorsTest.kt", + "DpEngineTest.kt", + "DpEngineTestFactory.kt", + "DpFunctionsParamsTest.kt", + "EndToEndTest.kt", + "ExactPrivacyIdCountCombinerTest.kt", + "MeanCombinerTest.kt", + "NoPrivacySamplerTest.kt", + "PartitionAndPerPartitionSamplerTest.kt", + "PartitionSamplerTest.kt", + "PartitionSamplerWithoutValuesTest.kt", + "PerPartitionContributionsSamplerTest.kt", + "PostAggregationPartitionSelectionCombinerTest.kt", + "PrivacyIdCombinerTest.kt", + "PrivatePartitionsComputationalGraphTest.kt", + "PrivatePartitionsTest.kt", + "PublicPartitionsComputationalGraphTest.kt", + "PublicPartitionsTest.kt", + "QuantilesCombinerTest.kt", + "SelectPartitionsComputationalGraphTest.kt", + "SumCombinerTest.kt", + "VarianceCombinerTest.kt", + ], + associates = ["//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_engine"], + kotlinc_opts = "//:kotlinc_options_for_parameterized_tests", + test_class = "com.google.privacy.differentialprivacy.pipelinedp4j.core.CoreTests", + deps = [ + ":test_data_types", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:contribution_sampler", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:core_types", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:data_extractors", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_functions_params", + 
"//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:framework_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:allocated_budget", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_accountant", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:noise_factories", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:pre_aggregation_partition_selection_factory", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_dp_engine_factory", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:accumulators_kt_proto", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_kt_proto", + "@maven//:com_google_privacy_differentialprivacy_differentialprivacy", + "@maven//:com_google_testparameterinjector_test_parameter_injector", + "@maven//:com_google_truth_extensions_truth_proto_extension", + "@maven//:com_google_truth_truth", + "@maven//:junit_junit", + "@maven//:org_jetbrains_kotlin_kotlin_test", + "@maven//:org_mockito_kotlin_mockito_kotlin", + "@maven//:org_mockito_mockito_core", + ], +) + +kt_jvm_library( + name = "test_data_types", + srcs = ["TestDataTypes.kt"], + visibility = [ + "//visibility:public", + ], + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:data_extractors", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_encoders", + ], +) diff --git 
a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CompoundCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CompoundCombinerTest.kt new file mode 100644 index 00000000..d2df51b1 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CompoundCombinerTest.kt @@ -0,0 +1,538 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.compoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.countAccumulator +import 
com.google.privacy.differentialprivacy.pipelinedp4j.proto.dpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.meanAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdCountAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.sumAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.varianceAccumulator +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +@RunWith(JUnit4::class) +class CompoundCombinerTest { + private val COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = Int.MAX_VALUE, + maxContributionsPerPartition = Int.MAX_VALUE, + ) + private val COUNT_AND_SUM_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = Int.MAX_VALUE, + maxContributionsPerPartition = Int.MAX_VALUE, + minTotalValue = -Double.MAX_VALUE, + maxTotalValue = Double.MAX_VALUE, + ) + private val COUNT_AND_MEAN_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(MEAN)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = 100, + maxContributionsPerPartition = 10, + minValue = -100.0, + maxValue = 100.0, + ) + private val COUNT_AND_VARIANCE_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(VARIANCE)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = 100, + maxContributionsPerPartition = 10, + minValue = -100.0, + maxValue = 100.0, + ) + private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget() + + init { + UNUSED_ALLOCATED_BUDGET.initialize(1.1, 1e-3) + } + + @Test + fun createAccumulator_oneMetric_createsOneAccumulator() { + 
val compoundCombiner = + CompoundCombiner(listOf(CountCombiner(COUNT_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()))) + + val accumulator = + compoundCombiner.createAccumulator( + privacyIdContributions { values += listOf(10.0, 10.0, 10.0) } + ) + + assertThat(accumulator) + .isEqualTo(compoundAccumulator { countAccumulator = countAccumulator { count = 3 } }) + } + + @Test + fun createAccumulator_allMetrics_createsAllAccumulators() { + val compoundCombiner = + CompoundCombiner( + listOf( + CountCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()), + SumCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()), + ) + ) + + val accumulator = + compoundCombiner.createAccumulator( + privacyIdContributions { values += listOf(10.0, 10.0, 10.0) } + ) + + assertThat(accumulator) + .isEqualTo( + compoundAccumulator { + countAccumulator = countAccumulator { count = 3 } + sumAccumulator = sumAccumulator { sum = 30.0 } + } + ) + } + + @Test + fun createAccumulator_meanCombiner_createsMeanAccumulator() { + val compoundCombiner = + CompoundCombiner( + listOf( + MeanCombiner( + COUNT_AND_MEAN_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + ) + ) + + val accumulator = + compoundCombiner.createAccumulator( + privacyIdContributions { values += listOf(5.0, 10.5, 19.0) } + ) + + assertThat(accumulator) + .isEqualTo( + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 3 + normalizedSum = 34.5 + } + } + ) + } + + @Test + fun createAccumulator_varianceCombiner_createsVarianceAccumulator() { + val compoundCombiner = + CompoundCombiner( + listOf( + VarianceCombiner( + // Variance-specific params, as in the other VarianceCombiner tests of this class. + COUNT_AND_VARIANCE_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + ) + ) + + val accumulator = + compoundCombiner.createAccumulator( + privacyIdContributions { values += listOf(5.0, 10.5, 19.0) } + ) + + assertThat(accumulator) + .isEqualTo( + compoundAccumulator { + 
varianceAccumulator = varianceAccumulator { + count = 3 + normalizedSum = 34.5 + normalizedSumSquares = 496.25 + } + } + ) + } + + @Test + fun mergeAccumulators_multipleMetrics_mergesAccumulators() { + val compoundCombiner = + CompoundCombiner( + listOf( + CountCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()), + SumCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()), + ) + ) + + val mergedAccumulator = + compoundCombiner.mergeAccumulators( + compoundAccumulator { + countAccumulator = countAccumulator { count = 1 } + sumAccumulator = sumAccumulator { sum = 10.0 } + }, + compoundAccumulator { + countAccumulator = countAccumulator { count = 2 } + sumAccumulator = sumAccumulator { sum = 20.0 } + }, + ) + + assertThat(mergedAccumulator) + .isEqualTo( + compoundAccumulator { + countAccumulator = countAccumulator { count = 3 } + sumAccumulator = sumAccumulator { sum = 30.0 } + } + ) + } + + @Test + fun mergeAccumulators_oneMetric_mergesAccumulators() { + val compoundCombiner = + CompoundCombiner(listOf(CountCombiner(COUNT_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()))) + + val mergedAccumulator = + compoundCombiner.mergeAccumulators( + compoundAccumulator { countAccumulator = countAccumulator { count = 1 } }, + compoundAccumulator { countAccumulator = countAccumulator { count = 2 } }, + ) + + assertThat(mergedAccumulator) + .isEqualTo(compoundAccumulator { countAccumulator = countAccumulator { count = 3 } }) + } + + @Test + fun mergeAccumulators_meanCombiner_mergesAccumulators() { + val compoundCombiner = + CompoundCombiner( + listOf( + MeanCombiner( + COUNT_AND_MEAN_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + ) + ) + + val mergedAccumulator = + compoundCombiner.mergeAccumulators( + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 1 + normalizedSum = 10.0 + } + }, + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 2 + normalizedSum = 20.0 + } + 
}, + ) + + assertThat(mergedAccumulator) + .isEqualTo( + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 3 + normalizedSum = 30.0 + } + } + ) + } + + @Test + fun mergeAccumulators_varianceCombiner_mergesAccumulators() { + val compoundCombiner = + CompoundCombiner( + listOf( + VarianceCombiner( + COUNT_AND_VARIANCE_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + ) + ) + + val mergedAccumulator = + compoundCombiner.mergeAccumulators( + compoundAccumulator { + varianceAccumulator = varianceAccumulator { + count = 1 + normalizedSum = 10.0 + normalizedSumSquares = 100.0 + } + }, + compoundAccumulator { + varianceAccumulator = varianceAccumulator { + count = 2 + normalizedSum = 20.0 + normalizedSumSquares = 200.0 + } + }, + ) + + assertThat(mergedAccumulator) + .isEqualTo( + compoundAccumulator { + varianceAccumulator = varianceAccumulator { + count = 3 + normalizedSum = 30.0 + normalizedSumSquares = 300.0 + } + } + ) + } + + @Test + fun computeMetrics_allMetrics_returnsAllMetrics() { + val compoundCombiner = + CompoundCombiner( + listOf( + CountCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, ZeroNoiseFactory()), + SumCombiner(COUNT_AND_SUM_PARAMS, UNUSED_ALLOCATED_BUDGET, ZeroNoiseFactory()), + ) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { + countAccumulator = countAccumulator { count = 3 } + sumAccumulator = sumAccumulator { sum = 30.0 } + } + ) + + assertThat(dpAggregates) + .isEqualTo( + dpAggregates { + count = 3.0 + sum = 30.0 + } + ) + } + + @Test + fun computeMetrics_oneMetric_returnsOneMetric() { + val compoundCombiner = + CompoundCombiner( + listOf(CountCombiner(COUNT_PARAMS, UNUSED_ALLOCATED_BUDGET, ZeroNoiseFactory())) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { countAccumulator = countAccumulator { count = 3 } } + ) + + assertThat(dpAggregates).isEqualTo(dpAggregates { count = 3.0 
}) + } + + @Test + fun computeMetrics_meanCombiner_returnsMeanMetric() { + val compoundCombiner = + CompoundCombiner( + listOf( + MeanCombiner( + COUNT_AND_MEAN_PARAMS.copy(metrics = ImmutableList.of(MetricDefinition(MEAN))), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + ) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 3 + normalizedSum = 30.0 + } + } + ) + + assertThat(dpAggregates).isEqualTo(dpAggregates { mean = 10.0 }) + } + + @Test + fun computeMetrics_meanCombiner_returnsCountSumMean() { + val compoundCombiner = + CompoundCombiner( + listOf( + MeanCombiner( + COUNT_AND_MEAN_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(MEAN), + MetricDefinition(COUNT), + MetricDefinition(SUM), + ) + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + ) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { + meanAccumulator = meanAccumulator { + count = 3 + normalizedSum = 30.0 + } + } + ) + + assertThat(dpAggregates) + .isEqualTo( + dpAggregates { + count = 3.0 + sum = 30.0 + mean = 10.0 + } + ) + } + + @Test + fun computeMetrics_varianceCombiner_returnsVarianceMetric() { + val compoundCombiner = + CompoundCombiner( + listOf( + VarianceCombiner( + COUNT_AND_VARIANCE_PARAMS.copy(metrics = ImmutableList.of(MetricDefinition(VARIANCE))), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + ) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { + varianceAccumulator = varianceAccumulator { + count = 10 + normalizedSum = 120.0 + normalizedSumSquares = 1500.0 + } + } + ) + + assertThat(dpAggregates).isEqualTo(dpAggregates { variance = 6.0 }) + } + + @Test + fun computeMetrics_varianceCombiner_returnsCountSumMeanVariance() { + val compoundCombiner = + CompoundCombiner( + listOf( + 
VarianceCombiner( + COUNT_AND_VARIANCE_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(MEAN), + MetricDefinition(COUNT), + MetricDefinition(SUM), + MetricDefinition(VARIANCE), + ) + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + ) + ) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { + varianceAccumulator = varianceAccumulator { + count = 10 + normalizedSum = 120.0 + normalizedSumSquares = 1500.0 + } + } + ) + + assertThat(dpAggregates) + .isEqualTo( + dpAggregates { + count = 10.0 + sum = 120.0 + mean = 12.0 + variance = 6.0 + } + ) + } + + @Test + fun createAccumulator_exactPrivacyIdCountCombiner_createsAccumulator() { + val compoundCombiner = CompoundCombiner(listOf(ExactPrivacyIdCountCombiner())) + + val accumulator = + compoundCombiner.createAccumulator(privacyIdContributions { values += listOf(10.0, 5.0) }) + + assertThat(accumulator) + .isEqualTo( + compoundAccumulator { privacyIdCountAccumulator = privacyIdCountAccumulator { count = 1 } } + ) + } + + @Test + fun mergeAccumulator_exactPrivacyIdCountCombiner_mergesAccumulators() { + val compoundCombiner = CompoundCombiner(listOf(ExactPrivacyIdCountCombiner())) + + val mergedAccumulator = + compoundCombiner.mergeAccumulators( + compoundAccumulator { privacyIdCountAccumulator = privacyIdCountAccumulator { count = 4 } }, + compoundAccumulator { privacyIdCountAccumulator = privacyIdCountAccumulator { count = 5 } }, + ) + + assertThat(mergedAccumulator) + .isEqualTo( + compoundAccumulator { privacyIdCountAccumulator = privacyIdCountAccumulator { count = 9 } } + ) + } + + @Test + fun computeMetrics_exactPrivacyIdCountCombiner_returnsEmptyMetrics() { + val compoundCombiner = CompoundCombiner(listOf(ExactPrivacyIdCountCombiner())) + + val dpAggregates = + compoundCombiner.computeMetrics( + compoundAccumulator { privacyIdCountAccumulator = privacyIdCountAccumulator { count = 3 } } + ) + + 
assertThat(dpAggregates).isEqualTo(dpAggregates {}) // No Exact Privacy Id metric + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CoreTests.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CoreTests.kt new file mode 100644 index 00000000..985c81e3 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CoreTests.kt @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import org.junit.runner.RunWith +import org.junit.runners.Suite + +/** Provides a list of JUnit test classes to Bazel. When creating a new test class, add it here. 
*/
+@RunWith(Suite::class)
+@Suite.SuiteClasses(
+  CompoundCombinerTest::class,
+  CountCombinerTest::class,
+  DataExtractorsTest::class,
+  DpEngineTest::class,
+  DpFunctionsParamsTest::class,
+  EndToEndTest::class,
+  ExactPrivacyIdCountCombinerTest::class,
+  MeanCombinerTest::class,
+  NoPrivacySamplerTest::class,
+  PartitionAndPerPartitionSamplerTest::class,
+  PartitionSamplerTest::class,
+  PartitionSamplerWithoutValuesTest::class,
+  PerPartitionContributionsSamplerTest::class,
+  PostAggregationPartitionSelectionCombinerTest::class,
+  PrivacyIdCombinerTest::class,
+  PrivatePartitionsComputationalGraphTest::class,
+  PrivatePartitionsTest::class,
+  PublicPartitionsComputationalGraphTest::class,
+  PublicPartitionsTest::class,
+  QuantilesCombinerTest::class,
+  SelectPartitionsComputationalGraphTest::class,
+  SumCombinerTest::class,
+  VarianceCombinerTest::class,
+)
+class CoreTests {} // Empty body: the suite is defined by the annotations above.
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CountCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CountCombinerTest.kt
new file mode 100644
index 00000000..3689424d
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/CountCombinerTest.kt
@@ -0,0 +1,190 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.collect.ImmutableList
+import com.google.common.truth.Truth.assertThat
+import com.google.common.truth.extensions.proto.ProtoTruth.assertThat
+import com.google.privacy.differentialprivacy.Noise
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.DATASET_LEVEL
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget
+import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.countAccumulator
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.mockito.kotlin.mock
+import org.mockito.kotlin.verify
+
+/** Tests [CountCombiner] accumulator creation, clamping, merging and metric computation. */
+@RunWith(TestParameterInjector::class)
+class CountCombinerTest {
+  private val AGG_PARAMS =
+    AggregationParams(
+      metrics = ImmutableList.of(MetricDefinition(COUNT)),
+      noiseKind = GAUSSIAN,
+      maxPartitionsContributed = 3,
+      maxContributionsPerPartition = 5,
+    )
+
+  private val noiseMock: Noise = mock()
+  // Ignores the requested noise kind and always hands out noiseMock so its calls can be verified.
+  private val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> noiseMock }
+  private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget().apply { initialize(1.1, 1e-3) }
+
+  @Test
+  fun emptyAccumulator_countIsZero() {
+    val combiner = CountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator = combiner.emptyAccumulator()
+
+    assertThat(accumulator).isEqualTo(countAccumulator { count = 0 })
+  }
+
+  @Test
+  fun createAccumulator_countsItems() {
+    val combiner = CountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) })
+
+    assertThat(accumulator).isEqualTo(countAccumulator { count = 3 })
+  }
+
+  @Test
+  fun createAccumulator_privacyLevelWithContributionBounding_clampsCount() {
+    val combiner =
+      CountCombiner(
+        AGG_PARAMS.copy(maxContributionsPerPartition = 2, privacyLevel = DATASET_LEVEL),
+        UNUSED_ALLOCATED_BUDGET,
+        NoiseFactory(),
+      )
+
+    val accumulator =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) })
+
+    assertThat(accumulator).isEqualTo(countAccumulator { count = 2 })
+  }
+
+  @Test
+  fun createAccumulator_privacyLevelWithoutContributionBounding_doesNotClampCount() {
+    val combiner =
+      CountCombiner(
+        AGG_PARAMS.copy(
+          maxContributionsPerPartition = 2,
+          privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING,
+        ),
+        UNUSED_ALLOCATED_BUDGET,
+        NoiseFactory(),
+      )
+
+    val accumulator =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) })
+
+    assertThat(accumulator).isEqualTo(countAccumulator { count = 3 })
+  }
+
+  @Test
+  fun mergeAccumulators_sumsCounts() {
+    val combiner = CountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator =
+      combiner.mergeAccumulators(countAccumulator { count = 1 }, countAccumulator { count = 2 })
+
+    assertThat(accumulator).isEqualTo(countAccumulator { count = 3 })
+  }
+
+  @Test
+  @TestParameters("{noiseKind: LAPLACE, delta: 0.0}", "{noiseKind: GAUSSIAN, delta: 0.1}")
+  fun computeMetrics_addsNoise(noiseKind: NoiseKind, delta: Double) {
+    val paramsWithNoise =
+      AggregationParams(
+        metrics = ImmutableList.of(MetricDefinition(COUNT)),
+        noiseKind = noiseKind,
+        maxPartitionsContributed = 30,
+        maxContributionsPerPartition = 50,
+      )
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, delta)
+    val combiner = CountCombiner(paramsWithNoise, allocatedBudget, NoiseFactory())
+
+    val result = combiner.computeMetrics(countAccumulator { count = 1 })
+
+    // Compare against a Double: an Int literal never equals the Double result in Truth, which
+    // would make this assertion pass even when no noise was added.
+    assertThat(result).isNotEqualTo(1.0)
+  }
+
+  @Test
+  fun computeMetrics_passesCorrectParametersToNoise() {
+    val params =
+      AggregationParams(
+        metrics = ImmutableList.of(MetricDefinition(COUNT)),
+        noiseKind = GAUSSIAN,
+        maxPartitionsContributed = 3,
+        maxContributionsPerPartition = 5,
+      )
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, 1e-3)
+    val combiner = CountCombiner(params, allocatedBudget, noiseFactoryMock)
+
+    val unused = combiner.computeMetrics(countAccumulator { count = 1 })
+
+    verify(noiseMock)
+      .addNoise(
+        /* x= */ 1.0,
+        /* l0Sensitivity= */ 3,
+        /* lInfSensitivity= */ 5.0,
+        /* epsilon= */ 1.1,
+        /* delta= */ 1e-3,
+      )
+  }
+
+  @Test
+  fun computeMetrics_withoutNoise_withMultipleContributionsIncludingEmptyAccumulator_returnsCorrectResult() {
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, 1e-3)
+    val combiner = CountCombiner(AGG_PARAMS, allocatedBudget, ZeroNoiseFactory())
+
+    val accumulator0 = combiner.emptyAccumulator()
+    val accumulator1 =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(0.0, 0.0) })
+    val accumulator2 = combiner.createAccumulator(privacyIdContributions { values += listOf(0.0) })
+    val accumulator3 = combiner.mergeAccumulators(accumulator0, accumulator1)
+    val finalAccumulator = combiner.mergeAccumulators(accumulator2, accumulator3)
+    val result = combiner.computeMetrics(finalAccumulator)
+
+    assertThat(result).isEqualTo(3.0)
+  }
+
+  @Test
+  fun computeMetrics_withoutNoise_onlyEmptyAccumulator_returnsZeroCount() {
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, 1e-3)
+    val combiner = CountCombiner(AGG_PARAMS, allocatedBudget, ZeroNoiseFactory())
+
+    val result = combiner.computeMetrics(combiner.emptyAccumulator())
+
+    assertThat(result).isEqualTo(0.0)
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractorsTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractorsTest.kt
new file mode 100644
index 00000000..fd20cff1
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DataExtractorsTest.kt
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+@RunWith(JUnit4::class)
+class DataExtractorsTest { // Covers DataExtractors.from with and without a value extractor.
+  @Test
+  fun from_withValueExtractor_constructsDataExtractors() {
+    val inputRow = InputRow(privacyId = "userId", partitionKey = "partitionKey", value = 10.0)
+    val ef = LocalEncoderFactory()
+
+    val dataExtractors =
+      DataExtractors.from(
+        { it.privacyId },
+        ef.strings(),
+        { it.partitionKey },
+        ef.strings(),
+        { it.value }, // value extractor: the row's 10.0 is expected in the contribution
+      )
+    val extractedContribution = dataExtractors.contributionExtractor.invoke(inputRow)
+
+    assertThat(extractedContribution)
+      .isEqualTo(contributionWithPrivacyId("userId", "partitionKey", 10.0))
+  }
+
+  @Test
+  fun from_withoutValueExtractor_constructsDataExtractors() {
+    val inputRow = InputRow(privacyId = "userId", partitionKey = "partitionKey", value = 20.0)
+    val ef = LocalEncoderFactory()
+
+    val dataExtractors =
+      DataExtractors.from(
+        { it.privacyId },
+        ef.strings(),
+        { it.partitionKey },
+        ef.strings(),
+      ) // no value extractor is supplied in this call
+    val extractedContribution = dataExtractors.contributionExtractor.invoke(inputRow)
+
+    assertThat(extractedContribution)
+      .isEqualTo(contributionWithPrivacyId("userId", "partitionKey", 0.0)) // row's 20.0 unused
+  }
+}
+
+data class InputRow(val privacyId: String, val partitionKey: String, val value: Double) // test row
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTest.kt
new file mode 100644
index 00000000..22160c01
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTest.kt
@@ -0,0 +1,1171 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may
not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.GaussianNoise +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.QUANTILES +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.DATASET_LEVEL +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITH_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountingStrategy.NAIVE +import 
com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.RelativeBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.TotalBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.local.createLocalEngine +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.dpAggregates +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import com.google.testing.junit.testparameterinjector.TestParameters +import kotlin.test.assertFailsWith +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.ArgumentMatchers.anyDouble +import org.mockito.ArgumentMatchers.anyInt +import org.mockito.ArgumentMatchers.isA +import org.mockito.kotlin.any +import org.mockito.kotlin.anyOrNull +import org.mockito.kotlin.argThat +import org.mockito.kotlin.argumentCaptor +import org.mockito.kotlin.eq +import org.mockito.kotlin.isA +import org.mockito.kotlin.isNull +import org.mockito.kotlin.spy +import org.mockito.kotlin.verify + +/** + * Unit tests and behavioural (mocking) tests for DpEngine. All tests use the dpEngine instance + * designed for testing which does not apply noise. Mocked tests verify that the proper internal + * calls are made. 
+ */ +@RunWith(TestParameterInjector::class) +class DpEngineTest { + + @Test + fun aggregate_calledAfterDone_throws() { + val dpEngine = DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + dpEngine.done() + + val e = + assertFailsWith { + dpEngine.aggregate( + LocalCollection(sequenceOf()), + COUNT_PARAMS, + testDataExtractors, + LocalCollection(sequenceOf()), + ) + } + assertThat(e).hasMessageThat().contains("done() has already been called") + } + + @Test + fun done_calledTwice_throws() { + val dpEngine = DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + dpEngine.done() + + val e = assertFailsWith { dpEngine.done() } + assertThat(e).hasMessageThat().contains("done() has already been called") + } + + @Test + fun aggregate_incorrectAggregateParams_throws() { + val e = + assertFailsWith { + DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + .aggregate( + LocalCollection(sequenceOf()), + // empty metrics are not allowed + COUNT_PARAMS.copy(metrics = ImmutableList.of()), + testDataExtractors, + LocalCollection(sequenceOf()), + ) + } + assertThat(e).hasMessageThat().contains("metrics must not be empty") + } + + @Test + fun aggregate_invalidDataExtactors_throws() { + val dataExtractorsWithoutValueExtractor = + DataExtractors.from( + privacyIdExtractor = { row -> row.privacyId }, + LOCAL_EF.strings(), + partitionKeyExtractor = { row -> row.partitionKey }, + LOCAL_EF.strings(), + ) + + val e = + assertFailsWith { + DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + .aggregate(LocalCollection(sequenceOf()), SUM_PARAMS, dataExtractorsWithoutValueExtractor) + } + assertThat(e).hasMessageThat().contains("Metrics [SUM] require a value extractor") + } + + @Test + fun aggregate_partitionSelectionSetForPublicPartition_throws() { + val e = + assertFailsWith { + DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + .aggregate( + 
LocalCollection(sequenceOf()), + COUNT_PARAMS.copy(partitionSelectionBudget = AbsoluteBudgetPerOpSpec(1.0, 1e-5)), + testDataExtractors, + LocalCollection(sequenceOf()), + ) + } + assertThat(e) + .hasMessageThat() + .contains("partitionSelectionBudget can not be set for public partitions") + } + + @Test + fun aggregate_clampsCount() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "US", 10.0), + TestDataRow("Alice", "US", 10.0), + TestDataRow("Alice", "US", 10.0), + ) + ) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + val params = COUNT_PARAMS.copy(maxContributionsPerPartition = 2) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + assertThat(dpAggregates.data.toList()).containsExactly(Pair("US", dpAggregates { count = 2.0 })) + } + + /** + * Count and sum clamping happen separately. Contributions discarded during count clamping are + * still summed-up when sum is being computed. For example, if the client contributes + * [10, 20, 30], maxContributionsPerPartition = 2 and maxTotalSum = 60, the user will contribute 2 + * to count and 60 to sum (i.e., all contributions will be counted towards sum as long as their + * total sum doesn't exceed maxTotalSum). This behavior is tested below. 
+ */ + @Test + fun aggregate_countClampingDoesntAffectSum() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "US", 10.0), + TestDataRow("Alice", "US", 10.0), + TestDataRow("Alice", "US", 10.0), + ) + ) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + val params = COUNT_AND_SUM_PARAMS.copy(maxContributionsPerPartition = 2, maxTotalValue = 30.0) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + assertThat(dpAggregates.data.toList()) + .containsExactly( + Pair( + "US", + dpAggregates { + count = 2.0 + sum = 30.0 + }, + ) + ) + } + + @Test + fun aggregate_clampsTotalSum() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "US", 10.0), + TestDataRow("Alice", "US", 20.0), + TestDataRow("Bob", "NL", -10.0), + TestDataRow("Bob", "NL", -20.0), + ) + ) + val publicPartitions = LocalCollection(sequenceOf("US", "NL")) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, LARGE_BUDGET_SPEC, ZeroNoiseFactory()) + val params = COUNT_AND_SUM_PARAMS.copy(minTotalValue = -25.0, maxTotalValue = 25.0) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + assertThat(dpAggregates.data.toList()) + .containsExactly( + Pair( + "US", + dpAggregates { + count = 2.0 + sum = 25.0 + }, + ), + Pair( + "NL", + dpAggregates { + count = 2.0 + sum = -25.0 + }, + ), + ) + } + + @Test + fun aggregate_addsNoise() { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. 
+ val params = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + MetricDefinition(SUM, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + MetricDefinition(PRIVACY_ID_COUNT, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 5, + maxContributionsPerPartition = 5, + minTotalValue = -5.0, + maxTotalValue = 5.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + val dpAggregatesAnotherRun = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + assertThat(dpAggregates.data.toMap()["US"]!!.count) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.count) + assertThat(dpAggregates.data.toMap()["US"]!!.sum) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.sum) + assertThat(dpAggregates.data.toMap()["US"]!!.privacyIdCount) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.privacyIdCount) + } + + @Test + fun aggregate_aggregateReturnsMean() { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = + ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM), MetricDefinition(MEAN)), + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -2.0, + maxValue = 2.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + val partitionResult = dpAggregates.data.toMap()["US"]!! 
+ assertThat(partitionResult.count).isWithin(1e-1).of(2.0) + assertThat(partitionResult.sum).isWithin(1e-1).of(3.0) + assertThat(partitionResult.mean).isWithin(1e-10).of(partitionResult.sum / partitionResult.count) + } + + @Test + fun aggregate_aggregateReturnsVariance() { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT), + MetricDefinition(SUM), + MetricDefinition(MEAN), + MetricDefinition(VARIANCE), + ), + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -2.0, + maxValue = 2.0, + ) + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + val partitionResult = dpAggregates.data.toMap()["US"]!! 
+ assertThat(partitionResult.count).isWithin(1e-1).of(2.0) + assertThat(partitionResult.sum).isWithin(1e-1).of(3.0) + assertThat(partitionResult.mean).isWithin(1e-10).of(partitionResult.sum / partitionResult.count) + assertThat(partitionResult.variance) + .isWithin(1e-1) + .of(((1.0 * 1.0) + (2.0 * 2.0)) / 2.0 - (3.0 / 2.0) * (3.0 / 2.0)) + } + + enum class CombinersTestCase( + val params: AggregationParams, + val countCombinerPresent: Boolean, + val sumCombinerPresent: Boolean, + ) { + COUNT(params = COUNT_PARAMS, countCombinerPresent = true, sumCombinerPresent = false), + SUM(params = SUM_PARAMS, countCombinerPresent = false, sumCombinerPresent = true), + COUNT_AND_SUM( + params = COUNT_AND_SUM_PARAMS, + countCombinerPresent = true, + sumCombinerPresent = true, + ), + } + + @Test + fun aggregate_createsCombinersForMetrics(@TestParameter testCase: CombinersTestCase) { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + testCase.params, + testDataExtractors, + LocalCollection(sequenceOf()), + ) + + val combinerCaptor = argumentCaptor() + verify(graphFactorySpy) + .createForPublicPartitions( + any(), + combinerCaptor.capture(), + any(), + any(), + any(), + any(), + ) + val combiner = combinerCaptor.firstValue + assertThat(combiner.combiners.any { it is CountCombiner }) + .isEqualTo(testCase.countCombinerPresent) + assertThat(combiner.combiners.any { it is SumCombiner }).isEqualTo(testCase.sumCombinerPresent) + } + + enum class ContributionSamplerTestCase( + val aggregationParams: AggregationParams, + val expectedContributionSampler: Class>, + ) { + DATASET_LEVEL_WITH_PRIVACY_ID_COUNT_METRIC( + PRIVACY_ID_COUNT_PARAMS.copy(privacyLevel = DATASET_LEVEL), + PartitionSampler::class.java, + ), + DATASET_LEVEL_WITH_COUNT_METRIC( + COUNT_PARAMS.copy(privacyLevel = 
DATASET_LEVEL), + PartitionSampler::class.java, + ), + DATASET_LEVEL_WITH_SUM_METRIC( + SUM_PARAMS.copy(privacyLevel = DATASET_LEVEL), + PartitionSampler::class.java, + ), + DATASET_LEVEL_WITH_MEAN_METRIC( + MEAN_PARAMS.copy(privacyLevel = DATASET_LEVEL), + PartitionAndPerPartitionSampler::class.java, + ), + DATASET_LEVEL_WITH_QUANTILES_METRIC( + QUANTILES_PARAMS.copy(privacyLevel = DATASET_LEVEL), + PartitionAndPerPartitionSampler::class.java, + ), + DATASET_LEVEL_WITH_ALL_METRICS( + AggregationParams( + privacyLevel = DATASET_LEVEL, + noiseKind = GAUSSIAN, + metrics = + ImmutableList.of( + MetricDefinition(PRIVACY_ID_COUNT), + MetricDefinition(COUNT), + MetricDefinition(SUM), + MetricDefinition(MEAN), + MetricDefinition(QUANTILES(ranks = ImmutableList.of())), + ), + maxPartitionsContributed = 10, + maxContributionsPerPartition = 20, + minValue = -10.0, + maxValue = 10.0, + ), + PartitionAndPerPartitionSampler::class.java, + ), + // It is enough to test with one metric because logic that depends on the metric is agnostic to + // privacy level. 
+ NONE_WITH_CONTRIBUTION_BOUNDING_PRIVACY_LEVEL( + COUNT_PARAMS.copy(privacyLevel = NONE_WITH_CONTRIBUTION_BOUNDING), + PartitionSampler::class.java, + ), + NONE_WITHOUT_CONTRIBUTION_BOUNDING_PRIVACY_LEVEL( + COUNT_PARAMS.copy(privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING), + NoPrivacySampler::class.java, + ), + } + + @Test + fun aggregate_createsCorrectContributionSampler( + @TestParameter testCase: ContributionSamplerTestCase + ) { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + testCase.aggregationParams, + testDataExtractors, + ) + + val contributionSamplerCaptor = argumentCaptor>() + verify(graphFactorySpy) + .createForPrivatePartitions( + // isA(testCase.expectedContributionSampler) fails for some reason with message: isA(...) + // must not be null. + contributionSamplerCaptor.capture(), + anyOrNull(), + any(), + any(), + any(), + ) + assertThat(contributionSamplerCaptor.firstValue) + .isInstanceOf(testCase.expectedContributionSampler) + } + + @TestParameters( + "{privacyLevel: NONE_WITH_CONTRIBUTION_BOUNDING}", + "{privacyLevel: NONE_WITHOUT_CONTRIBUTION_BOUNDING}", + ) + @Test + fun aggregate_noPrivacy_createsNoPrivacyPartitionSelector(privacyLevel: PrivacyLevel) { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + PRIVACY_ID_COUNT_PARAMS.copy(privacyLevel = privacyLevel), + testDataExtractors, + ) + + verify(graphFactorySpy) + .createForPrivatePartitions( + any(), + isA(), + any(), + any(), + any(), + ) + } + + @Test + fun aggregate_withPublicPartitions_createsGraphWithPublicPartitions() { + val graphFactorySpy: ComputationalGraphFactory 
= spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + val publicPartitions = LocalCollection(sequenceOf("Green", "White", "Green")) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + COUNT_AND_SUM_PARAMS, + testDataExtractors, + publicPartitions, + ) + + val publicPartitionsCaptor = argumentCaptor>() + verify(graphFactorySpy) + .createForPublicPartitions( + any(), + any(), + any(), + any(), + publicPartitionsCaptor.capture(), + any(), + ) + assertThat(publicPartitionsCaptor.allValues).hasSize(1) + assertThat(publicPartitionsCaptor.firstValue).isInstanceOf(LocalCollection::class.java) + assertThat((publicPartitionsCaptor.firstValue as LocalCollection).data.toList()) + .isEqualTo(listOf("Green", "White")) + } + + @Test + fun aggregate_passesDataExtractorsToComputationalGraph() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + COUNT_AND_SUM_PARAMS, + testDataExtractors, + LocalCollection(sequenceOf()), + ) + + verify(graphFactorySpy) + .createForPublicPartitions(any(), any(), eq(testDataExtractors), any(), any(), any()) + } + + @Test + fun aggregate_passesLocalFactoryToComputationalGraph() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + COUNT_AND_SUM_PARAMS, + testDataExtractors, + LocalCollection(sequenceOf()), + ) + + verify(graphFactorySpy) + .createForPublicPartitions( + any(), + any(), + any(), + eq(LOCAL_EF), + any(), + any(), + ) + } + + @Test + fun aggregate_privatePartitions_createsGraphWithPrivatePartitions() { + val 
graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + COUNT_AND_SUM_PARAMS.copy(preThreshold = 10), + testDataExtractors, + ) + + val combinerCaptor = argumentCaptor() + verify(graphFactorySpy) + .createForPrivatePartitions( + isA>(), + argThat { (this as DpLibPreAggregationPartitionSelector).preThreshold == 10 }, + combinerCaptor.capture(), + isA>(), + eq(LOCAL_EF), + ) + val combiners = combinerCaptor.firstValue.combiners + assertThat(combiners.count()).isEqualTo(3) + assertThat(combiners.any { it is CountCombiner }).isTrue() + assertThat(combiners.any { it is SumCombiner }).isTrue() + // No PRIVACY_ID_COUNT in metrics, so ExactPrivacyIdCountCombiner should be used. + assertThat(combiners.any { it is ExactPrivacyIdCountCombiner }).isTrue() + } + + @Test + fun aggregate_createsGraphWithPostAggregationPartitionSelection() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.aggregate( + LocalCollection(sequenceOf()), + PRIVACY_ID_COUNT_PARAMS.copy(preThreshold = 10), + testDataExtractors, + ) + + val combinerCaptor = argumentCaptor() + + verify(graphFactorySpy) + .createForPrivatePartitions( + any(), + preAggregationPartitionSelector = isNull(), + combinerCaptor.capture(), + any(), + any(), + ) + val compoundCombiner = combinerCaptor.firstValue + // Post aggregation partition selection is performed. 
+ assertThat(compoundCombiner.hasPostAggregationCombiner()).isTrue() + val combiners = compoundCombiner.combiners.toList() + assertThat(combiners.size).isEqualTo(1) + val partitionSelector = + (combiners[0] as PostAggregationPartitionSelectionCombiner).getPartitionSelector() + assertThat(partitionSelector).isInstanceOf(PostAggregationPartitionSelectorImpl::class.java) + assertThat((partitionSelector as PostAggregationPartitionSelectorImpl).preThreshold) + .isEqualTo(10) + } + + /** Checks that selectPartitions hands createForSelectPartitions a partition selector whose preThreshold is the 100 given in SelectPartitionsParams. NOTE(review): "selectPartitons" in this and the following test names looks like a typo for "selectPartitions". */ @Test + fun selectPartitons_computationalGraphIsCorrect() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.selectPartitions( + LocalCollection(sequenceOf()), + SelectPartitionsParams(maxPartitionsContributed = 5, preThreshold = 100), + testDataExtractors, + ) + + val partitionSelectorCaptor = argumentCaptor() + verify(graphFactorySpy) + .createForSelectPartitions( + isA>(), + partitionSelectorCaptor.capture(), + isA>(), + any(), + ) + assertThat(partitionSelectorCaptor.allValues.map { it.preThreshold }).isEqualTo(listOf(100)) + } + + /** Checks that with privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING the engine still builds its graph through createForSelectPartitions; the argument types are constrained only by the isA matchers below. */ @Test + fun selectPartitons_nonePrivacyWithoutContributionBounding_computationalGraphIsCorrect() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = + DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.selectPartitions( + LocalCollection(sequenceOf()), + SelectPartitionsParams( + privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING, + maxPartitionsContributed = 5, + ), + testDataExtractors, + ) + + verify(graphFactorySpy) + .createForSelectPartitions( + isA>(), + isA(), + isA>(), + any(), + ) + } + + @Test + fun selectPartitons_nonePrivacyWithContributionBounding_computationalGraphIsCorrect() { + val graphFactorySpy: ComputationalGraphFactory = spy() + val dpEngine = +
DpEngine.createForTesting( + LOCAL_EF, + LARGE_BUDGET_SPEC, + computationalGraphFactory = graphFactorySpy, + ) + + val unused = + dpEngine.selectPartitions( + LocalCollection(sequenceOf()), + SelectPartitionsParams( + privacyLevel = NONE_WITH_CONTRIBUTION_BOUNDING, + maxPartitionsContributed = 5, + ), + testDataExtractors, + ) + + verify(graphFactorySpy) + .createForSelectPartitions( + isA>(), + isA(), + isA>(), + any(), + ) + } + + /** Parameterized cases for the COUNT + SUM budget test: the total budget, the per-metric budget requested for COUNT and for SUM, and the (epsilon, delta) each noise call is expected to receive. Every case expects (1.0, 0.1) for COUNT and (2.0, 0.2) for SUM out of a total of (3.0, 0.3); in ABSOLUTE_AND_RELATIVE the expected SUM budget equals the total minus the absolute COUNT budget. */ enum class CountSumBudgetTestCase( + val totalMetricsBudget: TotalBudget, + val requestedCountBudget: BudgetPerOpSpec, + val requestedSumBudget: BudgetPerOpSpec, + val countNoiseEpsilon: Double, + val countNoiseDelta: Double, + val sumNoiseEpsilon: Double, + val sumNoiseDelta: Double, + ) { + ABSOLUTE( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = AbsoluteBudgetPerOpSpec(1.0, 0.1), + requestedSumBudget = AbsoluteBudgetPerOpSpec(2.0, 0.2), + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + RELATIVE( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = RelativeBudgetPerOpSpec(1.0), + requestedSumBudget = RelativeBudgetPerOpSpec(2.0), + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + ABSOLUTE_AND_RELATIVE( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = AbsoluteBudgetPerOpSpec(1.0, 0.1), + requestedSumBudget = RelativeBudgetPerOpSpec(1.0), + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + } + + @Test + fun aggregate_withRelativeNaiveBudgetSplit_allocatesBudgetAccordingToAggregationBudgetSpec( + @TestParameter testCase: CountSumBudgetTestCase + ) { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val gaussianNoiseSpy: GaussianNoise = spy() + val noiseFactoryMock: (NoiseKind) -> Noise = {
_ -> gaussianNoiseSpy } + val budgetSpec = + DpEngineBudgetSpec(budget = testCase.totalMetricsBudget, accountingStrategy = NAIVE) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, budgetSpec, noiseFactoryMock) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT, testCase.requestedCountBudget), + MetricDefinition(SUM, testCase.requestedSumBudget), + ), + noiseKind = GAUSSIAN, + // Choose large values to avoid contribution clamping but keep the values low enough to + // avoid sensitivity overflow. + maxPartitionsContributed = 100, + maxContributionsPerPartition = 100, + minTotalValue = -100.0, + maxTotalValue = 100.0, + ) + + val result = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + // Access the result to trigger the computation. + assertThat(result.data.toList()).isNotEmpty() + // Check parameters of the noise addition to Count. + val countDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + anyDouble(), + anyInt(), + anyDouble(), + eq(testCase.countNoiseEpsilon), + countDeltaCaptor.capture(), + ) + assertThat(countDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.countNoiseDelta) + // Check parameters of the noise addition to Sum. 
+ val sumDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + anyDouble(), + anyInt(), + anyDouble(), + eq(testCase.sumNoiseEpsilon), + sumDeltaCaptor.capture(), + ) + assertThat(sumDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.sumNoiseDelta) + } + + enum class MeanBudgetTestCase( + val totalMetricsBudget: TotalBudget, + val requestedCountBudget: BudgetPerOpSpec?, + val requestedSumBudget: BudgetPerOpSpec?, + val requestedMeanBudget: BudgetPerOpSpec?, + val countNoiseEpsilon: Double, + val countNoiseDelta: Double, + val sumNoiseEpsilon: Double, + val sumNoiseDelta: Double, + ) { + ABSOLUTE_COUNT_SUM( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = AbsoluteBudgetPerOpSpec(1.0, 0.1), + requestedSumBudget = AbsoluteBudgetPerOpSpec(2.0, 0.2), + requestedMeanBudget = null, + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + RELATIVE_COUNT_SUM( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = RelativeBudgetPerOpSpec(1.0), + requestedSumBudget = RelativeBudgetPerOpSpec(2.0), + requestedMeanBudget = null, + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + ABSOLUTE_AND_RELATIVE_COUNT_SUM( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = AbsoluteBudgetPerOpSpec(1.0, 0.1), + requestedSumBudget = RelativeBudgetPerOpSpec(1.0), + requestedMeanBudget = null, + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + ), + ABSOLUTE_MEAN( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedCountBudget = null, + requestedSumBudget = null, + requestedMeanBudget = AbsoluteBudgetPerOpSpec(2.0, 0.2), + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 1.0, + sumNoiseDelta = 0.1, + ), + RELATIVE_MEAN( + totalMetricsBudget = TotalBudget(2.0, 0.2), + requestedCountBudget = null, + requestedSumBudget = null, + 
requestedMeanBudget = RelativeBudgetPerOpSpec(1.0), + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 1.0, + sumNoiseDelta = 0.1, + ), + MEAN_DEFAULT( + totalMetricsBudget = TotalBudget(2.0, 0.2), + requestedCountBudget = null, + requestedSumBudget = null, + requestedMeanBudget = null, + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 1.0, + sumNoiseDelta = 0.1, + ), + MEAN_COUNT_PROVIDED_SUM_DEFAULT( + totalMetricsBudget = TotalBudget(2.0, 0.2), + requestedCountBudget = AbsoluteBudgetPerOpSpec(1.5, 0.05), + requestedSumBudget = null, + requestedMeanBudget = null, + countNoiseEpsilon = 1.5, + countNoiseDelta = 0.05, + sumNoiseEpsilon = 0.5, + sumNoiseDelta = 0.15, + ), + } + + @Test + fun aggregate_withAbsoluteNaiveBudgetSplit_allocatesBudgetAccordingToAggregationBudgetSpec( + @TestParameter testCase: MeanBudgetTestCase + ) { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val gaussianNoiseSpy: GaussianNoise = spy() + val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> gaussianNoiseSpy } + val budgetSpec = + DpEngineBudgetSpec(budget = testCase.totalMetricsBudget, accountingStrategy = NAIVE) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, budgetSpec, noiseFactoryMock) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT, testCase.requestedCountBudget), + MetricDefinition(SUM, testCase.requestedSumBudget), + MetricDefinition(MEAN, testCase.requestedMeanBudget), + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + maxContributionsPerPartition = 20, + minValue = -10.0, + maxValue = 10.0, + ) + + val result = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + // Access the result to trigger the computation. 
+ assertThat(result.data.toList()).isNotEmpty() + // Check parameters of the noise addition to Count. + val countDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + eq(2.0), // True count + anyInt(), + anyDouble(), + eq(testCase.countNoiseEpsilon), + countDeltaCaptor.capture(), + ) + assertThat(countDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.countNoiseDelta) + // Check parameters of the noise addition to Sum. + val sumDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + eq(3.0), // True sum + anyInt(), + anyDouble(), + eq(testCase.sumNoiseEpsilon), + sumDeltaCaptor.capture(), + ) + assertThat(sumDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.sumNoiseDelta) + } + + /** Parameterized cases for the VARIANCE budget test. Each case expects the same (epsilon, delta) for the count, sum and sum-of-squares noise calls, i.e. a three-way even split: (1.0, 0.1) each from a total of (3.0, 0.3) and (2.0, 0.2) each from a total of (6.0, 0.6). requestedVarianceBudget is nullable; DEFAULT_BUDGET passes null. */ enum class VarianceBudgetTestCase( + val totalMetricsBudget: TotalBudget, + val requestedVarianceBudget: BudgetPerOpSpec?, + val countNoiseEpsilon: Double, + val countNoiseDelta: Double, + val sumNoiseEpsilon: Double, + val sumNoiseDelta: Double, + val sumSquaresNoiseEpsilon: Double, + val sumSquaresNoiseDelta: Double, + ) { + DEFAULT_BUDGET( + totalMetricsBudget = TotalBudget(3.0, 0.3), + requestedVarianceBudget = null, + countNoiseEpsilon = 1.0, + countNoiseDelta = 0.1, + sumNoiseEpsilon = 1.0, + sumNoiseDelta = 0.1, + sumSquaresNoiseEpsilon = 1.0, + sumSquaresNoiseDelta = 0.1, + ), + VARIANCE_EVEN_SPLIT_ABSOLUTE( + totalMetricsBudget = TotalBudget(6.0, 0.6), + requestedVarianceBudget = AbsoluteBudgetPerOpSpec(6.0, 0.6), + countNoiseEpsilon = 2.0, + countNoiseDelta = 0.2, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + sumSquaresNoiseEpsilon = 2.0, + sumSquaresNoiseDelta = 0.2, + ), + VARIANCE_EVEN_SPLIT_RELATIVE( + totalMetricsBudget = TotalBudget(6.0, 0.6), + requestedVarianceBudget = RelativeBudgetPerOpSpec(6.0), + countNoiseEpsilon = 2.0, + countNoiseDelta = 0.2, + sumNoiseEpsilon = 2.0, + sumNoiseDelta = 0.2, + sumSquaresNoiseEpsilon = 2.0, + sumSquaresNoiseDelta = 0.2, + ), + } + + @Test + fun
aggregate_withAbsoluteNaiveBudgetSplit_allocatesBudgetAccordingToAggregationBudgetSpec( + @TestParameter testCase: VarianceBudgetTestCase + ) { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val gaussianNoiseSpy: GaussianNoise = spy() + val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> gaussianNoiseSpy } + val budgetSpec = + DpEngineBudgetSpec(budget = testCase.totalMetricsBudget, accountingStrategy = NAIVE) + val dpEngine = DpEngine.createForTesting(LOCAL_EF, budgetSpec, noiseFactoryMock) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(VARIANCE, testCase.requestedVarianceBudget)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + maxContributionsPerPartition = 20, + minValue = -10.0, + maxValue = 10.0, + ) + + val result = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + // Access the result to trigger the computation. + assertThat(result.data.toList()).isNotEmpty() + // Check parameters of the noise addition to Count. + val countDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + eq(2.0), // True count + anyInt(), + anyDouble(), + eq(testCase.countNoiseEpsilon), + countDeltaCaptor.capture(), + ) + assertThat(countDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.countNoiseDelta) + // Check parameters of the noise addition to Sum. + val sumDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + eq(3.0), // True sum + anyInt(), + anyDouble(), + eq(testCase.sumNoiseEpsilon), + sumDeltaCaptor.capture(), + ) + assertThat(sumDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.sumNoiseDelta) + // Check parameters of the noise addition to Sum of Squares. 
+ val sumOfSquaresDeltaCaptor = argumentCaptor() + verify(gaussianNoiseSpy) + .addNoise( + eq(5.0), // True sum of squares + anyInt(), + anyDouble(), + eq(testCase.sumSquaresNoiseEpsilon), + sumOfSquaresDeltaCaptor.capture(), + ) + assertThat(sumOfSquaresDeltaCaptor.firstValue).isWithin(1e-15).of(testCase.sumSquaresNoiseDelta) + } + + companion object { + // A DpEngineBudgetSpec with budget large enough to make sure that tests don't run out of it. + private val LARGE_BUDGET_SPEC = + DpEngineBudgetSpec(budget = TotalBudget(epsilon = 2000.0, delta = 0.999999)) + private val PRIVACY_ID_COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 1_000_000, + maxContributionsPerPartition = 1_000_000, + ) + private val COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 1_000_000, + maxContributionsPerPartition = 1_000_000, + ) + private val SUM_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(SUM)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 1_000_000, + minTotalValue = -Double.MAX_VALUE, + maxTotalValue = Double.MAX_VALUE, + ) + private val COUNT_AND_SUM_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM)), + noiseKind = GAUSSIAN, + // Choose large values to avoid contribution clamping but keep the values low enough to + // avoid sensitivity overflow. 
+ maxPartitionsContributed = 100, + maxContributionsPerPartition = 100, + minTotalValue = -100.0, + maxTotalValue = 100.0, + ) + private val MEAN_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(MEAN)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + maxContributionsPerPartition = 20, + minValue = -10.0, + maxValue = 10.0, + ) + private val QUANTILES_PARAMS = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(QUANTILES(ranks = ImmutableList.of(0.0, 0.0001, 0.5, 0.999, 1.0))) + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + maxContributionsPerPartition = 20, + minValue = -10.0, + maxValue = 10.0, + ) + private val LOCAL_EF = LocalEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTestFactory.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTestFactory.kt new file mode 100644 index 00000000..fb8a8ee8 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpEngineTestFactory.kt @@ -0,0 +1,34 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + package com.google.privacy.differentialprivacy.pipelinedp4j.core + + import com.google.privacy.differentialprivacy.Noise + import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetAccountantFactory + import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory + + /** Test-only extension on DpEngine.Factory that constructs a DpEngine directly from [encoderFactory], a budget accountant obtained via BudgetAccountantFactory.forStrategy(budgetSpec.accountingStrategy, budgetSpec.budget), [noiseFactory] and [computationalGraphFactory]. The last two default to NoiseFactory() and ComputationalGraphFactory(), so a test can replace either with a spy or a stub. */ internal fun DpEngine.Factory.createForTesting( + encoderFactory: EncoderFactory, + budgetSpec: DpEngineBudgetSpec, + noiseFactory: (NoiseKind) -> Noise = NoiseFactory(), + computationalGraphFactory: ComputationalGraphFactory = ComputationalGraphFactory(), + ) = + DpEngine( + encoderFactory, + BudgetAccountantFactory.forStrategy(budgetSpec.accountingStrategy, budgetSpec.budget), + noiseFactory, + computationalGraphFactory, + ) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParamsTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParamsTest.kt new file mode 100644 index 00000000..54e3e4fe --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/DpFunctionsParamsTest.kt @@ -0,0 +1,490 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.QUANTILES +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.RelativeBudgetPerOpSpec +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import com.google.testing.junit.testparameterinjector.TestParameters +import kotlin.test.assertFailsWith +import org.junit.Test +import org.junit.runner.RunWith + +@RunWith(TestParameterInjector::class) +class DpFunctionsParamsTest { + @Test + fun validateAggregationParams_validParams_doesntThrow() { + validateAggregationParams( + AGGREGATION_PARAMS, + usePublicPartitions = true, + hasValueExtractor = true, + ) + validateAggregationParams( + AGGREGATION_PARAMS, + usePublicPartitions = false, + hasValueExtractor = true, + ) + validateAggregationParams( + AGGREGATION_PARAMS, + usePublicPartitions = false, + hasValueExtractor = false, + ) + validateAggregationParams( + AGGREGATION_PARAMS.copy( + maxContributionsPerPartition = null, + metrics = ImmutableList.of(MetricDefinition(SUM)), + minValue = null, + maxValue = null, + minTotalValue = 1.0, + maxTotalValue = 2.0, + ), + 
usePublicPartitions = true, + hasValueExtractor = true, + ) + } + + enum class InvalidAggregationParamsTestCase( + val aggregationParams: AggregationParams, + val publicPartitions: Boolean = false, + val hasValueExtractor: Boolean = true, + val exceptionMessage: String, + ) { + NOT_POSITIVE_MAX_PARTITION_CONTRIBUTED( + aggregationParams = AGGREGATION_PARAMS.copy(maxPartitionsContributed = 0), + exceptionMessage = "maxPartitionsContributed must be positive. Provided value: 0.", + ), + MAX_PARTITIONS_CONTRIBUTED_NOT_SET_WHEN_PRIVACY_LEVEL_REQUIRES_CROSS_PARTITION_BOUNDING( + aggregationParams = + AGGREGATION_PARAMS.copy( + privacyLevel = PrivacyLevel.DATASET_LEVEL, + maxPartitionsContributed = null, + maxContributions = null, + ), + exceptionMessage = + "maxPartitionsContributed or maxContributions must be set because specified DATASET_LEVEL privacy level requires cross partition bounding", + ), + NOT_POSITIVE_PRETHRESHOLD( + aggregationParams = AGGREGATION_PARAMS.copy(preThreshold = 0), + exceptionMessage = "preThreshold must be positive. Provided value: 0", + ), + NO_METRICS( + aggregationParams = AGGREGATION_PARAMS.copy(metrics = ImmutableList.of()), + exceptionMessage = "metrics must not be empty.", + ), + ZERO_MAX_CONTRIBUTIONS_PER_PARTITION( + aggregationParams = AGGREGATION_PARAMS.copy(maxContributionsPerPartition = 0), + exceptionMessage = "maxContributionsPerPartition must be positive. Provided value: 0.", + ), + NEGATIVE_MAX_CONTRIBUTIONS_PER_PARTITION( + aggregationParams = AGGREGATION_PARAMS.copy(maxContributionsPerPartition = -1), + exceptionMessage = "maxContributionsPerPartition must be positive. Provided value: -1.", + ), + ZERO_MAX_CONTRIBUTIONS( + aggregationParams = AGGREGATION_PARAMS.copy(maxContributions = 0), + exceptionMessage = "maxContributions must be positive. Provided value: 0.", + ), + NEGATIVE_MAX_CONTRIBUTIONS( + aggregationParams = AGGREGATION_PARAMS.copy(maxContributions = -1), + exceptionMessage = "maxContributions must be positive. 
Provided value: -1.", + ), + MUTUALLY_EXCLUSIVE_MAX_CONTRIBUTIONS_PER_PARTITION_MAX_CONTRIBUTIONS( + aggregationParams = + AGGREGATION_PARAMS.copy( + maxPartitionsContributed = null, + maxContributionsPerPartition = 1, + maxContributions = 1, + ), + exceptionMessage = + "maxContributions and maxContributionsPerPartition are mutually exclusive. " + + "Provided values: maxContributions=1, maxContributionsPerPartition=1", + ), + MUTUALLY_EXCLUSIVE_MAX_PARTITIONS_CONTRIBUTED_MAX_CONTRIBUTIONS( + aggregationParams = + AGGREGATION_PARAMS.copy( + maxPartitionsContributed = 1, + maxContributionsPerPartition = null, + maxContributions = 1, + ), + exceptionMessage = + "maxContributions and maxPartitionsContributed are mutually exclusive. " + + "Provided values: maxContributions=1, maxPartitionsContributed=1", + ), + MUTUALLY_EXCLUSIVE_MAX_CONTRIBUTIONS_ALL_SET( + aggregationParams = + AGGREGATION_PARAMS.copy( + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + maxContributions = 1, + ), + exceptionMessage = + "maxContributions and maxPartitionsContributed are mutually exclusive. Provided values: maxContributions=1, maxPartitionsContributed=1", + ), + MIN_VALUE_SET_MAX_VALUE_NOT_SET( + aggregationParams = AGGREGATION_PARAMS.copy(minValue = 1.0, maxValue = null), + exceptionMessage = "minValue and maxValue must be simultaneously equal or not equal to null.", + ), + MIN_VALUE_NOT_SET_MAX_VALUE_SET( + aggregationParams = AGGREGATION_PARAMS.copy(minValue = null, maxValue = 2.0), + exceptionMessage = + "minValue and maxValue must be simultaneously equal or not equal to " + + "null. Provided values: minValue=null, maxValue=2.0", + ), + MIN_VALUE_GREATER_THAN_MAX_VALUE( + aggregationParams = AGGREGATION_PARAMS.copy(minValue = 1.5, maxValue = 1.0), + exceptionMessage = + "minValue must be less than maxValue. 
Provided values: " + "minValue=1.5, maxValue=1.0", + ), + MIN_VALUE_IS_EQUAL_TO_MAX_VALUE( + aggregationParams = AGGREGATION_PARAMS.copy(minValue = 1.5, maxValue = 1.5), + exceptionMessage = + "minValue must be less than maxValue. Provided values: " + "minValue=1.5, maxValue=1.5", + ), + MIN_TOTAL_VALUE_SET_MAX_TOTAL_VALUE_NOT_SET( + aggregationParams = AGGREGATION_PARAMS.copy(minTotalValue = 1.0, maxTotalValue = null), + exceptionMessage = + "minTotalValue and maxTotalValue must be simultaneously equal or not equal to null. " + + "Provided values: minTotalValue=1.0, maxTotalValue=null", + ), + MIN_TOTAL_VALUE_NOT_SET_MAX_TOTAL_VALUE_SET( + aggregationParams = AGGREGATION_PARAMS.copy(minTotalValue = null, maxTotalValue = 2.0), + exceptionMessage = + "minTotalValue and maxTotalValue must be simultaneously equal or not equal to null.", + ), + MIN_TOTAL_VALUE_GREATER_THAN_MAX_TOTAL_VALUE( + aggregationParams = AGGREGATION_PARAMS.copy(minTotalValue = 2.0, maxTotalValue = 0.0), + exceptionMessage = + "minTotalValue must be less or equal to maxTotalValue. 
Provided values: " + + "minTotalValue=2.0, maxTotalValue=0.0", + ), + MEAN_WITH_TOTAL_VALUE( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(SUM), MetricDefinition(MEAN)), + minValue = 0.0, + maxValue = 3.0, + minTotalValue = 1.5, + maxTotalValue = 5.0, + ), + exceptionMessage = + "(minTotalValue, maxTotalValue) should not be set if MEAN metric is requested", + ), + VARIANCE_WITH_TOTAL_VALUE( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(SUM), MetricDefinition(VARIANCE)), + minValue = 0.0, + maxValue = 3.0, + minTotalValue = 1.5, + maxTotalValue = 5.0, + ), + exceptionMessage = + "(minTotalValue, maxTotalValue) should not be set if VARIANCE metric is requested", + ), + MAX_CONTRIBUTIONS_PER_PARTITION_MAX_CONTRIBUTIONS_NOT_SET_FOR_COUNT( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM)), + maxContributionsPerPartition = null, + maxContributions = null, + minTotalValue = -1.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = + "maxContributionsPerPartition or maxContributions must be set for COUNT metric.", + ), + MAX_CONTRIBUTIONS_PER_PARTITION_MAX_CONTRIBUTIONS_NOT_SET_FOR_MEAN( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(SUM)), + maxContributionsPerPartition = null, + maxContributions = null, + minTotalValue = -1.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = + "maxContributionsPerPartition or maxContributions must be set for MEAN metric.", + ), + MAX_CONTRIBUTIONS_PER_PARTITION_MAX_CONTRIBUTIONS_NOT_SET_FOR_QUANTILES( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(QUANTILES(ranks = ImmutableList.of())), + MetricDefinition(SUM), + ), + maxContributionsPerPartition = null, + minTotalValue = 
-1.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = "maxContributionsPerPartition must be set for QUANTILES metric.", + ), + MIN_TOTAL_VALUE_NOT_SET_FOR_SUM( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(SUM)), + minTotalValue = null, + maxTotalValue = null, + ), + exceptionMessage = "(minTotalValue, maxTotalValue) must be set for SUM metrics.", + ), + MIN_VALUE_NOT_SET_FOR_MEAN( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(SUM)), + minTotalValue = 0.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = "(minValue, maxValue) must be set for MEAN metric.", + ), + VALUE_EXTRACTOR_NOT_SET_FOR_SUM_AND_MEAN( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(MEAN), MetricDefinition(SUM)) + ), + hasValueExtractor = false, + exceptionMessage = "Metrics [MEAN, SUM] require a value extractor.", + ), + MIN_VALUE_NOT_SET_FOR_QUANTILES( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(QUANTILES(ranks = ImmutableList.of()))), + minTotalValue = 0.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = "(minValue, maxValue) must be set for QUANTILES metric.", + ), + BUDGET_SPEC_SET_FOR_MEAN_AND_COUNT( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(MEAN, RelativeBudgetPerOpSpec(weight = 1.0)), + MetricDefinition(COUNT, AbsoluteBudgetPerOpSpec(epsilon = 2.0, delta = 1e-12)), + ) + ), + exceptionMessage = "BudgetPerOpSpec can not be set for both COUNT and MEAN metrics.", + ), + BUDGET_SPEC_SET_FOR_MEAN_AND_SUM( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(MEAN, RelativeBudgetPerOpSpec(weight = 1.0)), + MetricDefinition(SUM, 
RelativeBudgetPerOpSpec(weight = 2.0)), + ) + ), + exceptionMessage = "BudgetPerOpSpec can not be set for both SUM and MEAN metrics.", + ), + MAX_CONTRIBUTIONS_PER_PARTITION_MAX_CONTRIBUTIONS_NOT_SET_FOR_VARIANCE( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(VARIANCE)), + maxContributionsPerPartition = null, + maxContributions = null, + minTotalValue = -1.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = + "maxContributionsPerPartition or maxContributions must be set for VARIANCE metric.", + ), + MIN_VALUE_NOT_SET_FOR_VARIANCE( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(VARIANCE)), + minTotalValue = 0.0, + maxTotalValue = 1.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = "(minValue, maxValue) must be set for VARIANCE metric.", + ), + MAX_VALUE_NOT_SET_FOR_VARIANCE( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(VARIANCE)), + minTotalValue = -1.0, + maxTotalValue = 0.0, + minValue = null, + maxValue = null, + ), + exceptionMessage = "(minValue, maxValue) must be set for VARIANCE metric.", + ), + BUDGET_SPEC_SET_FOR_VARIANCE_AND_MEAN( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(VARIANCE, RelativeBudgetPerOpSpec(weight = 1.0)), + MetricDefinition(MEAN, RelativeBudgetPerOpSpec(weight = 1.0)), + ) + ), + exceptionMessage = "BudgetPerOpSpec can not be set for both MEAN and VARIANCE metrics.", + ), + BUDGET_SPEC_SET_FOR_VARIANCE_AND_COUNT( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(VARIANCE, RelativeBudgetPerOpSpec(weight = 1.0)), + MetricDefinition(COUNT, AbsoluteBudgetPerOpSpec(epsilon = 2.0, delta = 1e-12)), + ) + ), + exceptionMessage = "BudgetPerOpSpec can not be set for both COUNT and VARIANCE metrics.", + ), + BUDGET_SPEC_SET_FOR_VARIANCE_AND_SUM( + 
aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(VARIANCE, RelativeBudgetPerOpSpec(weight = 1.0)), + MetricDefinition(SUM, RelativeBudgetPerOpSpec(weight = 2.0)), + ) + ), + exceptionMessage = "BudgetPerOpSpec can not be set for both SUM and VARIANCE metrics.", + ), + PARTITION_SELECTION_BUDGET_FOR_PUBLIC_PARTITION( + aggregationParams = + AGGREGATION_PARAMS.copy( + partitionSelectionBudget = AbsoluteBudgetPerOpSpec(epsilon = 1.0, delta = 1e-12) + ), + publicPartitions = true, + exceptionMessage = "partitionSelectionBudget can not be set for public partitions.", + ), + DUPLICATE_METRIC_TYPES( + aggregationParams = + AGGREGATION_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(COUNT), + MetricDefinition(PRIVACY_ID_COUNT), + MetricDefinition(COUNT), + ) + ), + exceptionMessage = + "metrics must not contain duplicate metric types. Provided " + + "[COUNT, PRIVACY_ID_COUNT, COUNT].", + ), + } + + /** Runs validateAggregationParams on each invalid case and checks that the thrown exception's message contains the case's exceptionMessage. */ @Test + fun validateAggregationParams_invalidParams_fails( + @TestParameter testCase: InvalidAggregationParamsTestCase + ) { + val e = + assertFailsWith { + validateAggregationParams( + testCase.aggregationParams, + testCase.publicPartitions, + testCase.hasValueExtractor, + ) + } + assertThat(e).hasMessageThat().contains(testCase.exceptionMessage) + } + + @Test + fun validQuantiles_doesntThrow() { + val unused = QUANTILES(ranks = ImmutableList.of(0.0, 0.0001, 0.5, 0.999, 1.0)) + } + + /** Expects QUANTILES construction to fail for a rank just below 0 and for a rank just above 1. NOTE(review): kotlin.test's assertFailsWith(message, block) uses its String argument as the assertion-failure message, not as a pattern for the thrown exception's message, so "in [0, 1]" is presumably never compared with the exception text here, unlike the hasMessageThat() check in validateAggregationParams_invalidParams_fails. Confirm the intent. */ @Test + @TestParameters("{ranks: [-0.00001]}") + @TestParameters("{ranks: [1.00001]}") + fun invalidQuantiles_fail(ranks: List) { + assertFailsWith("in [0, 1]") { + QUANTILES(ImmutableList.copyOf(ranks)) + } + } + + @Test + fun validateSelectPartitionsParams_validParams_doesntThrow() { + validateSelectPartitionsParams(SELECT_PARTITIONS_PARAMS) + } + + enum class InvalidSelectPartitionsParamsTestCase( + val selectPartitionsParams: SelectPartitionsParams, + val exceptionMessage: String, + ) { + NOT_POSITIVE_MAX_PARTITION_CONTRIBUTED( +
selectPartitionsParams = SELECT_PARTITIONS_PARAMS.copy(maxPartitionsContributed = 0), + exceptionMessage = "maxPartitionsContributed must be positive. Provided value: 0.", + ), + NOT_POSITIVE_PRETHRESHOLD( + selectPartitionsParams = SELECT_PARTITIONS_PARAMS.copy(preThreshold = 0), + exceptionMessage = "preThreshold must be positive. Provided value: 0", + ), + HUGE_MAX_PARTTIONS_CONTRIBUTED( + selectPartitionsParams = + SELECT_PARTITIONS_PARAMS.copy(maxPartitionsContributed = 110_000_000), + exceptionMessage = + "maxPartitionsContributed must be less than 100000000 Provided values: maxPartitionsContributed=110000000", + ), + } + + @Test + fun validateSelectPartitionsParams_invalidParams_fails( + @TestParameter testCase: InvalidSelectPartitionsParamsTestCase + ) { + val e = + assertFailsWith { + validateSelectPartitionsParams(testCase.selectPartitionsParams) + } + assertThat(e).hasMessageThat().contains(testCase.exceptionMessage) + } + + companion object { + val AGGREGATION_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = NoiseKind.LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + maxContributions = null, + minValue = 0.0, + maxValue = 1.0, + ) + + val SELECT_PARTITIONS_PARAMS = + SelectPartitionsParams( + maxPartitionsContributed = 2, + budget = AbsoluteBudgetPerOpSpec(epsilon = 1.0, delta = 1e-12), + preThreshold = 10, + ) + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/EndToEndTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/EndToEndTest.kt new file mode 100644 index 00000000..b5da820d --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/EndToEndTest.kt @@ -0,0 +1,279 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITH_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AbsoluteBudgetPerOpSpec +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.TotalBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.local.createLocalEngine +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import 
com.google.privacy.differentialprivacy.pipelinedp4j.proto.dpAggregates +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.Test +import org.junit.runner.RunWith + +/** + * End-to-end tests for Kotlin CUJs starting with DP engine. Each of these tests creates a local DP + * Engine that can apply noise to the result and evaluates the expected output. + * + * Most aggregate tests with privacy enabled compare noisy metrics against exact values within a + * small tolerance; the no-privacy tests expect exact values. + * + * TODO: Add tests for all supported backends. + */ +@RunWith(TestParameterInjector::class) +class EndToEndTest { + + /** Tests for partition selection below */ + @Test + fun selectPartitions_withManyContributions_keepsPartition() { + // Create dataset with 1 partition and 100 privacy ids which contribute to this partition. + val inputData = + LocalCollection(List(100) { TestDataRow("PrivacyId$it", "Partition1", 2.0) }.asSequence()) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Every privacy id contributes to a single partition, so a bound of 2 drops no contributions. + val params = SelectPartitionsParams(maxPartitionsContributed = 2) + + val dpAggregates = + dpEngine.selectPartitions(inputData, params, testDataExtractors) as LocalCollection + dpEngine.done() + + val partitionResult = dpAggregates.data.toList() + assertThat(partitionResult).containsExactly("Partition1") + } + + @Test + fun aggregate_preAggregationPartitionSelection_keepsPartitionAndCalculatesCorrectResult() { + // Create dataset with 1 partition and 100 privacy ids which contribute to this partition. + val inputData = + LocalCollection(List(100) { TestDataRow("PrivacyId$it", "US", 2.0) }.asSequence()) + // Create a local DP engine with minimal noise so results are close to deterministic. + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. 
+ val params = + AggregationParams( + metrics = + ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM), MetricDefinition(MEAN)), + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -2.0, + maxValue = 2.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors) as LocalTable + dpEngine.done() + + val partitionResult = dpAggregates.data.toMap()["US"]!! + assertThat(partitionResult.count).isWithin(1e-1).of(100.0) + assertThat(partitionResult.sum).isWithin(1e-1).of(200.0) + assertThat(partitionResult.mean).isWithin(1e-10).of(partitionResult.sum / partitionResult.count) + } + + @Test + fun aggregate_postAggregationPartitionSelection_keepsPartitionAndCalculatesCorrectResult() { + // Create dataset with 1 partition and 100 privacy ids which contribute to this partition. + val inputData = + LocalCollection(List(100) { TestDataRow("PrivacyId$it", "US", 2.0) }.asSequence()) + // Create a local DP engine with minimal noise so results are close to deterministic. + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -2.0, + maxValue = 2.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors) as LocalTable + dpEngine.done() + + val partitionResult = dpAggregates.data.toMap()["US"]!! + assertThat(partitionResult.privacyIdCount).isWithin(1e-1).of(100.0) + assertThat(partitionResult.count).isWithin(1e-1).of(100.0) + } + + /** Tests for no privacy below */ + @Test + fun aggregate_noPrivacyAndNoContributionBounding_returnsNonDpResult() { + // Create dataset with 2 partitions and 100 privacy ids which contribute to each partition twice. 
+ val inputData = + LocalCollection( + (1..100) + .flatMap { + listOf( + TestDataRow("PrivacyId$it", "US", value = 1.0), + TestDataRow("PrivacyId$it", "US", value = 2.0), + TestDataRow("PrivacyId$it", "Canada", value = 1.0), + TestDataRow("PrivacyId$it", "Canada", value = 2.0), + ) + } + .asSequence() + ) + val lowBudgetWithLotsOfNoise = TotalBudget(epsilon = 1e-10, delta = 1e-10) + val dpEngine = DpEngine.createLocalEngine(DpEngineBudgetSpec(budget = lowBudgetWithLotsOfNoise)) + val params = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT)), + noiseKind = LAPLACE, + // These bounds of 1 would drop contributions, but bounding is disabled by privacyLevel below. + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors) as LocalTable + dpEngine.done() + + // Exact, un-noised counts are expected: 100 privacy ids x 2 rows in each partition. + assertThat(dpAggregates.data.toList()) + .containsExactly( + "US" to dpAggregates { count = 200.0 }, + "Canada" to dpAggregates { count = 200.0 }, + ) + } + + @Test + fun aggregate_noPrivacyAndWithContributionBounding_returnsBoundedNonDpResult() { + // Create dataset with 2 partitions and 100 privacy ids which contribute to each partition twice. + val inputData = + LocalCollection( + (1..100) + .flatMap { + listOf( + TestDataRow("PrivacyId$it", "US", value = 1.0), + TestDataRow("PrivacyId$it", "US", value = 2.0), + TestDataRow("PrivacyId$it", "Canada", value = 1.0), + TestDataRow("PrivacyId$it", "Canada", value = 2.0), + ) + } + .asSequence() + ) + + val lowBudgetWithLotsOfNoise = TotalBudget(epsilon = 1e-10, delta = 1e-10) + + val dpEngine = DpEngine.createLocalEngine(DpEngineBudgetSpec(budget = lowBudgetWithLotsOfNoise)) + val params = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT)), + noiseKind = LAPLACE, + maxPartitionsContributed = 2, 
// Contributions to each of the two partitions are kept. 
+ maxContributionsPerPartition = 1, // Only one of each privacy id's two contributions per partition is kept. + privacyLevel = NONE_WITH_CONTRIBUTION_BOUNDING, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors) as LocalTable + dpEngine.done() + + // No noise is applied, but bounding keeps one of two contributions per privacy id, halving each count. + assertThat(dpAggregates.data.toList()) + .containsExactly( + "US" to dpAggregates { count = 100.0 }, + "Canada" to dpAggregates { count = 100.0 }, + ) + } + + /** Tests for correct metric calculations below */ + @Test + fun aggregate_withPublicPartitions_calculatesCorrectResult() { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Use low bounds to avoid sensitivity overflow when adding noise. + val params = + AggregationParams( + metrics = + ImmutableList.of(MetricDefinition(COUNT), MetricDefinition(SUM), MetricDefinition(MEAN)), + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -2.0, + maxValue = 2.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + val partitionResult = dpAggregates.data.toMap()["US"]!! 
+ assertThat(partitionResult.count).isWithin(1e-1).of(2.0) + assertThat(partitionResult.sum).isWithin(1e-1).of(3.0) + assertThat(partitionResult.mean).isWithin(1e-10).of(partitionResult.sum / partitionResult.count) + } + + @Test + fun aggregate_withPublicPartitions_calculatesDifferentResultsInDifferentRuns() { + val inputData = + LocalCollection(sequenceOf(TestDataRow("Alice", "US", 1.0), TestDataRow("Bob", "US", 2.0))) + val publicPartitions = LocalCollection(sequenceOf("US")) + val dpEngine = DpEngine.createLocalEngine(LARGE_BUDGET_SPEC) + // Each metric gets its own explicit AbsoluteBudgetPerOpSpec(0.1, 1e-5) and Gaussian noise is used. + val params = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + MetricDefinition(SUM, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + MetricDefinition(PRIVACY_ID_COUNT, AbsoluteBudgetPerOpSpec(0.1, 1e-5)), + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 5, + maxContributionsPerPartition = 5, + minTotalValue = -5.0, + maxTotalValue = 5.0, + ) + + val dpAggregates = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + val dpAggregatesAnotherRun = + dpEngine.aggregate(inputData, params, testDataExtractors, publicPartitions) + as LocalTable + dpEngine.done() + + // Noise is expected to make every metric differ between the two runs over the same input. + assertThat(dpAggregates.data.toMap()["US"]!!.count) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.count) + assertThat(dpAggregates.data.toMap()["US"]!!.sum) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.sum) + assertThat(dpAggregates.data.toMap()["US"]!!.privacyIdCount) + .isNotEqualTo(dpAggregatesAnotherRun.data.toMap()["US"]!!.privacyIdCount) + } + + companion object { + // A DpEngineBudgetSpec with budget large enough to make sure that tests don't run out of it. 
+ private val LARGE_BUDGET_SPEC = + DpEngineBudgetSpec(budget = TotalBudget(epsilon = 2000.0, delta = 0.999999)) + /* NOTE(review): LOCAL_EF is not referenced by any test in this class; confirm before removing. */ + private val LOCAL_EF = LocalEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/ExactPrivacyIdCountCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/ExactPrivacyIdCountCombinerTest.kt new file mode 100644 index 00000000..60bcc035 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/ExactPrivacyIdCountCombinerTest.kt @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.truth.Truth.assertThat +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdCountAccumulator +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import kotlin.test.assertFailsWith +import org.junit.Test +import org.junit.runner.RunWith + +/** + * Unit tests for [ExactPrivacyIdCountCombiner]: accumulator creation, accumulator merging, and the + * rejected computeMetrics call. + */ +@RunWith(TestParameterInjector::class) +class ExactPrivacyIdCountCombinerTest { + + /** A privacy id that contributes three values still produces an accumulator with count = 1. */ + @Test + fun createAccumulator_initsAccumulatorWithOne() { + val combiner = ExactPrivacyIdCountCombiner() + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) }) + + assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 1 }) + } + + /** Merging two accumulators adds their counts (1 + 2 = 3). */ + @Test + fun mergeAccumulators_sumsCounts() { + val combiner = ExactPrivacyIdCountCombiner() + + val accumulator = + combiner.mergeAccumulators( + privacyIdCountAccumulator { count = 1 }, + privacyIdCountAccumulator { count = 2 }, + ) + + assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 3 }) + } + + /** + * computeMetrics is expected to fail with a message saying it is not supported. + * NOTE(review): the exception type argument of assertFailsWith is not visible in this copy. 
+ */ + @Test + fun computeMetrics_throwsExceptions() { + val combiner = ExactPrivacyIdCountCombiner() + + val throwable = + assertFailsWith { + combiner.computeMetrics(privacyIdCountAccumulator { count = 1 }) + } + assertThat(throwable) + .hasMessageThat() + .contains("ExactPrivacyIdCountCombiner does not support compute_metrics") + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/MeanCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/MeanCombinerTest.kt new file mode 100644 index 00000000..c8e7bc2c --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/MeanCombinerTest.kt @@ -0,0 +1,433 @@ +/* + * Copyright 2024
Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.meanAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.BeforeClass +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.mock +import org.mockito.kotlin.verify + +@RunWith(TestParameterInjector::class) 
+class MeanCombinerTest { + /** + * Shared fixtures. AGG_PARAMS bounds values to [-10, 10]; individual tests override the bounds + * via copy(). The expected values in this class are consistent with normalizedSum being the sum + * of (clamped value - midpoint of [minValue, maxValue]). + */ + companion object { + private val AGG_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(MEAN)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = 3, + maxContributionsPerPartition = 5, + minValue = -10.0, + maxValue = 10.0, + ) + + private val noiseMock: Noise = mock() + private val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> noiseMock } + private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget() + + @JvmStatic + @BeforeClass + fun beforeClass() { + UNUSED_ALLOCATED_BUDGET.initialize(1.1, 1e-3) + } + } + + @Test + fun emptyAccumulator_countAndSumAreZero() { + val combiner = + MeanCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, UNUSED_ALLOCATED_BUDGET, NoiseFactory()) + + val accumulator = combiner.emptyAccumulator() + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 0 + normalizedSum = 0.0 + } + ) + } + + @Test + fun createAccumulator_doesNotClampContributionsWithinBounds() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = -10.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(5.5) }) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 1 + normalizedSum = 5.5 + } + ) + } + + @Test + fun createAccumulator_privacyLevelWithContributionBounding_clampsValues() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = -10.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(-20.0, 30.0) }) + + assertThat(accumulator) + 
.isEqualTo( + meanAccumulator { + count = 2 + normalizedSum = 0.0 // = sum of clamped values = -10 + 10 + } + ) + } + + @Test + fun createAccumulator_privacyLevelWithoutContributionBounding_doesNotClampValues() { + val combiner = + MeanCombiner( +
AGG_PARAMS.copy( + minValue = -10.0, + maxValue = 10.0, + privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING, + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(-20.0, 30.0) }) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 2 + normalizedSum = 10.0 // = sum of non-clamped values = -20 + 30 + } + ) + } + + @Test + fun createAccumulator_normalizesSum() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = 5.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(6.0) }) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 1 + normalizedSum = -1.5 // = 6.0 - 7.5, where 7.5 is the midpoint of [5, 10] + } + ) + } + + @Test + fun createAccumulator_normalizationAndClamping() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = 5.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(30.0) }) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 1 + normalizedSum = 2.5 // = 10 (30 clamped to maxValue) - 7.5 (midpoint of [5, 10]) + } + ) + } + + @Test + fun createAccumulator_aggregatesMultipleElements() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = 4.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(3.0, 5.5, 6.0) }) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 3 + normalizedSum = -5.5 // = sum of normalized values = -3 - 1.5 - 1 (3.0 is clamped to 4.0; midpoint of [4, 10] is 7) + } + ) + } + + @Test + fun mergeAccumulator_sumsValuesInMergedAccumulators() { + val 
combiner = + MeanCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, UNUSED_ALLOCATED_BUDGET, NoiseFactory()) + + val accumulator = + combiner.mergeAccumulators( +
meanAccumulator { + count = 1 + normalizedSum = -5.0 + }, + meanAccumulator { + count = 10 + normalizedSum = 8.5 + }, + ) + + assertThat(accumulator) + .isEqualTo( + meanAccumulator { + count = 11 + normalizedSum = 3.5 + } + ) + } + + @Test + fun computeMetrics_passesCorrectParametersToNoise() { + val countBudget = AllocatedBudget() + countBudget.initialize(2.0, 1e-5) + val sumBudget = AllocatedBudget() + sumBudget.initialize(1.0, 1e-3) + val combiner = + MeanCombiner( + AGG_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(MEAN)), + maxPartitionsContributed = 5, + maxContributionsPerPartition = 7, + minValue = 4.0, + maxValue = 10.0, + ), + countBudget, + sumBudget, + noiseFactoryMock, + ) + val accumulator = meanAccumulator { + count = 10 + normalizedSum = 120.0 + } + + val unused = combiner.computeMetrics(accumulator) + + // Verify noise is added to count, using the count budget (2.0, 1e-5). + verify(noiseMock) + .addNoise( + /* x= */ 10.0, + /* l0Sensitivity= */ 5, + /* lInfSensitivity= */ 7.0, + /* epsilon= */ 2.0, + /* delta= */ 1e-5, + ) + // Verify noise is added to the normalized sum; 21.0 matches 7 contributions * (10 - 4) / 2. + verify(noiseMock) + .addNoise( + /* x= */ 120.0, + /* l0Sensitivity= */ 5, + /* lInfSensitivity= */ 21.0, + /* epsilon= */ 1.0, + /* delta= */ 1e-3, + ) + } + + @Test + fun computeMetrics_returnsMeanCountSum() { + // Use high budget for low noise. 
+ val countBudget = AllocatedBudget() + countBudget.initialize(10000.0, 0.0) + val sumBudget = AllocatedBudget() + sumBudget.initialize(10000.0, 0.0) + + val combiner = + MeanCombiner( + AGG_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(MEAN), + MetricDefinition(SUM), + MetricDefinition(COUNT), + ), + maxPartitionsContributed = 5, + maxContributionsPerPartition = 7, + minValue = 4.0, + maxValue = 12.0, + noiseKind = NoiseKind.LAPLACE, + ), + countBudget, + sumBudget, + NoiseFactory(), + ) + + val accumulator = meanAccumulator { + count = 10 + normalizedSum = 120.0 + } + + val result = combiner.computeMetrics(accumulator) + + /* Noise must change each value, but only slightly because of the very large budget. */ + assertThat(result.count).isNotEqualTo(10.0) + assertThat(result.count).isWithin(0.1).of(10.0) + + val approximatedExpectedSum = /* normalizedSum= */ 120.0 + /* dp_count * midValue= */ 10 * 8 + assertThat(result.sum).isNotEqualTo(approximatedExpectedSum) + assertThat(result.sum).isWithin(1.0).of(approximatedExpectedSum) + assertThat(result.mean).isWithin(1e-9).of(result.sum!! / result.count!!) 
+ } + + /** Requested metric sets and whether count / sum are expected to be non-null in the result. */ + enum class ReturnedMetricsTestCase( + val requestedMetrics: ImmutableList, + val countExpected: Boolean, + val sumExpected: Boolean, + ) { + NO_SUM_NO_COUNT( + requestedMetrics = ImmutableList.of(MetricDefinition(MEAN)), + countExpected = false, + sumExpected = false, + ), + ONLY_SUM( + requestedMetrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(SUM)), + countExpected = false, + sumExpected = true, + ), + ONLY_COUNT( + requestedMetrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(COUNT)), + countExpected = true, + sumExpected = false, + ), + COUNT_AND_SUM( + requestedMetrics = + ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(SUM), MetricDefinition(COUNT)), + countExpected = true, + sumExpected = true, + ), + } + + @Test + fun aggregate_computeMetrics_checkWhichMetricReturned( + @TestParameter testCase: ReturnedMetricsTestCase + ) { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(metrics = testCase.requestedMetrics), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val metrics = + combiner.computeMetrics( + meanAccumulator { + count = 10 + normalizedSum = 120.0 + } + ) + if (testCase.countExpected) { + assertThat(metrics.count).isNotNull() + } else { + assertThat(metrics.count).isNull() + } + + if (testCase.sumExpected) { + assertThat(metrics.sum).isNotNull() + } else { + assertThat(metrics.sum).isNull() + } + } + + @Test + fun computeMetrics_withoutNoise_withMultipleContributionsIncludingEmptyAccumulator_returnsCorrectResult() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy(minValue = -10.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + + val accumulator0 = combiner.emptyAccumulator() + val accumulator1 = + combiner.createAccumulator(privacyIdContributions { values += listOf(10.0, -10.0) }) + val accumulator2 = 
combiner.createAccumulator(privacyIdContributions { values += listOf(9.0) }) + val accumulator3 =
combiner.mergeAccumulators(accumulator0, accumulator1) + val finalAccumulator = combiner.mergeAccumulators(accumulator2, accumulator3) + val result = combiner.computeMetrics(finalAccumulator) + + /* (10.0 - 10.0 + 9.0) / 3 contributed values = 3.0; the empty accumulator adds nothing. */ + assertThat(result.mean).isEqualTo(3.0) + } + + @Test + fun computeMetrics_withoutNoise_onlyEmptyAccumulator_returnsZeroCountAndNaNForSumAndMean() { + val combiner = + MeanCombiner( + AGG_PARAMS.copy( + ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(SUM), MetricDefinition(COUNT)), + minValue = 4.0, + maxValue = 10.0, + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + + val result = combiner.computeMetrics(combiner.emptyAccumulator()) + + assertThat(result.count).isEqualTo(0.0) + // NaN because mean is not defined for count = 0. With noise enabled the result would be a very + // noisy mean with the mid value added. + assertThat(result.mean).isNaN() + // sum is NaN as well because it is computed as count * mean = 0 * NaN = NaN. + assertThat(result.sum).isNaN() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySamplerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySamplerTest.kt new file mode 100644 index 00000000..6a6b5e60 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/NoPrivacySamplerTest.kt @@ -0,0 +1,93 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import org.junit.Test +import org.junit.runner.RunWith +import org.junit.runners.JUnit4 + +/** + * Tests that [NoPrivacySampler] returns every input contribution, keyed by partition key with one + * PrivacyIdContributions entry per contributing privacy id. + */ +@RunWith(JUnit4::class) +class NoPrivacySamplerTest { + @Test + fun sampleContributions_noPrivacy_returnsOriginal() { + val contributionsPk1 = listOf(1.0, 2.0, 3.0, 4.0) + val inputData = + LocalCollection( + sequenceOf( + contributionWithPrivacyId("privacyId1", "pk1", 1.0), + contributionWithPrivacyId("privacyId1", "pk1", 2.0), + contributionWithPrivacyId("privacyId1", "pk1", 3.0), + contributionWithPrivacyId("privacyId1", "pk1", 4.0), + ) + ) + + val sampledData = + NoPrivacySampler(LOCAL_EF.strings(), LOCAL_EF.strings(), LOCAL_EF) + .sampleContributions(inputData) as LocalTable + val returnedContributionsPk1 = sampledData.data.toMap().getValue("pk1").valuesList + + // Returned contributions are of the same size as the originals. + assertThat(returnedContributionsPk1).hasSize(4) + // Returned contributions are exactly the original values (order is not checked). 
+ assertThat(contributionsPk1).containsExactlyElementsIn(returnedContributionsPk1) + } + + /** + * Each (privacy id, partition key) pair yields its own PrivacyIdContributions entry; entries are + * compared as sets per partition key, so their order does not matter. + */ + @Test + fun sampleContributions_noPrivacy_returnsOriginalGroupsByPk() { + val inputData = + LocalCollection( + sequenceOf( + contributionWithPrivacyId("privacyId1", "pk1", 1.0), + contributionWithPrivacyId("privacyId1", "pk1", 2.0), + contributionWithPrivacyId("privacyId1", "pk1", 3.0), + contributionWithPrivacyId("privacyId1", "pk1", 4.0), + contributionWithPrivacyId("privacyId1", "pk2", 5.0), + contributionWithPrivacyId("privacyId1", "pk2", 6.0), + contributionWithPrivacyId("privacyId2", "pk2", 7.0), + contributionWithPrivacyId("privacyId2", "pk2", 8.0), + ) + ) + + val sampledData = + NoPrivacySampler(LOCAL_EF.strings(), LOCAL_EF.strings(), LOCAL_EF) + .sampleContributions(inputData) as LocalTable + val resultMap: Map> = + sampledData.data.groupBy({ (k, _) -> k }, { (_, v) -> v }).mapValues { (_, v) -> v.toSet() } + + assertThat(resultMap) + .isEqualTo( + mapOf( + "pk1" to setOf(privacyIdContributions { values += listOf(1.0, 2.0, 3.0, 4.0) }), + "pk2" to + setOf( + privacyIdContributions { values += listOf(5.0, 6.0) }, + privacyIdContributions { values += listOf(7.0, 8.0) }, + ), + ) + ) + } + + private companion object { + private val LOCAL_EF = LocalEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSamplerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSamplerTest.kt new file mode 100644 index 00000000..a9f47446 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionAndPerPartitionSamplerTest.kt @@ -0,0 +1,183 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.collect.ImmutableList
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import kotlin.Int.Companion.MAX_VALUE
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+/**
+ * Tests for [PartitionAndPerPartitionSampler], run against the in-memory local backend: the input
+ * is a [LocalCollection] and the sampled output is read back as a [LocalTable].
+ */
+@RunWith(JUnit4::class)
+class PartitionAndPerPartitionSamplerTest {
+  // NOTE(review): no test in this class reads AGG_PARAMS. It is kept so the class members stay
+  // as they were; consider removing it together with the imports only it uses.
+  val AGG_PARAMS =
+    AggregationParams(
+      metrics = ImmutableList.of(MetricDefinition(MEAN)),
+      noiseKind = GAUSSIAN,
+      maxPartitionsContributed = MAX_VALUE,
+      maxContributionsPerPartition = MAX_VALUE,
+      minValue = -1.0,
+      maxValue = 1.0,
+    )
+
+  /** One privacy ID contributes to 4 partitions; with a bound of 3 only 3 partitions remain. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributedPartitions() {
+    val contributedPks = listOf("red", "blue", "green", "orange")
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("samePrivacyId", "red", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "blue", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "green", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "orange", 1.0),
+        )
+      )
+
+    val sampledData =
+      PartitionAndPerPartitionSampler(
+          maxPartitionsContributed = 3,
+          maxContributionsPerPartition = MAX_VALUE,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(3)
+    // Returned partition keys are all in the list of the contributed partition keys.
+    assertThat(contributedPks).containsAtLeastElementsIn(returnedPks)
+  }
+
+  /** One privacy ID contributes 4 values to one partition; with a bound of 3 only 3 remain. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributionsPerPartition() {
+    val contributions = listOf(1.0, 2.0, 3.0, 4.0)
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("samePrivacyId", "samePk", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "samePk", 2.0),
+          contributionWithPrivacyId("samePrivacyId", "samePk", 3.0),
+          contributionWithPrivacyId("samePrivacyId", "samePk", 4.0),
+        )
+      )
+
+    val sampledData =
+      PartitionAndPerPartitionSampler(
+          maxPartitionsContributed = MAX_VALUE,
+          maxContributionsPerPartition = 3,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    // getValue fails with a message naming the missing key instead of a bare NPE from `!!`.
+    val returnedContributions = sampledData.data.toMap().getValue("samePk").valuesList
+    assertThat(returnedContributions).hasSize(3)
+    // Returned values are all in the list of the contributed values.
+    assertThat(contributions).containsAtLeastElementsIn(returnedContributions)
+  }
+
+  /** With both bounds at MAX_VALUE, the output has one entry per (privacy ID, partition key). */
+  @Test
+  fun sampleContributions_groupsResultPerPrivacyIdAndPartitionKey() {
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId", "pk", value = 1.0),
+          contributionWithPrivacyId("privacyId", "pk", value = 2.0),
+          contributionWithPrivacyId("privacyId", "anotherPk", value = 3.0),
+          contributionWithPrivacyId("privacyId", "anotherPk", value = 4.0),
+          contributionWithPrivacyId("anotherPrivacyId", "pk", value = 5.0),
+        )
+      )
+
+    val sampledData =
+      PartitionAndPerPartitionSampler(
+          maxPartitionsContributed = MAX_VALUE,
+          maxContributionsPerPartition = MAX_VALUE,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    assertThat(sampledData.data.toList())
+      .containsExactly(
+        Pair("pk", privacyIdContributions { values += listOf(1.0, 2.0) }),
+        Pair("anotherPk", privacyIdContributions { values += listOf(3.0, 4.0) }),
+        Pair("pk", privacyIdContributions { values += 5.0 }),
+      )
+  }
+
+  /** One privacy ID contributes to 100,000 partitions; with a bound of 300 only 300 remain. */
+  @Test
+  fun sampleContributions_samplesFromManyPartitions() {
+    val contributedKeys = (0 until 100_000).map { it.toString() }
+    val inputData =
+      LocalCollection(
+        contributedKeys.map { contributionWithPrivacyId("privacyId", it, value = 1.0) }.asSequence()
+      )
+
+    val sampledData =
+      PartitionAndPerPartitionSampler(
+          maxPartitionsContributed = 300,
+          maxContributionsPerPartition = MAX_VALUE,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(300)
+    assertThat(contributedKeys).containsAtLeastElementsIn(returnedPks)
+  }
+
+  /** One privacy ID contributes 100,000 values to one partition; with a bound of 300, 300 remain. */
+  @Test
+  fun sampleContributions_samplesFromManyContributions() {
+    val inputData =
+      LocalCollection(
+        (0 until 100_000)
+          .map { contributionWithPrivacyId("privacyId", "pk", value = it.toDouble()) }
+          .asSequence()
+      )
+
+    val sampledData =
+      PartitionAndPerPartitionSampler(
+          maxPartitionsContributed = MAX_VALUE,
+          maxContributionsPerPartition = 300,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedContributions = sampledData.data.toMap().getValue("pk").valuesList
+    assertThat(returnedContributions).hasSize(300)
+  }
+
+  private companion object {
+    private val LOCAL_EF = LocalEncoderFactory()
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerTest.kt
new file mode 100644
index 00000000..049fd193
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerTest.kt
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+/**
+ * Tests for [PartitionSampler], run against the in-memory local backend: the input is a
+ * [LocalCollection] and the sampled output is read back as a [LocalTable].
+ */
+@RunWith(JUnit4::class)
+class PartitionSamplerTest {
+  /** One privacy ID contributes to 4 partitions; with a bound of 3 only 3 partitions remain. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributedPartitions() {
+    val contributedPks = listOf("red", "blue", "green", "orange")
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("samePrivacyId", "red", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "blue", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "green", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "orange", 1.0),
+        )
+      )
+    val sampledData =
+      PartitionSampler(
+          maxPartitionsContributed = 3,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(3)
+    // Returned partition keys are all in the list of the contributed partition keys.
+    assertThat(contributedPks).containsAtLeastElementsIn(returnedPks)
+  }
+
+  /** All 3 values contributed to the single partition are kept. */
+  @Test
+  fun sampleContributions_doesntsampleContributionsPerPartition() {
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+        )
+      )
+
+    val sampledData =
+      PartitionSampler(
+          maxPartitionsContributed = 2,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    // getValue fails with a message naming the missing key instead of a bare NPE from `!!`.
+    assertThat(sampledData.data.toMap().getValue("pk").valuesList).hasSize(3)
+  }
+
+  /** Two privacy IDs contributing to the same partition yield two separate output entries. */
+  @Test
+  fun sampleContributions_returnsResultPerPrivacyId() {
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("anotherPrivacyId", "pk", 2.0),
+        )
+      )
+
+    val sampledData =
+      PartitionSampler(
+          maxPartitionsContributed = 5,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    assertThat(sampledData.data.toList())
+      .containsExactly(
+        Pair("pk", privacyIdContributions { values += listOf(1.0, 1.0) }),
+        Pair("pk", privacyIdContributions { values += 2.0 }),
+      )
+  }
+
+  /** One privacy ID contributes to 100,000 partitions; with a bound of 300 only 300 remain. */
+  @Test
+  fun sampleContributions_samplesFromManyPartitions() {
+    val contributedKeys = (0 until 100_000).map { it.toString() }
+    val inputData =
+      LocalCollection(
+        contributedKeys.map { contributionWithPrivacyId("privacyId", it, value = 1.0) }.asSequence()
+      )
+
+    val sampledData =
+      PartitionSampler(
+          maxPartitionsContributed = 300,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(300)
+    assertThat(contributedKeys).containsAtLeastElementsIn(returnedPks)
+  }
+
+  private companion object {
+    private val LOCAL_EF = LocalEncoderFactory()
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerWithoutValuesTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerWithoutValuesTest.kt
new file mode 100644
index 00000000..7f2b771f
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PartitionSamplerWithoutValuesTest.kt
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+/**
+ * Tests for [PartitionSamplerWithoutValues], run against the in-memory local backend: the input is
+ * a [LocalCollection] and the sampled output is read back as a [LocalTable].
+ */
+@RunWith(JUnit4::class)
+class PartitionSamplerWithoutValuesTest {
+  /** One privacy ID contributes to 4 partitions; with a bound of 3 only 3 partitions remain. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributedPartitions() {
+    val contributedPks = listOf("red", "blue", "green", "orange")
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("samePrivacyId", "red", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "blue", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "green", 1.0),
+          contributionWithPrivacyId("samePrivacyId", "orange", 1.0),
+        )
+      )
+    val sampledData =
+      PartitionSamplerWithoutValues(
+          maxPartitionsContributed = 3,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(3)
+    // Returned partition keys are all in the list of the contributed partition keys.
+    assertThat(contributedPks).containsAtLeastElementsIn(returnedPks)
+  }
+
+  /** The output entry for the partition carries no values even though 3 were contributed. */
+  @Test
+  fun sampleContributions_doesntReturnContributedValues() {
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+        )
+      )
+
+    val sampledData =
+      PartitionSamplerWithoutValues(
+          maxPartitionsContributed = 2,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    // Check that all values are dropped. getValue fails with a message naming the missing key
+    // instead of a bare NPE from `!!`.
+    assertThat(sampledData.data.toMap().getValue("pk").valuesList).isEmpty()
+  }
+
+  /** Two privacy IDs contributing to the same partition yield two (empty) output entries. */
+  @Test
+  fun sampleContributions_returnsResultPerPrivacyId() {
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("privacyId", "pk", 1.0),
+          contributionWithPrivacyId("anotherPrivacyId", "pk", 2.0),
+        )
+      )
+
+    val sampledData =
+      PartitionSamplerWithoutValues(
+          maxPartitionsContributed = 5,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    assertThat(sampledData.data.toList())
+      .containsExactly(Pair("pk", privacyIdContributions {}), Pair("pk", privacyIdContributions {}))
+  }
+
+  /** One privacy ID contributes to 100,000 partitions; with a bound of 300 only 300 remain. */
+  @Test
+  fun sampleContributions_samplesFromManyPartitions() {
+    val contributedKeys = (0 until 100_000).map { it.toString() }
+    val inputData =
+      LocalCollection(
+        contributedKeys.map { contributionWithPrivacyId("privacyId", it, value = 1.0) }.asSequence()
+      )
+
+    val sampledData =
+      PartitionSamplerWithoutValues(
+          maxPartitionsContributed = 300,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+
+    val returnedPks = sampledData.data.map { it.first }.toList()
+    assertThat(returnedPks).hasSize(300)
+    assertThat(contributedKeys).containsAtLeastElementsIn(returnedPks)
+  }
+
+  private companion object {
+    private val LOCAL_EF = LocalEncoderFactory()
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSamplerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSamplerTest.kt
new file mode 100644
index 00000000..3b24dd14
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PerPartitionContributionsSamplerTest.kt
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.collect.ImmutableList
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.PrivacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+/**
+ * Tests for [PerPartitionContributionsSampler], run against the in-memory local backend: the input
+ * is a [LocalCollection] and the sampled output is read back as a [LocalTable].
+ */
+@RunWith(JUnit4::class)
+class PerPartitionContributionsSamplerTest {
+  /** One privacy ID contributes 4 values to one partition; with a bound of 2 only 2 remain. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributionsPerPartitionSinglePrivacyId() {
+    val contributions = listOf(1.0, 2.0, 3.0, 4.0)
+    val inputData =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId1", "pk1", 1.0),
+          contributionWithPrivacyId("privacyId1", "pk1", 2.0),
+          contributionWithPrivacyId("privacyId1", "pk1", 3.0),
+          contributionWithPrivacyId("privacyId1", "pk1", 4.0),
+        )
+      )
+
+    val sampledData =
+      PerPartitionContributionsSampler(
+          maxContributionsPerPartition = 2,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+    val returnedContributions = sampledData.data.toMap().getValue("pk1").valuesList
+
+    assertThat(returnedContributions).hasSize(2)
+    // Returned values are all in the list of the contributed values.
+    assertThat(contributions).containsAtLeastElementsIn(returnedContributions)
+  }
+
+  /** With a bound (10) above every group's size, all contributions come back unchanged. */
+  @Test
+  fun sampleContributions_returnsOriginalContributionsPerPartition() {
+    val sampledData =
+      PerPartitionContributionsSampler(
+          maxContributionsPerPartition = 10,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(multipleContributionsMultiplePrivacyIdInput)
+        as LocalTable
+
+    assertThat(sampledData.data.toList())
+      .containsExactly(
+        Pair("pk1", privacyIdContributions { values += listOf(1.0, 2.0, 3.0) }),
+        Pair("pk2", privacyIdContributions { values += listOf(4.0, 5.0, 6.0) }),
+        Pair("pk1", privacyIdContributions { values += listOf(7.0) }),
+      )
+  }
+
+  /** With a bound of 1, each of the 3 (privacy ID, partition key) entries keeps exactly 1 value. */
+  @Test
+  fun sampleContributions_returnsSubsampleOfContributionsPerPartition() {
+    val sampledData =
+      PerPartitionContributionsSampler(
+          maxContributionsPerPartition = 1,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(multipleContributionsMultiplePrivacyIdInput)
+        as LocalTable
+    val sampledEntries = sampledData.data.toList()
+
+    assertThat(sampledEntries).hasSize(3)
+
+    // Every entry should hold exactly 1 contribution. The entries are checked directly: looking
+    // them up through toMap() would collapse the two "pk1" entries (one per privacy ID) into one,
+    // so one of them would never be verified.
+    for ((_, contributions) in sampledEntries) {
+      assertThat(contributions.valuesList).hasSize(1)
+    }
+  }
+
+  /** One privacy ID contributes 100,000 values to one partition; with a bound of 300, 300 remain. */
+  @Test
+  fun sampleContributions_samplesFromManyContributions() {
+    val inputData =
+      LocalCollection(
+        // sequence { } already yields a Sequence, so no further conversion is needed.
+        sequence {
+          repeat(100_000) {
+            yield(contributionWithPrivacyId("privacyId", "pk", value = it.toDouble()))
+          }
+        }
+      )
+
+    val sampledData =
+      PerPartitionContributionsSampler(
+          maxContributionsPerPartition = 300,
+          LOCAL_EF.strings(),
+          LOCAL_EF.strings(),
+          LOCAL_EF,
+        )
+        .sampleContributions(inputData) as LocalTable
+    val returnedContributions = sampledData.data.toMap().getValue("pk").valuesList
+
+    assertThat(returnedContributions).hasSize(300)
+  }
+
+  private companion object {
+    // NOTE(review): no test in this class reads aggParams or contributedPks. They are kept so
+    // the companion members stay as they were; consider removing them.
+    val aggParams =
+      AggregationParams(
+        metrics = ImmutableList.of(MetricDefinition(MEAN)),
+        noiseKind = GAUSSIAN,
+        maxPartitionsContributed = 1,
+        maxContributionsPerPartition = 2,
+      )
+    val contributedPks = listOf("pk1", "pk2", "pk1")
+    // Shared input: privacyId1 contributes 3 values to each of pk1 and pk2, privacyId2
+    // contributes 1 value to pk1.
+    val multipleContributionsMultiplePrivacyIdInput =
+      LocalCollection(
+        sequenceOf(
+          contributionWithPrivacyId("privacyId1", "pk1", value = 1.0),
+          contributionWithPrivacyId("privacyId1", "pk1", value = 2.0),
+          contributionWithPrivacyId("privacyId1", "pk1", value = 3.0),
+          contributionWithPrivacyId("privacyId1", "pk2", value = 4.0),
+          contributionWithPrivacyId("privacyId1", "pk2", value = 5.0),
+          contributionWithPrivacyId("privacyId1", "pk2", value = 6.0),
+          contributionWithPrivacyId("privacyId2", "pk1", value = 7.0),
+        )
+      )
+    private val LOCAL_EF = LocalEncoderFactory()
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PostAggregationPartitionSelectionCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PostAggregationPartitionSelectionCombinerTest.kt
new file mode 100644
index 00000000..7f956bd1
--- /dev/null
+++ 
b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PostAggregationPartitionSelectionCombinerTest.kt
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.collect.ImmutableList
+import com.google.common.truth.Truth.assertThat
+import com.google.common.truth.extensions.proto.ProtoTruth.assertThat
+import com.google.privacy.differentialprivacy.Noise
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget
+import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdCountAccumulator
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import kotlin.test.assertFailsWith
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.mockito.kotlin.mock
+import org.mockito.kotlin.verify
+
+/**
+ * Tests for [PostAggregationPartitionSelectionCombiner]: accumulator creation and merging, and
+ * the noise/thresholding behaviour of computeMetrics.
+ */
+@RunWith(TestParameterInjector::class)
+class PostAggregationPartitionSelectionCombinerTest {
+
+  /** Three contributions from one privacy ID produce an accumulator with count 1. */
+  @Test
+  fun createAccumulator_initsAccumulatorWithOne() {
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        AGGREGATION_PARAMS,
+        unusedAllocatedBudget,
+        unusedAllocatedBudget,
+        NoiseFactory(),
+      )
+
+    val accumulator =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) })
+
+    assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 1 })
+  }
+
+  /**
+   * Despite its name, this test expects createAccumulator to fail on empty contributions, with a
+   * message containing "There must be contributions".
+   */
+  @Test
+  fun createAccumulator_initsAccumulatorWithZero() {
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        AGGREGATION_PARAMS,
+        unusedAllocatedBudget,
+        unusedAllocatedBudget,
+        NoiseFactory(),
+      )
+
+    // NOTE(review): assertFailsWith is called without an exception type here; a type argument
+    // may have been lost. TODO confirm which exception type createAccumulator is expected to
+    // throw and restore it.
+    val e =
+      assertFailsWith {
+        combiner.createAccumulator(privacyIdContributions {})
+      }
+    assertThat(e).hasMessageThat().contains("There must be contributions")
+  }
+
+  /** Merging accumulators with counts 1 and 2 yields count 3. */
+  @Test
+  fun mergeAccumulators_sumsCounts() {
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        AGGREGATION_PARAMS,
+        unusedAllocatedBudget,
+        unusedAllocatedBudget,
+        NoiseFactory(),
+      )
+
+    val accumulator =
+      combiner.mergeAccumulators(
+        privacyIdCountAccumulator { count = 1 },
+        privacyIdCountAccumulator { count = 2 },
+      )
+
+    assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 3 })
+  }
+
+  /** With a count of 1 and a thresholding delta of 1e-12, computeMetrics returns null. */
+  // @Test is required in addition to @TestParameters: without it the JUnit4 runner does not
+  // pick the method up and the test is silently skipped.
+  @Test
+  @TestParameters("{noiseKind: LAPLACE}", "{noiseKind: GAUSSIAN}")
+  fun computeMetrics_twoSmallNumberOfPartitions_returnsNull(noiseKind: NoiseKind) {
+    val paramsWithNoise =
+      AGGREGATION_PARAMS.copy(
+        noiseKind = noiseKind,
+        maxPartitionsContributed = 30,
+        maxContributionsPerPartition = 50,
+      )
+    val noiseBudget = AllocatedBudget().apply { initialize(0.5, 1e-12) }
+    val thresholdingBudget = AllocatedBudget().apply { initialize(0.5, 1e-12) }
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        paramsWithNoise,
+        noiseBudget,
+        thresholdingBudget,
+        NoiseFactory(),
+      )
+
+    val result = combiner.computeMetrics(privacyIdCountAccumulator { count = 1 })
+
+    assertThat(result).isNull()
+  }
+
+  /** Checks the exact arguments passed to addNoise and computeQuantile on the mocked Noise. */
+  @Test
+  fun computeMetrics_passesCorrectParametersToNoise() {
+    val allocatedBudget = AllocatedBudget().apply { initialize(5.5, 1e-3) }
+    val thresholdingBudget = AllocatedBudget().apply { initialize(0.0, 1e-8) }
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        AGGREGATION_PARAMS,
+        allocatedBudget,
+        thresholdingBudget,
+        noiseFactoryMock,
+      )
+
+    val unused = combiner.computeMetrics(privacyIdCountAccumulator { count = 105 })
+
+    verify(noiseMock)
+      .addNoise(
+        /* x= */ 105.0,
+        /* l0Sensitivity= */ 3,
+        /* lInfSensitivity= */ 1.0,
+        /* epsilon= */ 5.5,
+        /* delta= */ 1e-3,
+      )
+
+    // The expected rank is the thresholding delta (1e-8) divided by maxPartitionsContributed (3).
+    verify(noiseMock)
+      .computeQuantile(
+        /* rank= */ 1e-8 / 3,
+        /* x= */ 0.0,
+        /* l0Sensitivity= */ 3,
+        /* lInfSensitivity= */ 1.0,
+        /* epsilon= */ 5.5,
+        /* delta= */ 1e-3,
+      )
+  }
+
+  /** With real noise, the result for count 200 is within 10.0 of 200.0 but not exactly 200.0. */
+  @Test
+  fun computeMetrics_returnsNoiseValue() {
+    val allocatedBudget = AllocatedBudget().apply { initialize(5.5, 1e-3) }
+    val thresholdingBudget = AllocatedBudget().apply { initialize(0.0, 1e-8) }
+    val combiner =
+      PostAggregationPartitionSelectionCombiner(
+        AGGREGATION_PARAMS,
+        allocatedBudget,
+        thresholdingBudget,
+        NoiseFactory(),
+      )
+
+    val noisedValue = combiner.computeMetrics(privacyIdCountAccumulator { count = 200 })
+    assertThat(noisedValue).isWithin(10.0).of(200.0)
+    assertThat(noisedValue).isNotEqualTo(200.0)
+  }
+
+  companion object {
+    private val AGGREGATION_PARAMS =
+      AggregationParams(
+        metrics = ImmutableList.of(MetricDefinition(MetricType.PRIVACY_ID_COUNT)),
+        noiseKind = GAUSSIAN,
+        maxPartitionsContributed = 3,
+        maxContributionsPerPartition = 5,
+      )
+
+    // The factory ignores the requested noise kind and always hands out the same mock.
+    private val noiseMock: Noise = mock()
+    private val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> noiseMock }
+    private val unusedAllocatedBudget = AllocatedBudget()
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivacyIdCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivacyIdCombinerTest.kt
new file mode 100644
index 00000000..9768ff33
--- /dev/null
+++ 
b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivacyIdCombinerTest.kt
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core
+
+import com.google.common.collect.ImmutableList
+import com.google.common.truth.Truth.assertThat
+import com.google.common.truth.extensions.proto.ProtoTruth.assertThat
+import com.google.privacy.differentialprivacy.Noise
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget
+import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions
+import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdCountAccumulator
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import org.junit.BeforeClass
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.mockito.kotlin.mock
+import org.mockito.kotlin.verify
+
+/**
+ * Tests for [PrivacyIdCountCombiner]: accumulator creation and merging, and the noise applied by
+ * computeMetrics.
+ */
+@RunWith(TestParameterInjector::class)
+class PrivacyIdCombinerTest {
+  companion object {
+    private val AGG_PARAMS =
+      AggregationParams(
+        metrics = ImmutableList.of(MetricDefinition(PRIVACY_ID_COUNT)),
+        noiseKind = GAUSSIAN,
+        maxPartitionsContributed = 3,
+        maxContributionsPerPartition = 5,
+      )
+
+    // The factory ignores the requested noise kind and always hands out the same mock.
+    private val noiseMock: Noise = mock()
+    private val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> noiseMock }
+    private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget()
+
+    /** Initializes the shared budget once, before any test in this class runs. */
+    @JvmStatic
+    @BeforeClass
+    fun beforeClass() {
+      UNUSED_ALLOCATED_BUDGET.initialize(1.1, 1e-3)
+    }
+  }
+
+  /** Three contributions from one privacy ID produce an accumulator with count 1. */
+  @Test
+  fun createAccumulator_initsAccumulatorWithOne() {
+    val combiner = PrivacyIdCountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator =
+      combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 1.0, 1.0) })
+
+    assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 1 })
+  }
+
+  /** Empty contributions produce an accumulator with count 0. */
+  @Test
+  fun createAccumulator_initsAccumulatorWithZero() {
+    val combiner = PrivacyIdCountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator = combiner.createAccumulator(privacyIdContributions {})
+
+    assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 0 })
+  }
+
+  /** Merging accumulators with counts 1 and 2 yields count 3. */
+  @Test
+  fun mergeAccumulators_sumsCounts() {
+    val combiner = PrivacyIdCountCombiner(AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory())
+
+    val accumulator =
+      combiner.mergeAccumulators(
+        privacyIdCountAccumulator { count = 1 },
+        privacyIdCountAccumulator { count = 2 },
+      )
+
+    assertThat(accumulator).isEqualTo(privacyIdCountAccumulator { count = 3 })
+  }
+
+  /** With real noise, the computed metric for count 1 differs from 1. */
+  // @Test is required in addition to @TestParameters: without it the JUnit4 runner does not
+  // pick the method up and the test is silently skipped.
+  @Test
+  @TestParameters("{noiseKind: LAPLACE}", "{noiseKind: GAUSSIAN}")
+  fun computeMetrics_addsNoise(noiseKind: NoiseKind) {
+    val paramsWithNoise =
+      AGG_PARAMS.copy(
+        noiseKind = noiseKind,
+        maxPartitionsContributed = 30,
+        maxContributionsPerPartition = 50,
+      )
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, 1e-3)
+    val combiner = PrivacyIdCountCombiner(paramsWithNoise, allocatedBudget, NoiseFactory())
+
+    val result = combiner.computeMetrics(privacyIdCountAccumulator { count = 1 })
+
+    assertThat(result).isNotEqualTo(1)
+  }
+
+  /** Checks the exact arguments passed to addNoise on the mocked Noise. */
+  @Test
+  fun computeMetrics_passesCorrectParametersToNoise() {
+    val allocatedBudget = AllocatedBudget()
+    allocatedBudget.initialize(1.1, 1e-3)
+    val combiner = PrivacyIdCountCombiner(AGG_PARAMS, allocatedBudget, noiseFactoryMock)
+
+    val unused = combiner.computeMetrics(privacyIdCountAccumulator { count = 1 })
+
+    verify(noiseMock)
+      .addNoise(
+        /* x= */ 1.0,
+        /* l0Sensitivity= */ 3,
+        /* lInfSensitivity= */ 1.0,
+        /* epsilon= */ 1.1,
+        /* delta= */ 1e-3,
+      )
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsComputationalGraphTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsComputationalGraphTest.kt
new file mode 100644
index 00000000..912453a3
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsComputationalGraphTest.kt
@@ -0,0 +1,357 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.PreAggregationPartitionSelectionFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import kotlin.test.assertFailsWith +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.any +import org.mockito.kotlin.doReturn +import org.mockito.kotlin.mock +import org.mockito.kotlin.verify + +@RunWith(TestParameterInjector::class) +class PrivatePartitionsComputationalGraphTest { + @Test + fun aggregate_appliesPreAggregationPartitionSelection_emptyResult() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "partition1", 1.0), + TestDataRow("Alice", "partition2", 1.0), + TestDataRow("Bob", "partition1", 1.0), + ) + ) + val preAggregationPartitionSelector = + mock() { + on { this.shouldKeep(any()) } doReturn false // Drop all partitions + } + val 
computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 100, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + preAggregationPartitionSelector, + COUNT_SUM_AND_ID_COUNT_COMBINER_ZERO_NOISE, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = computationalGraph.aggregate(inputData) as LocalTable + + assertThat(dpAggregates.data.toList().isEmpty()).isTrue() + } + + @Test + fun aggregate_requestedMetricsComputed() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "partition1", 1.0), + TestDataRow("Alice", "partition2", 2.0), + TestDataRow("Bob", "partition1", 4.0), + ) + ) + val partitionSelectorMock = + mock() { + on { this.shouldKeep(any()) } doReturn true // Keep all partitions + } + val computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 10, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + partitionSelectorMock, + COUNT_SUM_AND_ID_COUNT_COMBINER_ZERO_NOISE, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.aggregate(inputData) as LocalTable).data.toMap() + + // Assert + verify(partitionSelectorMock).shouldKeep(1) // "partition2": 1 privacy unit (Alice) + verify(partitionSelectorMock).shouldKeep(2) // "partition1": 2 privacy units (Alice, Bob) + + assertThat(dpAggregates.keys).containsExactly("partition1", "partition2") + assertThat(dpAggregates.get("partition1")!!.count).isEqualTo(2.0) + assertThat(dpAggregates.get("partition1")!!.sum).isEqualTo(5.0) + assertThat(dpAggregates.get("partition2")!!.sum).isEqualTo(2.0) + assertThat(dpAggregates.get("partition2")!!.privacyIdCount).isEqualTo(1.0) + } + + @Test + fun aggregate_appliesPostAggregationPartitionSelection_emptyResult() { + // Arrange. + val inputData = + LocalCollection( + sequenceOf(TestDataRow("Alice", "partition1", 1.0), TestDataRow("Bob", "partition2", 1.0)) + ) + + // The probability of publishing a partition is delta. 
Set a small delta so the output is + // almost always empty. + val thresholdBudget = AllocatedBudget().apply { initialize(0.0, 1e-15) } + val partitionSelectionCombiner = + PostAggregationPartitionSelectionCombiner( + PRIVACY_ID_COUNT_PARAMS, + METRICS_ALLOCATED_BUDGET, + thresholdBudget, + NoiseFactory(), + ) + + val compoundCombinerWithThresholding = CompoundCombiner(listOf(partitionSelectionCombiner)) + + val computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 100, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + preAggregationPartitionSelector = null, + compoundCombinerWithThresholding, + testDataExtractors, + LOCAL_EF, + ) + + // Act. + val dpAggregates = computationalGraph.aggregate(inputData) as LocalTable + + // Assert. + assertThat(dpAggregates.data.toList().isEmpty()).isTrue() + } + + @Test + fun aggregate_appliesPostAggregationPartitionSelection_partitionsKept() { + // Arrange. + // Creates a dataset with 100 privacy units, each contributes 1 record to the same partition. + val inputData = + LocalCollection((1..100).map { TestDataRow("PrivacyKey$it", "partition", 1.0) }.asSequence()) + + // Set a large budget so that the partition is kept with probability close to 1. 
+ val metricsBudget = AllocatedBudget().apply { initialize(5.0, 1e-2) } + val thresholdBudget = AllocatedBudget().apply { initialize(0.0, 1e-2) } + val partitionSelectionCombiner = + PostAggregationPartitionSelectionCombiner( + PRIVACY_ID_COUNT_PARAMS, + metricsBudget, + thresholdBudget, + NoiseFactory(), + ) + + val compoundCombinerWithThresholding = CompoundCombiner(listOf(partitionSelectionCombiner)) + + val computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 1, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + preAggregationPartitionSelector = null, + compoundCombinerWithThresholding, + testDataExtractors, + LOCAL_EF, + ) + + // Act. + val dpAggregates = + (computationalGraph.aggregate(inputData) as LocalTable).data.toMap() + + // Assert. + assertThat(dpAggregates.get("partition")!!.privacyIdCount).isWithin(10.0).of(100.0) + } + + @Test + fun aggregate_withPartitionSampler_appliesPartitionSampling() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Alice", "blue", 10.0), + ) + ) + val partitionSelectorMock = + mock() { + on { this.shouldKeep(any()) } doReturn true // Keep all partitions + } + val computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 2, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + partitionSelectorMock, + COUNT_SUM_AND_ID_COUNT_COMBINER_ZERO_NOISE, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.aggregate(inputData) as LocalTable).data.toMap() + + // Assert. + // The user contributed to 3 partitions but maxPartitionsContributed is set to 2. Hence, + // contributions to 2 partitions should appear in the result. 
+ assertThat(dpAggregates.values.map { it.count }).containsExactly(1.0, 1.0) + assertThat(dpAggregates.values.map { it.sum }).containsExactly(10.0, 10.0) + } + + @Test + fun aggregate_addsNoise() { + val inputData = + LocalCollection((0..10).map { TestDataRow("PrivacyKey$it", "partition", 1.0) }.asSequence()) + val preAggregationPartitionSelector = + DpLibPreAggregationPartitionSelector( + maxPartitionsContributed = 5, + preThreshold = 1, + PARTITION_SELECTION_ALLOCATED_BUDGET, + PreAggregationPartitionSelectionFactory(), + ) + val computationalGraph = + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 5, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + preAggregationPartitionSelector, + PRIVACY_ID_COUNT_COMBINER, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.aggregate(inputData) as LocalTable).data.toMap() + + // Assert: 11 privacy units (0..10) contribute to "partition"; noise must perturb that count. + assertThat(dpAggregates.keys).containsExactly("partition") + assertThat(dpAggregates.get("partition")!!.privacyIdCount).isNotEqualTo(11.0) + } + + @Test + fun constructor_failNoPreNorPostAggregationThresholding() { + assertFailsWith("Computational graph must have either") { + PrivatePartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 5, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + preAggregationPartitionSelector = null, + PRIVACY_ID_COUNT_COMBINER, + testDataExtractors, + LOCAL_EF, + ) + } + } + + private companion object { + val PRIVACY_ID_COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + maxContributionsPerPartition = 5, + ) + val COUNT_SUM_AND_ID_COUNT_PARAMS = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT), + MetricDefinition(SUM), + MetricDefinition(PRIVACY_ID_COUNT), + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 100, + maxContributionsPerPartition = 100, + 
minTotalValue = -100.0, + maxTotalValue = 100.0, + ) + val METRICS_ALLOCATED_BUDGET = AllocatedBudget().apply { initialize(1.1, 1e-3) } + // High epsilon/delta for partition selection. Partitions with ~10 privacy unit have ~1 + // probability to be kept. + val PARTITION_SELECTION_ALLOCATED_BUDGET = AllocatedBudget().apply { initialize(10.0, 1e-1) } + val THRESHOLDING_BUDGET = AllocatedBudget().apply { initialize(0.0, 1e-1) } + + val LOCAL_EF = LocalEncoderFactory() + val COUNT_SUM_AND_ID_COUNT_COMBINER_ZERO_NOISE = + CompoundCombiner( + listOf( + CountCombiner( + COUNT_SUM_AND_ID_COUNT_PARAMS, + METRICS_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ), + SumCombiner(COUNT_SUM_AND_ID_COUNT_PARAMS, METRICS_ALLOCATED_BUDGET, ZeroNoiseFactory()), + PrivacyIdCountCombiner( + COUNT_SUM_AND_ID_COUNT_PARAMS, + METRICS_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ), + ) + ) + + val PRIVACY_ID_COUNT_COMBINER = + CompoundCombiner( + listOf( + PrivacyIdCountCombiner(PRIVACY_ID_COUNT_PARAMS, METRICS_ALLOCATED_BUDGET, NoiseFactory()) + ) + ) + + val POST_AGGREGATION_THRESHOLDING_COMBINER = + CompoundCombiner( + listOf( + PostAggregationPartitionSelectionCombiner( + PRIVACY_ID_COUNT_PARAMS, + METRICS_ALLOCATED_BUDGET, + THRESHOLDING_BUDGET, + NoiseFactory(), + ) + ) + ) + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsTest.kt new file mode 100644 index 00000000..e272bf7e --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PrivatePartitionsTest.kt @@ -0,0 +1,264 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.GaussianNoise +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.LAPLACE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.PreAggregationPartitionSelectionFactory +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.eq +import org.mockito.kotlin.spy +import org.mockito.kotlin.verify + +@RunWith(TestParameterInjector::class) +class PrivatePartitionsTest { + companion object { + private val AGG_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 3, + maxContributionsPerPartition = 5, + ) + } + + @Test + fun shouldKeep_manyContributions_returnsTrue() { + val allocatedBudget = AllocatedBudget.create() + 
allocatedBudget.initialize(epsilon = 1.5, delta = 2e-5) + val factorySpy: PreAggregationPartitionSelectionFactory = spy() + val selector = + DpLibPreAggregationPartitionSelector( + maxPartitionsContributed = 3, + preThreshold = 1, + allocatedBudget, + factorySpy, + ) + + // Large number of privacy units with probability close to 1 should be kept. + assertThat(selector.shouldKeep(10000)).isTrue() + + verify(factorySpy).create(1.5, 2e-5, 3, 1) + } + + @Test + fun shouldKeep_manyContributionsButWithHighPreThreshold_returnsFalse() { + val allocatedBudget = AllocatedBudget.create() + allocatedBudget.initialize(epsilon = 1.5, delta = 2e-5) + val factorySpy: PreAggregationPartitionSelectionFactory = spy() + val highPreThresholdValue = 100000 + val selector = + DpLibPreAggregationPartitionSelector( + maxPartitionsContributed = 3, + preThreshold = highPreThresholdValue, + allocatedBudget, + factorySpy, + ) + + // PreThreshold is too high, so the partition is dropped. + assertThat(selector.shouldKeep(10000)).isFalse() + + verify(factorySpy).create(1.5, 2e-5, 3, highPreThresholdValue) + } + + @Test + fun shouldKeep_fewContributions_returnsFalse() { + val allocatedBudget = AllocatedBudget.create() + allocatedBudget.initialize(epsilon = 1.0, delta = 1e-15) + val selector = + DpLibPreAggregationPartitionSelector( + maxPartitionsContributed = 3, + preThreshold = 1, + allocatedBudget, + PreAggregationPartitionSelectionFactory(), + ) + + // Only 1 privacy unit, with probability < 1e-15 should be dropped. 
+ assertThat(selector.shouldKeep(1)).isFalse() + } + + enum class ThresholdTestCase( + val epsilon: Double, + val delta: Double, + val thresholdingDelta: Double, + val noiseKind: NoiseKind, + val maxPartitionsContributed: Int, + val expectedThreshold: Double, + ) { + TEST_CASE_GAUSSIAN_1( + epsilon = 1.0, + delta = 1e-4, + thresholdingDelta = 1e-8, + noiseKind = GAUSSIAN, + maxPartitionsContributed = 1, + expectedThreshold = 18.88, + ), + TEST_CASE_GAUSSIAN_2( + epsilon = 0.001, + delta = 1e-5, + thresholdingDelta = 1e-9, + noiseKind = GAUSSIAN, + maxPartitionsContributed = 2, + expectedThreshold = 14905.02, + ), + TEST_CASE_GAUSSIAN_3( + epsilon = 10.0, + delta = 1e-3, + thresholdingDelta = 2e-6, + noiseKind = GAUSSIAN, + maxPartitionsContributed = 3, + expectedThreshold = 4.4, + ), + TEST_CASE_LAPLACE_1( + epsilon = 1.1, + delta = 0.0, + thresholdingDelta = 1e-5, + noiseKind = LAPLACE, + maxPartitionsContributed = 1, + expectedThreshold = 10.84, + ), + TEST_CASE_LAPLACE_2( + epsilon = 0.01, + delta = 0.0, + thresholdingDelta = 1e-10, + noiseKind = LAPLACE, + maxPartitionsContributed = 2, + expectedThreshold = 4606.17, + ), + TEST_CASE_LAPLACE_3( + epsilon = 5.0, + delta = 0.0, + thresholdingDelta = 1e-2, + noiseKind = LAPLACE, + maxPartitionsContributed = 10, + expectedThreshold = 13.42, + ), + } + + @Test + fun threshold_returnsExpectedThreshold(@TestParameter testCase: ThresholdTestCase) { + val noiseBudget = + AllocatedBudget.create().apply { + initialize(epsilon = testCase.epsilon, delta = testCase.delta) + } + val thresholdingBudget = + AllocatedBudget.create().apply { + initialize(epsilon = 0.0, delta = testCase.thresholdingDelta) + } + val selector = + PostAggregationPartitionSelectorImpl( + testCase.maxPartitionsContributed, + testCase.noiseKind, + preThreshold = 1, + noiseBudget, + thresholdingBudget, + NoiseFactory(), + ) + + assertThat(selector.threshold).isWithin(1e-2).of(testCase.expectedThreshold) + } + + @Test + fun 
addNoiseIfShouldKeep_keepDropAsExpected() { + val noiseBudget = AllocatedBudget.create().apply { initialize(epsilon = 1.0, delta = 1e-3) } + val thresholdingBudget = + AllocatedBudget.create().apply { initialize(epsilon = 0.0, delta = 1e-10) } + val gaussianNoiseSpy: GaussianNoise = spy() + val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> gaussianNoiseSpy } + + val selector = + PostAggregationPartitionSelectorImpl( + maxPartitionsContributed = 2, + GAUSSIAN, + preThreshold = 1, + noiseBudget, + thresholdingBudget, + noiseFactoryMock, + ) + + assertThat(selector.threshold).isWithin(1e-1).of(24.5) + + // A partition with small number of privacy units is dropped + assertThat(selector.addNoiseIfShouldKeep(1)).isNull() + verify(gaussianNoiseSpy).addNoise(eq(1.0), eq(2), eq(1.0), eq(1.0), eq(1e-3)) + + // A partition with large number of privacy units is kept + assertThat(selector.addNoiseIfShouldKeep(200)).isWithin(70.0).of(200.0) + verify(gaussianNoiseSpy).addNoise(eq(200.0), eq(2), eq(1.0), eq(1.0), eq(1e-3)) + } + + @Test + fun addNoiseIfShouldKeep_preThresholdGreaterThanOne_keepDropAsExpected() { + val noiseBudget = AllocatedBudget.create().apply { initialize(epsilon = 1.0, delta = 1e-3) } + val thresholdingBudget = + AllocatedBudget.create().apply { initialize(epsilon = 0.0, delta = 1e-10) } + val gaussianNoiseSpy: GaussianNoise = spy() + val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> gaussianNoiseSpy } + + val selector = + PostAggregationPartitionSelectorImpl( + maxPartitionsContributed = 2, + GAUSSIAN, + preThreshold = 100, + noiseBudget, + thresholdingBudget, + noiseFactoryMock, + ) + + // A partition with number of privacy units equal to preThreshold is dropped + assertThat(selector.addNoiseIfShouldKeep(100)).isNull() + + // A partition with large number of privacy units is kept + assertThat(selector.addNoiseIfShouldKeep(200)).isWithin(70.0).of(200.0) + } + + @Test + fun 
addNoiseIfShouldKeep_privacyIdCountIsLessThanPreThreshold_dropsImmediatelyWithoutAnyComputations() { + val selector = + PostAggregationPartitionSelectorImpl( + maxPartitionsContributed = 0, + noiseKind = GAUSSIAN, + // only preThreshold matters. + preThreshold = 100, + noiseBudget = AllocatedBudget.create(), + thresholdingBudget = AllocatedBudget.create(), + NoiseFactory(), + ) + + assertThat(selector.addNoiseIfShouldKeep(99)).isNull() + } + + @Test + fun shouldKeep_noPrivacy_alwaysReturnsTrue() { + val selector = NoPrivacyPartitionSelector() + + // Large number of privacy units should be kept. + assertThat(selector.shouldKeep(10000)).isTrue() + // No privacy units should also be kept. + assertThat(selector.shouldKeep(0)).isTrue() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsComputationalGraphTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsComputationalGraphTest.kt new file mode 100644 index 00000000..37c8a96d --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsComputationalGraphTest.kt @@ -0,0 +1,345 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.PRIVACY_ID_COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PartitionsBalance.UNKNOWN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.DpAggregates +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.dpAggregates +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import com.google.testing.junit.testparameterinjector.TestParameters +import com.google.testing.junit.testparameterinjector.TestParametersValuesProvider +import org.junit.Test +import org.junit.runner.RunWith + +@RunWith(TestParameterInjector::class) +class PublicPartitionsComputationalGraphTest { + companion object { + private val COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = Int.MAX_VALUE, + maxContributionsPerPartition = Int.MAX_VALUE, + ) + private val PRIVACY_ID_COUNT_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(PRIVACY_ID_COUNT)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = Int.MAX_VALUE, + 
) + private val SUM_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(SUM)), + noiseKind = GAUSSIAN, + maxPartitionsContributed = Int.MAX_VALUE, + minTotalValue = -Double.MAX_VALUE, + maxTotalValue = Double.MAX_VALUE, + ) + private val COUNT_SUM_AND_ID_COUNT_PARAMS = + AggregationParams( + metrics = + ImmutableList.of( + MetricDefinition(COUNT), + MetricDefinition(SUM), + MetricDefinition(PRIVACY_ID_COUNT), + ), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 100, + maxContributionsPerPartition = 100, + minTotalValue = -100.0, + maxTotalValue = 100.0, + ) + private val ALLOCATED_BUDGET = AllocatedBudget() + + init { + ALLOCATED_BUDGET.initialize(1.1, 1e-3) + } + + private val LOCAL_EF = LocalEncoderFactory() + private val COUNT_SUM_AND_ID_COUNT_COMBINER = + CompoundCombiner( + listOf( + CountCombiner(COUNT_SUM_AND_ID_COUNT_PARAMS, ALLOCATED_BUDGET, ZeroNoiseFactory()), + SumCombiner(COUNT_SUM_AND_ID_COUNT_PARAMS, ALLOCATED_BUDGET, ZeroNoiseFactory()), + PrivacyIdCountCombiner( + COUNT_SUM_AND_ID_COUNT_PARAMS, + ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ), + ) + ) + } + + @Test + fun aggregate_appliesPublicPartitions() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "public_present_in_data", 1.0), + TestDataRow("Alice", "not_public", 1.0), + ) + ) + val publicPartitions = + LocalCollection(sequenceOf("public_present_in_data", "public_not_present_in_data")) + val computationalGraph = + PublicPartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 5, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + publicPartitions, + PartitionsBalance.UNKNOWN, + COUNT_SUM_AND_ID_COUNT_COMBINER, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = computationalGraph.aggregate(inputData) as LocalTable + + assertThat(dpAggregates.data.toMap().keys) + .containsExactly("public_present_in_data", "public_not_present_in_data") + // Check that the value corresponding to the public partition 
present in data is non-zero + assertThat(dpAggregates.data.toMap().get("public_present_in_data")!!.count).isNotEqualTo(0.0) + assertThat(dpAggregates.data.toMap().get("public_present_in_data")!!.sum).isNotEqualTo(0.0) + assertThat(dpAggregates.data.toMap().get("public_present_in_data")!!.privacyIdCount) + .isNotEqualTo(0.0) + } + + // The test uses a value provider instead of a canonical enum because an enum must be immutable, + // and it cannot be immutable because it stores a CompoundCombiner, which is not immutable. + class MetricsTestValueProvider : TestParametersValuesProvider() { + override fun provideValues(context: Context): MutableList { + return ImmutableList.of( + TestParameters.TestParametersValues.builder() + .name("COUNT") + .addParameter("params", COUNT_PARAMS) + .addParameter( + "combiner", + CompoundCombiner( + listOf(CountCombiner(COUNT_PARAMS, ALLOCATED_BUDGET, ZeroNoiseFactory())) + ), + ) + .addParameter( + "inputData", + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Bob", "green", 10.0), + TestDataRow("Bob", "blue", 10.0), + ), + ) + .addParameter("publicPartitions", sequenceOf("red", "green", "blue")) + .addParameter( + "expectedResult", + arrayOf( + Pair("red", dpAggregates { count = 2.0 }), + Pair("green", dpAggregates { count = 2.0 }), + Pair("blue", dpAggregates { count = 1.0 }), + ), + ) + .build(), + TestParameters.TestParametersValues.builder() + .name("SUM") + .addParameter("params", SUM_PARAMS) + .addParameter( + "combiner", + CompoundCombiner(listOf(SumCombiner(SUM_PARAMS, ALLOCATED_BUDGET, ZeroNoiseFactory()))), + ) + .addParameter( + "inputData", + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Bob", "green", 10.0), + TestDataRow("Bob", "blue", 10.0), + ), + ) + .addParameter("publicPartitions", sequenceOf("red", "green", "blue")) + 
.addParameter( + "expectedResult", + arrayOf( + Pair("red", dpAggregates { sum = 20.0 }), + Pair("green", dpAggregates { sum = 20.0 }), + Pair("blue", dpAggregates { sum = 10.0 }), + ), + ) + .build(), + TestParameters.TestParametersValues.builder() + .name("PRIVACY_ID_COUNT") + .addParameter("params", PRIVACY_ID_COUNT_PARAMS) + .addParameter( + "combiner", + CompoundCombiner( + listOf( + PrivacyIdCountCombiner( + PRIVACY_ID_COUNT_PARAMS, + ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + ) + ), + ) + .addParameter( + "inputData", + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Bob", "green", 10.0), + TestDataRow("Bob", "blue", 10.0), + ), + ) + .addParameter("publicPartitions", sequenceOf("red", "green", "blue")) + .addParameter( + "expectedResult", + arrayOf( + Pair("red", dpAggregates { privacyIdCount = 1.0 }), + Pair("green", dpAggregates { privacyIdCount = 2.0 }), + Pair("blue", dpAggregates { privacyIdCount = 1.0 }), + ), + ) + .build(), + TestParameters.TestParametersValues.builder() + .name("COUNT_SUM_PRIVACY_ID_COUNT") + .addParameter("params", COUNT_SUM_AND_ID_COUNT_PARAMS) + .addParameter("combiner", COUNT_SUM_AND_ID_COUNT_COMBINER) + .addParameter( + "inputData", + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Bob", "green", 10.0), + TestDataRow("Bob", "blue", 10.0), + ), + ) + .addParameter("publicPartitions", sequenceOf("red", "green", "blue")) + .addParameter( + "expectedResult", + arrayOf( + Pair( + "red", + dpAggregates { + count = 2.0 + sum = 20.0 + privacyIdCount = 1.0 + }, + ), + Pair( + "green", + dpAggregates { + count = 2.0 + sum = 20.0 + privacyIdCount = 2.0 + }, + ), + Pair( + "blue", + dpAggregates { + count = 1.0 + sum = 10.0 + privacyIdCount = 1.0 + }, + ), + ), + ) + .build(), + ) + } + } + + @Test + @TestParameters(valuesProvider = 
MetricsTestValueProvider::class) + fun aggregate_computesMetricsDefinedInCombiner( + params: AggregationParams, + combiner: CompoundCombiner, + inputData: Sequence, + publicPartitions: Sequence, + expectedResult: Array>, + ) { + val computationalGraph = + PublicPartitionsComputationalGraph( + PartitionSampler( + params.maxPartitionsContributed!!, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + LocalCollection(publicPartitions), + PartitionsBalance.UNKNOWN, + combiner, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + computationalGraph.aggregate(LocalCollection(inputData)) as LocalTable + + assertThat(dpAggregates.data.toList()).containsExactlyElementsIn(expectedResult) + } + + @Test + fun aggregate_withPartitionSampler_appliesPartitionSampling() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "red", 10.0), + TestDataRow("Alice", "green", 10.0), + TestDataRow("Alice", "blue", 10.0), + ) + ) + val publicPartitions = LocalCollection(sequenceOf("red", "green", "blue")) + val computationalGraph = + PublicPartitionsComputationalGraph( + PartitionSampler( + maxPartitionsContributed = 2, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + publicPartitions, + PartitionsBalance.UNKNOWN, + COUNT_SUM_AND_ID_COUNT_COMBINER, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = computationalGraph.aggregate(inputData) as LocalTable + + // The user contributed to 3 partitions but maxPartitionsContributed is set to 2. Hence, + // contributions to 2 partitions should appear in the result. 
+ assertThat(dpAggregates.data.toMap().values.map { it.count }).containsExactly(1.0, 1.0, 0.0) + assertThat(dpAggregates.data.toMap().values.map { it.sum }).containsExactly(10.0, 10.0, 0.0) + assertThat(dpAggregates.data.toMap().values.map { it.privacyIdCount }) + .containsExactly(1.0, 1.0, 0.0) + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsTest.kt new file mode 100644 index 00000000..bba7b007 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/PublicPartitionsTest.kt @@ -0,0 +1,107 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalTable +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.CompoundAccumulator +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.compoundAccumulator +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.Test +import org.junit.runner.RunWith + +/** Tests for the public-partition helpers `dropNonPublicPartitions` and `insertPublicPartitions`, run on local (in-memory) collections and tables. */ @RunWith(TestParameterInjector::class) +class PublicPartitionsTest { + /** Runs once per `PartitionsBalance` value. Contributions whose partition key is not in the public list are expected to be dropped; duplicate contributions to a public key are kept. `publicPartition3` has no input rows and is not expected in the output. */ @Test + fun dropNonPublicPartitions_keepsOnlyPublicPartitions( + @TestParameter partitionsBalance: PartitionsBalance + ) { + val inputData = + LocalCollection( + sequenceOf( + contributionWithPrivacyId("pid1", "privatePartition1", 1.0), + contributionWithPrivacyId("pid1", "privatePartition2", 1.0), + contributionWithPrivacyId("pid1", "publicPartition1", 1.0), + contributionWithPrivacyId("pid2", "publicPartition1", 1.0), + contributionWithPrivacyId("pid2", "publicPartition1", 1.0), + contributionWithPrivacyId("pid3", "publicPartition2", 1.0), + contributionWithPrivacyId("pid4", "privatePartition2", 1.0), + ) + ) + val publicPartitions = + LocalCollection(sequenceOf("publicPartition1", "publicPartition2", "publicPartition3")) + + val result = + inputData.dropNonPublicPartitions(publicPartitions, LOCAL_EF.strings(), partitionsBalance) + as LocalCollection> + + assertThat(result.data.toList()) + .containsExactly( + contributionWithPrivacyId("pid1", "publicPartition1", 1.0), + contributionWithPrivacyId("pid2", "publicPartition1", 1.0), + contributionWithPrivacyId("pid2", "publicPartition1", 1.0), + contributionWithPrivacyId("pid3", "publicPartition2", 1.0), + ) + } + + 
/** Expects the output to contain the two input entries unchanged plus one empty accumulator for every public partition; `partition1` and `partition3` therefore appear twice, i.e. no merging is asserted at this step. */ @Test + fun insertPublicPartitions_addsAllPublicPartitionsWithEmptyAccumulatorAsValues() { + val inputData = + LocalTable( + sequenceOf("partition1" to compoundAccumulator {}, "partition3" to compoundAccumulator {}) + ) + val publicPartitions = + LocalCollection( + sequenceOf( + "partition0", + "partition1", + "partition2", + "partition3", + "partition4", + "partition5", + ) + ) + val compoundCombiner = CompoundCombiner(combiners = emptyList()) + + val result = + inputData.insertPublicPartitions( + publicPartitions, + compoundCombiner, + LOCAL_EF.strings(), + LOCAL_EF, + ) as LocalTable + + assertThat(result.data.toList()) + .containsExactly( + "partition1" to compoundAccumulator {}, + "partition3" to compoundAccumulator {}, + "partition0" to compoundCombiner.emptyAccumulator(), + "partition1" to compoundCombiner.emptyAccumulator(), + "partition2" to compoundCombiner.emptyAccumulator(), + "partition3" to compoundCombiner.emptyAccumulator(), + "partition4" to compoundCombiner.emptyAccumulator(), + "partition5" to compoundCombiner.emptyAccumulator(), + ) + } + + /** Encoder factory shared by both tests in this class. */ private companion object { + private val LOCAL_EF = LocalEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/QuantilesCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/QuantilesCombinerTest.kt new file mode 100644 index 00000000..d82a8173 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/QuantilesCombinerTest.kt @@ -0,0 +1,117 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import com.google.testing.junit.testparameterinjector.TestParameters +import org.junit.Test +import org.junit.runner.RunWith + +/** Tests for `QuantilesCombiner`: building and merging accumulators and computing quantiles with zero noise and with real noise. */ @RunWith(TestParameterInjector::class) +class QuantilesCombinerTest { + /** Baseline aggregation params: no metrics, Gaussian noise, one partition and one contribution per partition, value bounds [-10000, 10000]. Tests override individual fields through `copy`. */ private fun defaultQuantilesAggParams() = + AggregationParams( + metrics = ImmutableList.of(), + noiseKind = GAUSSIAN, + maxPartitionsContributed = 1, + maxContributionsPerPartition = 1, + minValue = -10000.0, + maxValue = 10000.0, + ) + + /** Merges an empty accumulator with three non-empty ones (values 1.0 to 5.0) under zero noise. NOTE(review): ranks are passed unsorted (0.25, 0.75, 0.5) while the assertions expect ascending values (2, 3, 4); presumably the combiner returns quantiles in ascending rank order - confirm against QuantilesCombiner. */ @Test + fun computeMetrics_noNoise_withEmptyAccumulator_returnsCorrectQuantiles() { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, 1e-5) + val combiner = + QuantilesCombiner( + ranks = listOf(0.25, 0.75, 0.5), + defaultQuantilesAggParams(), + allocatedBudget, + ZeroNoiseFactory(), + ) + + val accumulator0 = combiner.emptyAccumulator() + val 
accumulator1 = + combiner.createAccumulator(privacyIdContributions { values += listOf(1.0, 3.0) }) + val accumulator2 = + combiner.createAccumulator(privacyIdContributions { values += listOf(2.0, 4.0) }) + val accumulator3 = combiner.createAccumulator(privacyIdContributions { values += listOf(5.0) }) + val accumulator01 = combiner.mergeAccumulators(accumulator0, accumulator1) + val accumulator012 = combiner.mergeAccumulators(accumulator01, accumulator2) + val accumulator0123 = combiner.mergeAccumulators(accumulator3, accumulator012) + val quantiles = combiner.computeMetrics(accumulator0123) + + assertThat(quantiles).hasSize(3) + assertThat(quantiles.get(0)).isWithin(0.5).of(2.0) + assertThat(quantiles.get(1)).isWithin(0.5).of(3.0) + assertThat(quantiles.get(2)).isWithin(0.5).of(4.0) + } + + /** With no data at all and zero noise, ranks 0.0, 0.5 and 1.0 are expected to land (within 0.5) on minValue, the midpoint and maxValue of the [-10, 10] bounds. */ @Test + fun computeMetrics_noNoise_onlyEmptyAccumulator_returnsQuantilesBetweenMinMaxValues() { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, 1e-5) + val combiner = + QuantilesCombiner( + ranks = listOf(0.0, 0.5, 1.0), + defaultQuantilesAggParams().copy(minValue = -10.0, maxValue = 10.0), + allocatedBudget, + ZeroNoiseFactory(), + ) + + val quantiles = combiner.computeMetrics(combiner.emptyAccumulator()) + + assertThat(quantiles).hasSize(3) + assertThat(quantiles.get(0)).isWithin(0.5).of(-10.0) + assertThat(quantiles.get(1)).isWithin(0.5).of(0.0) + assertThat(quantiles.get(2)).isWithin(0.5).of(10.0) + } + + /** Parameterized over the noise kind (Laplace with delta 0, Gaussian with delta 0.1). Uses real noise with epsilon 100 over the values 1 to 1000 and expects each quantile within 10.0 of the exact one. */ @Test + @TestParameters("{noiseKind: LAPLACE, delta: 0.0}", "{noiseKind: GAUSSIAN, delta: 0.1}") + fun computeMetrics_smallNoise_returnsQuantilesCloseToReal(noiseKind: NoiseKind, delta: Double) { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(100.0, delta) + val combiner = + QuantilesCombiner( + ranks = listOf(0.0, 0.5, 1.0), + defaultQuantilesAggParams().copy(minValue = 1.0, maxValue = 1000.0, noiseKind = noiseKind), + allocatedBudget, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator( + 
privacyIdContributions { values += (1..1000).map { it.toDouble() }.toList() } + ) + val quantiles = combiner.computeMetrics(accumulator) + + assertThat(quantiles).hasSize(3) + assertThat(quantiles.get(0)).isWithin(10.0).of(1.0) + assertThat(quantiles.get(1)).isWithin(10.0).of(500.0) + assertThat(quantiles.get(2)).isWithin(10.0).of(1000.0) + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SelectPartitionsComputationalGraphTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SelectPartitionsComputationalGraphTest.kt new file mode 100644 index 00000000..2c04edef --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SelectPartitionsComputationalGraphTest.kt @@ -0,0 +1,130 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.truth.Truth.assertThat +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.PreAggregationPartitionSelectionFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalCollection +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.any +import org.mockito.kotlin.doReturn +import org.mockito.kotlin.mock +import org.mockito.kotlin.times +import org.mockito.kotlin.verify + +/** Tests for `SelectPartitionsComputationalGraph.selectPartitions`, run on local collections with either a mocked or a real partition selector. */ @RunWith(TestParameterInjector::class) +class SelectPartitionsComputationalGraphTest { + /** A mocked selector whose `shouldKeep` always returns false must produce an empty result. */ @Test + fun selectPartitions_selectorsDropsEverything_emptyResult() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "partition1"), + TestDataRow("Alice", "partition2"), + TestDataRow("Bob", "partition1"), + ) + ) + val partitionSelector = + mock() { + on { this.shouldKeep(any()) } doReturn false // Drop all partitions + } + val computationalGraph = + SelectPartitionsComputationalGraph( + PartitionSampler(10, LOCAL_EF.strings(), LOCAL_EF.strings(), LOCAL_EF), + partitionSelector, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.selectPartitions(inputData) as LocalCollection).data.toList() + + assertThat(dpAggregates).isEmpty() + } + + /** One privacy id contributes to three partitions while the sampler allows two, so two partitions are expected in the output and `shouldKeep(1)` is expected to be called exactly twice. The argument 1 is presumably the per-partition privacy-id count - confirm against the selector interface. */ @Test + fun aggregate_withPartitionSampler_appliesPartitionSampling() { + val inputData = + LocalCollection( + sequenceOf( + TestDataRow("Alice", "red"), + TestDataRow("Alice", "green"), + TestDataRow("Alice", "blue"), + ) + ) + val partitionSelectorMock = + mock() { + on { this.shouldKeep(any()) } doReturn true // Keep all partitions + } + val computationalGraph = + SelectPartitionsComputationalGraph( + 
PartitionSampler( + maxPartitionsContributed = 2, + LOCAL_EF.strings(), + LOCAL_EF.strings(), + LOCAL_EF, + ), + partitionSelectorMock, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.selectPartitions(inputData) as LocalCollection).data.toList() + + assertThat(dpAggregates).hasSize(2) + verify(partitionSelectorMock, times(2)).shouldKeep(1) + } + + /** Uses the real `DpLibPreAggregationPartitionSelector` with the shared budget and expects only `partition0` to be kept. NOTE(review): the selector is presumably randomized, so this relies on the keep probability being close to 1 for 100 privacy ids and close to 0 for a single one - confirm the flakiness risk is acceptable. */ @Test + fun selectPartition_keepsFrequentPartition() { + // Generate a dataset with 2 partitions. One partition has 100 contributions and another 1. Each + // user contributes one record. + val inputData = + LocalCollection( + (0..100).map { TestDataRow("PrivacyKey$it", "partition${it/100}") }.asSequence() + ) + val partitionSelector = + DpLibPreAggregationPartitionSelector( + maxPartitionsContributed = 1, + preThreshold = 1, + allocatedBudget, + PreAggregationPartitionSelectionFactory(), + ) + val computationalGraph = + SelectPartitionsComputationalGraph( + PartitionSampler(10, LOCAL_EF.strings(), LOCAL_EF.strings(), LOCAL_EF), + partitionSelector, + testDataExtractors, + LOCAL_EF, + ) + + val dpAggregates = + (computationalGraph.selectPartitions(inputData) as LocalCollection).data.toList() + + assertThat(dpAggregates).containsExactly("partition0") + } + + /** Shared fixtures: a budget initialized with epsilon 1.1 and delta 1e-3, and the local encoder factory. */ private companion object { + val allocatedBudget = AllocatedBudget().apply { initialize(epsilon = 1.1, delta = 1e-3) } + val LOCAL_EF = LocalEncoderFactory() + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SumCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SumCombinerTest.kt new file mode 100644 index 00000000..29dcea93 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/SumCombinerTest.kt @@ -0,0 +1,220 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.NoiseKind.GAUSSIAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.sumAccumulator +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import com.google.testing.junit.testparameterinjector.TestParameters +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.mock +import org.mockito.kotlin.verify + +/** Tests for `SumCombiner`: accumulator creation with and without clamping of the per-privacy-id total, merging, and the parameters handed to the noise mechanism. */ @RunWith(TestParameterInjector::class) +class SumCombinerTest { + /** Baseline params: SUM metric, Gaussian noise, total-value bounds [-1.0, 3.0], at most 5 partitions per privacy id. Tests override fields through `copy`. */ private val SUM_AGG_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(SUM)), + noiseKind = GAUSSIAN, + minTotalValue = -1.0, + maxTotalValue = 3.0, + maxPartitionsContributed = 5, + ) + + /** Mocked noise, used with the factory declared next to verify the arguments passed to `addNoise`. */ private val 
noiseMock: Noise = mock() + private val noiseFactoryMock: (NoiseKind) -> Noise = { _ -> noiseMock } + private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget() + + init { + UNUSED_ALLOCATED_BUDGET.initialize(1.1, 1e-3) + } + + /** The empty accumulator is expected to hold sum = 0.0 even though 0.0 lies outside the configured total bounds [1.0, 2.0]. */ @Test + fun emptyAccumulator_minIsGreaterThanZero_returnsZeroAndIgnoresContributionBounds() { + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy(minTotalValue = 1.0, maxTotalValue = 2.0), + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.emptyAccumulator() + + assertThat(accumulator).isEqualTo(sumAccumulator { sum = 0.0 }) + } + + /** -10.0 + 15.0 + 0.0 = 5.0, which is inside the [-300, 300] total bounds and so is stored as is. */ @Test + fun createAccumulator_sumsItems() { + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy(minTotalValue = -300.0, maxTotalValue = 300.0), + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(-10.0, 15.0, 0.0) }) + + assertThat(accumulator).isEqualTo(sumAccumulator { sum = 5.0 }) + } + + /** Per-value bounds [-1.0, 4.0] are set but expected to have no effect: the raw total 500.0 is clamped only by maxTotalValue, giving 300.0. */ @Test + fun createAccumulator_privacyLevelWithContributionBounding_clampsOnlyTotalSum() { + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy( + minValue = -1.0, + maxValue = 4.0, + minTotalValue = -2.0, + maxTotalValue = 300.0, + ), + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator( + privacyIdContributions { values += listOf(-1000.0, 1000.0, 500.0) } + ) + + assertThat(accumulator).isEqualTo(sumAccumulator { sum = 300.0 }) + } + + /** With NONE_WITHOUT_CONTRIBUTION_BOUNDING the same input is expected to keep its raw total of 500.0, above maxTotalValue. */ @Test + fun createAccumulator_privacyLevelWithoutContributionBounding_doesNotClampTotalSum() { + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy( + minValue = -1.0, + maxValue = 4.0, + minTotalValue = -2.0, + maxTotalValue = 300.0, + privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING, + ), + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator( + privacyIdContributions { values += listOf(-1000.0, 1000.0, 500.0) } + ) + + assertThat(accumulator).isEqualTo(sumAccumulator { sum = 
500.0 }) + } + + /** Merging adds the partial sums: 1000.0 + (-2000.0) = -1000.0. The expected value lies outside the [-1.0, 3.0] total bounds, so no clamping is expected on merge. */ @Test + fun mergeAccumulators_sumsPartialSums() { + val combiner = SumCombiner(SUM_AGG_PARAMS, UNUSED_ALLOCATED_BUDGET, NoiseFactory()) + + val accumulator = + combiner.mergeAccumulators(sumAccumulator { sum = 1000.0 }, sumAccumulator { sum = -2000.0 }) + + assertThat(accumulator).isEqualTo(sumAccumulator { sum = -1000.0 }) + } + + /** Parameterized over the noise kind; with the real noise factory the result must differ from the raw sum 1.0. */ @Test + @TestParameters("{noiseKind: LAPLACE, delta: 0.0}", "{noiseKind: GAUSSIAN, delta: 1e-5}") + fun computeMetrics_addsNoise(noiseKind: NoiseKind, delta: Double) { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, delta) + val combiner = + SumCombiner(SUM_AGG_PARAMS.copy(noiseKind = noiseKind), allocatedBudget, NoiseFactory()) + + val result = combiner.computeMetrics(sumAccumulator { sum = 1.0 }) + + assertThat(result).isNotEqualTo(1.0) + } + + /** Verifies the `addNoise` arguments on the mocked noise: l0 sensitivity equals maxPartitionsContributed (10) and epsilon/delta come from the budget. The expected lInf sensitivity 4.0 is presumably max(|minTotalValue|, |maxTotalValue|) for bounds [-4.0, 3.0] - confirm against SumCombiner. */ @Test + fun computeMetrics_passesCorrectParametersToNoise() { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, 1e-3) + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy( + noiseKind = GAUSSIAN, + maxPartitionsContributed = 10, + minTotalValue = -4.0, + maxTotalValue = 3.0, + ), + allocatedBudget, + noiseFactoryMock, + ) + + val unused = combiner.computeMetrics(sumAccumulator { sum = 1.0 }) + + verify(noiseMock) + .addNoise( + /* x= */ 1.0, + /* l0Sensitivity= */ 10, + /* lInfSensitivity= */ 4.0, + /* epsilon= */ 1.1, + /*delta= */ 1e-3, + ) + } + + /** Zero-noise end-to-end check: per-privacy-id totals 1.0 and 3.0 are both inside [-1.0, 3.0], and merging them with an empty accumulator gives exactly 4.0. */ @Test + fun computeMetrics_withoutNoise_withMultipleContributionsIncludingEmptyAccumulator_returnsCorrectResult() { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, 1e-5) + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy(minTotalValue = -1.0, maxTotalValue = 3.0), + allocatedBudget, + ZeroNoiseFactory(), + ) + + val accumulator0 = combiner.emptyAccumulator() + val accumulator1 = + combiner.createAccumulator(privacyIdContributions { values += listOf(-2.0, 3.0) }) + val accumulator2 = + combiner.createAccumulator(privacyIdContributions { values += 
listOf(4.0, -1.0) }) + val accumulator3 = combiner.mergeAccumulators(accumulator0, accumulator1) + val finalAccumulator = combiner.mergeAccumulators(accumulator2, accumulator3) + val result = combiner.computeMetrics(finalAccumulator) + + assertThat(result).isEqualTo(4.0) + } + + /** Zero-noise metrics of the empty accumulator are expected to be exactly 0.0. NOTE(review): despite "ThenMerged" in the name, no merge is performed here; the empty accumulator is passed to computeMetrics directly. */ @Test + fun computeMetrics_withoutNoiseAndEmptyAccumulatorThenMerged_returnsZeroSum() { + val allocatedBudget = AllocatedBudget() + allocatedBudget.initialize(1.1, 1e-5) + val combiner = + SumCombiner( + SUM_AGG_PARAMS.copy(minTotalValue = -1.0, maxTotalValue = 3.0), + allocatedBudget, + ZeroNoiseFactory(), + ) + + val result = combiner.computeMetrics(combiner.emptyAccumulator()) + + assertThat(result).isEqualTo(0.0) + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/TestDataTypes.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/TestDataTypes.kt new file mode 100644 index 00000000..69e262e6 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/TestDataTypes.kt @@ -0,0 +1,34 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalEncoderFactory + +/** Input row used by the core tests: a privacy id, a partition key and a value that defaults to 0.0. The private no-argument constructor is presumably there for reflection-based encoders or serializers - TODO confirm. */ data class TestDataRow(val privacyId: String, val partitionKey: String, val value: Double = 0.0) { + private constructor() : this("defaultPrivacyId", "defaultPartitionKey", 0.0) +} + +/** Default extractors for [TestDataRow], built with a fresh `LocalEncoderFactory`. */ val testDataExtractors = testDataExtractors(LocalEncoderFactory()) + +/** Builds `DataExtractors` that read `privacyId`, `partitionKey` and `value` from a [TestDataRow], using string encoders from [encoderFactory] for the privacy id and the partition key. */ fun testDataExtractors(encoderFactory: EncoderFactory) = + DataExtractors.from( + { dataRow -> dataRow.privacyId }, + encoderFactory.strings(), + { dataRow -> dataRow.partitionKey }, + encoderFactory.strings(), + { dataRow -> dataRow.value }, + ) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/VarianceCombinerTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/VarianceCombinerTest.kt new file mode 100644 index 00000000..24dba4f6 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/VarianceCombinerTest.kt @@ -0,0 +1,525 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.privacy.differentialprivacy.pipelinedp4j.core + +import com.google.common.collect.ImmutableList +import com.google.common.truth.Truth.assertThat +import com.google.common.truth.extensions.proto.ProtoTruth.assertThat +import com.google.privacy.differentialprivacy.Noise +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.COUNT +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.MEAN +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.SUM +import com.google.privacy.differentialprivacy.pipelinedp4j.core.MetricType.VARIANCE +import com.google.privacy.differentialprivacy.pipelinedp4j.core.PrivacyLevel.NONE_WITHOUT_CONTRIBUTION_BOUNDING +import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AllocatedBudget +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.NoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.dplibrary.ZeroNoiseFactory +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.privacyIdContributions +import com.google.privacy.differentialprivacy.pipelinedp4j.proto.varianceAccumulator +import com.google.testing.junit.testparameterinjector.TestParameter +import com.google.testing.junit.testparameterinjector.TestParameterInjector +import org.junit.BeforeClass +import org.junit.Test +import org.junit.runner.RunWith +import org.mockito.kotlin.mock +import org.mockito.kotlin.verify + +/** Tests for `VarianceCombiner`: accumulator creation (normalization around the mid value, clamping), merging, the parameters passed to the noise mechanism, and which metrics are returned. */ @RunWith(TestParameterInjector::class) +class VarianceCombinerTest { + /** The empty accumulator is expected to have count, normalizedSum and normalizedSumSquares all equal to zero. */ @Test + fun emptyAccumulator_countAndSumAndSumSquaresAreZero() { + val combiner = + VarianceCombiner( + AGG_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.emptyAccumulator() + + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 0 + normalizedSum = 0.0 + normalizedSumSquares = 0.0 + } + ) + } + + /** A value already inside [minValue, maxValue] is expected to be shifted by the mid value only, without clamping. */ @Test + fun 
createAccumulator_doesNotClampContributionsWithinBounds() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(minValue = -8.0, maxValue = 12.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(5.5) }) + // midValue is the midpoint between minValue = -8.0 and maxValue = 12.0 = 2 + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 1 + normalizedSum = 3.5 // = 5.5 - 2.0 = contribution - midValue + normalizedSumSquares = 12.25 // (5.5 - 2.0)^2 = (contribution - midValue)^2 + } + ) + } + + /** Out-of-range values -20.0 and 30.0 are expected to be clamped to -10.0 and 10.0 before normalization. */ @Test + fun createAccumulator_privacyLevelWithContributionBounding_clampsValues() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(minValue = -10.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(-20.0, 30.0) }) + // midValue is the midpoint between minValue = -10.0 and maxValue = 10.0 = 0 + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 2 + normalizedSum = 0.0 // (-10.0 - 0) + (10.0 - 0) = two clamped contributions minus midValue + normalizedSumSquares = + 200.0 // (-10.0 - 0)^2 + (10.0 - 0)^2 = two clamped contributions minus midValue squared + } + ) + } + + /** With NONE_WITHOUT_CONTRIBUTION_BOUNDING the same out-of-range values are expected to be used without clamping. */ @Test + fun createAccumulator_privacyLevelWithoutContributionBounding_doesNotClampValues() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy( + minValue = -10.0, + maxValue = 10.0, + privacyLevel = NONE_WITHOUT_CONTRIBUTION_BOUNDING, + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(-20.0, 30.0) }) + // midValue is the midpoint between minValue = -10.0 and maxValue = 10.0 = 0 + assertThat(accumulator) + 
.isEqualTo( + varianceAccumulator { + count = 2 + normalizedSum = 10.0 // (-20.0 - 0.0) + (30.0 - 0.0) Not clamped + normalizedSumSquares = 1300.0 // (-20.0 - 0.0)^2 + (30.0 - 0.0)^2 Not clamped + } + ) + } + + /** Bounds [5.0, 10.0] give a mid value of 7.5, so the in-range value 6.0 is expected to be stored as -1.5 together with its square. */ @Test + fun createAccumulator_normalizesSumAndSumOfSquares() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(minValue = 5.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(6.0) }) + + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 1 + normalizedSum = -1.5 + normalizedSumSquares = (-1.5) * (-1.5) + } + ) + } + + /** 30.0 is expected to be clamped to maxValue 10.0 first and then normalized to 10.0 - 7.5 = 2.5. */ @Test + fun createAccumulator_normalizationAndClamping() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(minValue = 5.0, maxValue = 10.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = combiner.createAccumulator(privacyIdContributions { values += listOf(30.0) }) + + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 1 + normalizedSum = 2.5 + normalizedSumSquares = 2.5 * 2.5 + } + ) + } + + /** Bounds [4.0, 10.0] give a mid value of 7.0; three values from one privacy id are normalized and summed into a single accumulator. */ @Test + fun createAccumulator_aggregatesMultipleElements() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(minValue = 4.0), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + // Create list with one value that is clamped to min value. 
+ val accumulator = + combiner.createAccumulator(privacyIdContributions { values += listOf(3.0, 5.5, 6.0) }) + + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 3 + normalizedSum = -5.5 // = sum of normalized values = -3 - 1.5 - 1 + normalizedSumSquares = 12.25 // sum of each normalized value squared + } + ) + } + + /** Merging is expected to add count, normalizedSum and normalizedSumSquares field by field. */ @Test + fun mergeAccumulator_sumsValuesInMergedAccumulators() { + val combiner = + VarianceCombiner( + AGG_PARAMS, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val accumulator = + combiner.mergeAccumulators( + varianceAccumulator { + count = 1 + normalizedSum = -5.0 + normalizedSumSquares = 25.0 + }, + varianceAccumulator { + count = 10 + normalizedSum = 8.5 + normalizedSumSquares = 72.5 + }, + ) + + assertThat(accumulator) + .isEqualTo( + varianceAccumulator { + count = 11 + normalizedSum = 3.5 + normalizedSumSquares = 97.5 + } + ) + } + + /** Uses a mocked `Noise` and a distinct budget for count, sum and sum of squares, so each `addNoise` call can be identified by its epsilon and delta. Bounds [4.0, 10.0] give a mid value of 7.0. */ @Test + fun computeMetrics_passesCorrectParametersToNoise() { + val countBudget = AllocatedBudget() + countBudget.initialize(2.0, 1e-5) + val sumBudget = AllocatedBudget() + sumBudget.initialize(1.0, 1e-3) + val sumSquaresBudget = AllocatedBudget() + sumSquaresBudget.initialize(3.0, 1e-2) + val noise: Noise = mock() + val noiseFactory: (NoiseKind) -> Noise = { _ -> noise } + + val combiner = + VarianceCombiner( + AGG_PARAMS.copy( + metrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(VARIANCE)), + maxPartitionsContributed = 5, + maxContributionsPerPartition = 7, + minValue = 4.0, + maxValue = 10.0, + ), + countBudget, + sumBudget, + sumSquaresBudget, + noiseFactory, + ) + + val accumulator = varianceAccumulator { + count = 10 + normalizedSum = 120.0 + normalizedSumSquares = 1500.0 + } + + val unused = combiner.computeMetrics(accumulator) + + // Verify noise is added to count. 
+ verify(noise) + .addNoise( + /* x= */ 10.0, + /* l0Sensitivity= */ 5, + /* lInfSensitivity= */ 7.0, + /* epsilon= */ 2.0, + /* delta= */ 1e-5, + ) + // Verify noise is added to sum. + verify(noise) + .addNoise( + /* x= */ 120.0, + /* l0Sensitivity= */ 5, + /* lInfSensitivity= */ 21.0, // (maxValue - midValue) * maxContributionsPerPartition + /* epsilon= */ 1.0, + /* delta= */ 1e-3, + ) + // Verify noise is added to normalized sum of squares + verify(noise) + .addNoise( + /* x= */ 1500.0, + /* l0Sensitivity= */ 5, + /* lInfSensitivity= */ 63.0, // (maxValue - midValue)^2 * maxContributionsPerPartition + /* epsilon= */ 3.0, + /* delta= */ 1e-2, + ) + } + + /** Real Laplace noise with a very large epsilon: count and sum must differ from the exact values yet stay close to them, and mean must equal sum / count. NOTE(review): only count, sum and mean are asserted; variance is never checked despite the test name. */ @Test + fun computeMetrics_returnsVarianceMeanCountSum() { + // Use high budget for low noise. + val countBudget = AllocatedBudget() + countBudget.initialize(10000.0, 0.0) + val sumBudget = AllocatedBudget() + sumBudget.initialize(10000.0, 0.0) + val sumSquaresBudget = AllocatedBudget() + sumSquaresBudget.initialize(10000.0, 0.0) + + val combiner = + VarianceCombiner( + AGG_PARAMS.copy( + metrics = + ImmutableList.of( + MetricDefinition(VARIANCE), + MetricDefinition(MEAN), + MetricDefinition(SUM), + MetricDefinition(COUNT), + ), + maxPartitionsContributed = 5, + maxContributionsPerPartition = 7, + minValue = 4.0, + maxValue = 12.0, + noiseKind = NoiseKind.LAPLACE, + ), + countBudget, + sumBudget, + sumSquaresBudget, + NoiseFactory(), + ) + + val accumulator = varianceAccumulator { + count = 10 + normalizedSum = 120.0 + normalizedSumSquares = 1500.0 + } + + val result = combiner.computeMetrics(accumulator) + + assertThat(result.count).isNotEqualTo(10.0) + assertThat(result.count).isWithin(0.1).of(10.0) + + val approximatedExpectedSum = /* normalizedSum= */ 120.0 + /* dp_count * midValue= */ 10 * 8 + assertThat(result.sum).isNotEqualTo(approximatedExpectedSum) + assertThat(result.sum).isWithin(1.0).of(approximatedExpectedSum) + assertThat(result.mean).isWithin(1e-9).of(result.sum!! / result.count!!) 
+ } + + /** Parameter cases for the test that follows: the requested metrics (always including VARIANCE) and which of count, sum and mean are expected to be non-null in the result. */ enum class ReturnedMetricsTestCase( + val requestedMetrics: ImmutableList, + val countExpected: Boolean, + val sumExpected: Boolean, + val meanExpected: Boolean, + ) { + NO_SUM_NO_COUNT_NO_MEAN( + requestedMetrics = ImmutableList.of(MetricDefinition(VARIANCE)), + countExpected = false, + sumExpected = false, + meanExpected = false, + ), + ONLY_SUM( + requestedMetrics = ImmutableList.of(MetricDefinition(VARIANCE), MetricDefinition(SUM)), + countExpected = false, + sumExpected = true, + meanExpected = false, + ), + ONLY_COUNT( + requestedMetrics = ImmutableList.of(MetricDefinition(VARIANCE), MetricDefinition(COUNT)), + countExpected = true, + sumExpected = false, + meanExpected = false, + ), + ONLY_MEAN( + requestedMetrics = ImmutableList.of(MetricDefinition(VARIANCE), MetricDefinition(MEAN)), + countExpected = false, + sumExpected = false, + meanExpected = true, + ), + COUNT_AND_SUM_AND_MEAN( + requestedMetrics = + ImmutableList.of( + MetricDefinition(VARIANCE), + MetricDefinition(MEAN), + MetricDefinition(SUM), + MetricDefinition(COUNT), + ), + countExpected = true, + sumExpected = true, + meanExpected = true, + ), + } + + /** Runs once per `ReturnedMetricsTestCase` and checks that count, sum and mean are non-null exactly when the case expects them. */ @Test + fun aggregate_computeMetrics_checkWhichMetricReturned( + @TestParameter testCase: ReturnedMetricsTestCase + ) { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy(metrics = testCase.requestedMetrics), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + NoiseFactory(), + ) + + val metrics = + combiner.computeMetrics( + varianceAccumulator { + count = 10 + normalizedSum = 120.0 + normalizedSumSquares = 1500.0 + } + ) + if (testCase.countExpected) { + assertThat(metrics.count).isNotNull() + } else { + assertThat(metrics.count).isNull() + } + + if (testCase.sumExpected) { + assertThat(metrics.sum).isNotNull() + } else { + assertThat(metrics.sum).isNull() + } + + if (testCase.meanExpected) { + assertThat(metrics.mean).isNotNull() + } else { + assertThat(metrics.mean).isNull() + } + } + + /** Zero-noise end-to-end check: an empty accumulator merged with the values 10.0, -10.0, 9.0 and 0.0 must give the exact variance. */ @Test + fun 
computeMetrics_withoutNoise_withMultipleContributionsIncludingEmptyAccumulator_returnsCorrectResult() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy( + ImmutableList.of(MetricDefinition(VARIANCE)), + minValue = -10.0, + maxValue = 10.0, + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + + val accumulator0 = combiner.emptyAccumulator() + val accumulator1 = + combiner.createAccumulator(privacyIdContributions { values += listOf(10.0, -10.0) }) + val accumulator2 = combiner.createAccumulator(privacyIdContributions { values += listOf(9.0) }) + val accumulator3 = combiner.createAccumulator(privacyIdContributions { values += listOf(0.0) }) + val accumulator01 = combiner.mergeAccumulators(accumulator0, accumulator1) + val accumulator23 = combiner.mergeAccumulators(accumulator2, accumulator3) + val finalAccumulator = combiner.mergeAccumulators(accumulator01, accumulator23) + val result = combiner.computeMetrics(finalAccumulator) + + // (10.0^2 + (-10.0)^2 + 9.0^2 + 0.0^2) / 4 - ((10.0 + -10.0 + 9.0 + 0.0) / 4)^2 = 65.1875 + assertThat(result.variance).isEqualTo(65.1875) + } + + /** Zero-noise metrics of the empty accumulator: count is expected to be 0.0 and mean, variance and sum to be NaN. NOTE(review): the method name says "NaNForCountMeanAndVariance", but count is asserted to be 0.0 and it is sum that is asserted to be NaN. */ @Test + fun computeMetrics_withoutNoise_onlyEmptyAccumulator_returnsZeroCountAndNaNForCountMeanAndVariance() { + val combiner = + VarianceCombiner( + AGG_PARAMS.copy( + ImmutableList.of( + MetricDefinition(VARIANCE), + MetricDefinition(MEAN), + MetricDefinition(SUM), + MetricDefinition(COUNT), + ), + minValue = 4.0, + maxValue = 10.0, + ), + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + UNUSED_ALLOCATED_BUDGET, + ZeroNoiseFactory(), + ) + + val result = combiner.computeMetrics(combiner.emptyAccumulator()) + + assertThat(result.count).isEqualTo(0.0) + // NaN because mean is not defined for count = 0. With noise enabled we will return a very + // noised mean with added mid value. + assertThat(result.mean).isNaN() + // NaN because variance is not defined for count = 0. 
With noise enabled we will return a very + // noised variance. + assertThat(result.variance).isNaN() + // sum is NaN as well because it is computed as count * mean = 0 * NaN = NaN. + assertThat(result.sum).isNaN() + } + + /** Shared fixtures: baseline params (MEAN and VARIANCE, Gaussian noise, bounds [-10.0, 10.0], 3 partitions, 5 contributions per partition) and a budget that `beforeClass` initializes once with epsilon 1.1 and delta 1e-3. */ companion object { + private val AGG_PARAMS = + AggregationParams( + metrics = ImmutableList.of(MetricDefinition(MEAN), MetricDefinition(VARIANCE)), + noiseKind = NoiseKind.GAUSSIAN, + maxPartitionsContributed = 3, + maxContributionsPerPartition = 5, + minValue = -10.0, + maxValue = 10.0, + ) + + private val UNUSED_ALLOCATED_BUDGET = AllocatedBudget() + + @JvmStatic + @BeforeClass + fun beforeClass() { + UNUSED_ALLOCATED_BUDGET.initialize(1.1, 1e-3) + } + } +} diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AbsoluteBudgetPerOpSpecTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AbsoluteBudgetPerOpSpecTest.kt new file mode 100644 index 00000000..a8bc3018 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/AbsoluteBudgetPerOpSpecTest.kt @@ -0,0 +1,68 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget
+
+import com.google.common.truth.Truth.assertThat
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import kotlin.test.assertFailsWith
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/**
+ * Tests for [AbsoluteBudgetPerOpSpec]: validation of the constructor arguments and scaling of the
+ * budget with `times`.
+ */
+@RunWith(TestParameterInjector::class)
+class AbsoluteBudgetPerOpSpecTest {
+  /** A negative epsilon or a negative delta must be rejected by the constructor. */
+  @Test
+  @TestParameters("{epsilon: -1.0, delta: 0.5}", "{epsilon: 0.5, delta: -1.0}")
+  fun create_invalidParameters_throws(epsilon: Double, delta: Double) {
+    assertFailsWith { AbsoluteBudgetPerOpSpec(epsilon, delta) }
+  }
+
+  /** Zero and positive values are accepted and exposed unchanged through the properties. */
+  @Test
+  @TestParameters(
+    "{epsilon: 0.5, delta: 0.5}",
+    "{epsilon: 0.0, delta: 0.5}",
+    "{epsilon: 0.5, delta: 0.0}",
+    "{epsilon: 0.0, delta: 0.0}",
+  )
+  fun create_validParameters_createsObjectChecksContents(epsilon: Double, delta: Double) {
+    val absoluteBudgetPerOpSpec = AbsoluteBudgetPerOpSpec(epsilon, delta)
+    assertThat(absoluteBudgetPerOpSpec).isNotNull()
+    assertThat(absoluteBudgetPerOpSpec.epsilon).isEqualTo(epsilon)
+    assertThat(absoluteBudgetPerOpSpec.delta).isEqualTo(delta)
+  }
+
+  /**
+   * Multiplying a spec by a factor must yield a spec equal to the expected (epsilon, delta) pair.
+   *
+   * The expected epsilon is named `calculatedEpsilon`: an absolute spec has no weight, and the
+   * parameter names have to match the keys used in the [TestParameters] strings.
+   */
+  @Test
+  @TestParameters(
+    "{initialEpsilon: 2.0, delta: 0.5, factor: 0.5, calculatedEpsilon: 1.0, calculatedDelta: 0.25}",
+    "{initialEpsilon: 2.0, delta: 0.5, factor: 1, calculatedEpsilon: 2.0, calculatedDelta: 0.5}",
+  )
+  fun times_validInput_hasCorrectCalculation(
+    initialEpsilon: Double,
+    delta: Double,
+    factor: Double,
+    calculatedEpsilon: Double,
+    calculatedDelta: Double,
+  ) {
+    val scaledSpec = AbsoluteBudgetPerOpSpec(initialEpsilon, delta).times(factor)
+
+    assertThat(scaledSpec).isEqualTo(AbsoluteBudgetPerOpSpec(calculatedEpsilon, calculatedDelta))
+  }
+
+  /** Scaling by a negative factor must fail. */
+  @Test
+  fun times_invalidCalculatedWeight_throws() {
+    assertFailsWith { AbsoluteBudgetPerOpSpec(1.0, 0.5).times(-1.0) }
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel
b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel new file mode 100644 index 00000000..94dd6419 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BUILD.bazel @@ -0,0 +1,36 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_test") + +kt_jvm_test( + name = "budget_tests", + srcs = [ + "AbsoluteBudgetPerOpSpecTest.kt", + "BudgetTests.kt", + "NaiveBudgetAccountantTest.kt", + "RelativeBudgetPerOpSpecTest.kt", + "TotalBudgetTest.kt", + ], + kotlinc_opts = "//:kotlinc_options_for_parameterized_tests", + test_class = "com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.BudgetTests", + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_accountant", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec", + "@maven//:com_google_testparameterinjector_test_parameter_injector", + "@maven//:com_google_truth_truth", + "@maven//:junit_junit", + "@maven//:org_jetbrains_kotlin_kotlin_test", + ], +) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetTests.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetTests.kt new file mode 100644 index 00000000..850fd613 --- /dev/null +++ 
b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/BudgetTests.kt
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget
+
+import org.junit.runner.RunWith
+import org.junit.runners.Suite
+
+/** JUnit suite handed to Bazel for this package; register every new test class here. */
+@RunWith(Suite::class)
+@Suite.SuiteClasses(
+  AbsoluteBudgetPerOpSpecTest::class,
+  NaiveBudgetAccountantTest::class,
+  RelativeBudgetPerOpSpecTest::class,
+  TotalBudgetTest::class,
+)
+class BudgetTests
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/NaiveBudgetAccountantTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/NaiveBudgetAccountantTest.kt
new file mode 100644
index 00000000..d7af4ed6
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/NaiveBudgetAccountantTest.kt
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget
+
+import com.google.common.truth.Truth.assertThat
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.GAUSSIAN_NOISE
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.LAPLACE_NOISE
+import com.google.privacy.differentialprivacy.pipelinedp4j.core.budget.AccountedMechanism.POSTAGGREGATED_PARTITION_SELECTION
+import com.google.testing.junit.testparameterinjector.TestParameter
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import kotlin.test.assertFailsWith
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/**
+ * Tests for [NaiveBudgetAccountant]: requesting budgets, allocating absolute and relative budgets,
+ * and the failures expected when the total budget cannot cover the requests.
+ *
+ * NOTE(review): in the `assertFailsWith("...") { ... }` calls below the string is the message that
+ * kotlin.test reports when the assertion itself fails. It is not compared against the message of
+ * the thrown exception.
+ */
+@RunWith(TestParameterInjector::class)
+class NaiveBudgetAccountantTest {
+
+  /** Reading epsilon or delta before allocateBudgets() has been called must fail. */
+  @Test
+  fun accessAllocatedBudget_beforeCallingAllocateBudgets_throws() {
+    val budgetAccountant = accountantWith(epsilon = 2.0, delta = 0.2)
+
+    val pendingBudget =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.1))
+
+    assertFailsWith { pendingBudget.epsilon() }
+    assertFailsWith { pendingBudget.delta() }
+  }
+
+  /** Once budgets have been allocated, further requests must be rejected. */
+  @Test
+  fun requestBudget_calledAfterAllocateBudget_throws() {
+    val budgetAccountant = accountantWith(epsilon = 2.0, delta = 0.2)
+    val lateRequest = absoluteGaussianRequest(epsilon = 1.0, delta = 0.1)
+    budgetAccountant.allocateBudgets()
+
+    val e = assertFailsWith { budgetAccountant.requestBudget(lateRequest) }
+
+    assertThat(e).hasMessageThat().contains("Budget cannot be requested")
+  }
+
+  /** An absolute request that fits into the total budget receives exactly what it asked for. */
+  @Test
+  fun allocateBudgets_allocatesAbsoluteBudget() {
+    val budgetAccountant = accountantWith(epsilon = 2.0, delta = 0.2)
+    val absoluteBudget =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.1))
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(absoluteBudget.epsilon()).isEqualTo(1.0)
+    assertThat(absoluteBudget.delta()).isEqualTo(0.1)
+  }
+
+  /** Absolute requests whose epsilons sum to more than the total epsilon make allocation fail. */
+  @Test
+  fun allocateBudgets_absoluteBudgetRequest_notEnoughEpsilon_throws() {
+    val budgetAccountant = accountantWith(epsilon = 2.0, delta = 0.2)
+    val unused1 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.1))
+    val unused2 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.1, delta = 0.1))
+
+    assertFailsWith("Can't allocate absolute budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** Absolute requests whose deltas sum to more than the total delta make allocation fail. */
+  @Test
+  fun allocateBudgets_absoluteBudgetRequest_notEnoughDelta_throws() {
+    val budgetAccountant = accountantWith(epsilon = 2.0, delta = 0.2)
+    val unused1 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.1))
+    val unused2 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.11))
+
+    assertFailsWith("Can't allocate absolute budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** Relative requests with weights 1, 2 and 3 split the total budget in the ratio 1:2:3. */
+  @Test
+  fun allocateBudgets_allocatesRelativeBudget() {
+    val budgetAccountant = accountantWith(epsilon = 60.0, delta = 0.6)
+    val weightOneBudget = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+    val weightTwoBudget = budgetAccountant.requestBudget(relativeGaussianRequest(2.0))
+    val weightThreeBudget = budgetAccountant.requestBudget(relativeGaussianRequest(3.0))
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(weightOneBudget.epsilon()).isEqualTo(10.0)
+    assertThat(weightOneBudget.delta()).isWithin(1e-13).of(0.1)
+    assertThat(weightTwoBudget.epsilon()).isEqualTo(20.0)
+    assertThat(weightTwoBudget.delta()).isWithin(1e-13).of(0.2)
+    assertThat(weightThreeBudget.epsilon()).isEqualTo(30.0)
+    assertThat(weightThreeBudget.delta()).isWithin(1e-13).of(0.3)
+  }
+
+  /** Delta is handed to a relative request only when its mechanism reports that it uses delta. */
+  @Test
+  fun allocateBudgets_allocatesRelativeDeltaOnlyIfMechanismNeedsIt(
+    @TestParameter mechanism: AccountedMechanism
+  ) {
+    val budgetAccountant = accountantWith(epsilon = 1.0, delta = 0.1)
+    val relativeBudget =
+      budgetAccountant.requestBudget(BudgetRequest(RelativeBudgetPerOpSpec(1.0), mechanism))
+
+    budgetAccountant.allocateBudgets()
+
+    val expectedDelta = if (mechanism.usesDelta) 0.1 else 0.0
+    assertThat(relativeBudget.delta()).isEqualTo(expectedDelta)
+  }
+
+  /** A single relative request receives the whole budget even if the total is below its weight. */
+  @Test
+  fun allocateBudgets_relativeEpsilonRequest_budgetLessThenRelativeEpsilon() {
+    val budgetAccountant = accountantWith(epsilon = 0.5, delta = 0.1)
+    val relativeBudget = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(relativeBudget.epsilon()).isEqualTo(0.5)
+    assertThat(relativeBudget.delta()).isEqualTo(0.1)
+  }
+
+  /** With a total epsilon of zero, a relative request cannot be served. */
+  @Test
+  fun allocateBudgets_relativeEpsilonRequest_noEpsilonAllocated_throws() {
+    val budgetAccountant = accountantWith(epsilon = 0.0, delta = 0.1)
+    // Every relative budget request asks for some epsilon.
+    val unused1 = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+
+    assertFailsWith("Can't allocate relative budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** When absolute requests use up all epsilon, a relative request cannot be served. */
+  @Test
+  fun allocateBudgets_relativeEpsilonRequest_noEpsilonLeft_throws() {
+    val budgetAccountant = accountantWith(epsilon = 1.0, delta = 0.1)
+    // The absolute request consumes the entire epsilon.
+    val unused1 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 1.0, delta = 0.05))
+    // Every relative budget request asks for some epsilon.
+    val unused2 = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+
+    assertFailsWith("Can't allocate relative budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** With a total delta of zero, a relative request for a delta-using mechanism must fail. */
+  @Test
+  fun allocateBudgets_relativeDeltaRequest_noDeltaAllocated_throws() {
+    val budgetAccountant = accountantWith(epsilon = 1.0, delta = 0.0)
+    // The Gaussian mechanism requires delta.
+    val unused1 = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+
+    assertFailsWith("Can't allocate relative budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** When absolute requests use up all delta, a relative Gaussian request must fail. */
+  @Test
+  fun allocateBudgets_relativeDeltaRequest_noDeltaLeft_throws() {
+    val budgetAccountant = accountantWith(epsilon = 1.0, delta = 0.1)
+    // The absolute request consumes the entire delta.
+    val unused1 =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 0.5, delta = 0.1))
+    // The Gaussian mechanism requires delta.
+    val unused2 = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+
+    assertFailsWith("Can't allocate relative budget") {
+      budgetAccountant.allocateBudgets()
+    }
+  }
+
+  /** Absolute requests are served as asked; relative ones split what remains by weight. */
+  @Test
+  fun allocateBudgets_composesAbsoluteAndRelativeBudgets() {
+    val budgetAccountant = accountantWith(epsilon = 60.0, delta = 0.6)
+    val absoluteBudget =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 30.0, delta = 0.3))
+    val weightOneBudget = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+    val weightTwoBudget = budgetAccountant.requestBudget(relativeGaussianRequest(2.0))
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(absoluteBudget.epsilon()).isEqualTo(30.0)
+    assertThat(absoluteBudget.delta()).isEqualTo(0.3)
+    assertThat(weightOneBudget.epsilon()).isEqualTo(10.0)
+    assertThat(weightOneBudget.delta()).isWithin(1e-13).of(0.1)
+    assertThat(weightTwoBudget.epsilon()).isEqualTo(20.0)
+    assertThat(weightTwoBudget.delta()).isWithin(1e-13).of(0.2)
+  }
+
+  /** An absolute request exceeding the total epsilon by only 1e-10 is still allocated in full. */
+  @Test
+  fun allocateBudgets_accountsForFloatingPointTolerance() {
+    val budgetAccountant = accountantWith(epsilon = 1.0, delta = 0.0)
+    val smallerThanFloatingPointError = 1.0 / 1e10
+    val slightlyOverTotalEpsilon = 1.0 + smallerThanFloatingPointError
+    val laplaceBudget =
+      budgetAccountant.requestBudget(
+        BudgetRequest(
+          AbsoluteBudgetPerOpSpec(epsilon = slightlyOverTotalEpsilon, delta = 0.0),
+          LAPLACE_NOISE,
+        )
+      )
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(laplaceBudget.epsilon()).isEqualTo(1.0 + smallerThanFloatingPointError)
+    assertThat(laplaceBudget.delta()).isEqualTo(0.0)
+  }
+
+  /**
+   * A relative request for post-aggregation partition selection is expected to receive a share of
+   * delta but zero epsilon, so the relative Gaussian request gets all of the remaining epsilon.
+   */
+  @Test
+  fun allocateBudgets_postAggregationThresholdingAllocatedCorrectly() {
+    val budgetAccountant = accountantWith(epsilon = 10.0, delta = 0.4)
+    val absoluteBudget =
+      budgetAccountant.requestBudget(absoluteGaussianRequest(epsilon = 3.0, delta = 0.1))
+    val gaussianRelativeBudget = budgetAccountant.requestBudget(relativeGaussianRequest(1.0))
+    val partitionSelectionBudget =
+      budgetAccountant.requestBudget(
+        BudgetRequest(RelativeBudgetPerOpSpec(2.0), POSTAGGREGATED_PARTITION_SELECTION)
+      )
+
+    budgetAccountant.allocateBudgets()
+
+    assertThat(absoluteBudget.epsilon()).isEqualTo(3.0)
+    assertThat(absoluteBudget.delta()).isEqualTo(0.1)
+    assertThat(gaussianRelativeBudget.epsilon()).isEqualTo(7.0)
+    assertThat(gaussianRelativeBudget.delta()).isWithin(1e-13).of(0.1)
+    assertThat(partitionSelectionBudget.epsilon()).isEqualTo(0.0)
+    assertThat(partitionSelectionBudget.delta()).isWithin(1e-13).of(0.2)
+  }
+
+  /** Creates an accountant whose total budget is the given (epsilon, delta) pair. */
+  private fun accountantWith(epsilon: Double, delta: Double) =
+    NaiveBudgetAccountant(TotalBudget(epsilon = epsilon, delta = delta))
+
+  /** Builds a Gaussian-noise request for a fixed (epsilon, delta) budget. */
+  private fun absoluteGaussianRequest(epsilon: Double, delta: Double) =
+    BudgetRequest(AbsoluteBudgetPerOpSpec(epsilon = epsilon, delta = delta), GAUSSIAN_NOISE)
+
+  /** Builds a Gaussian-noise request for a relative budget with the given weight. */
+  private fun relativeGaussianRequest(weight: Double) =
+    BudgetRequest(RelativeBudgetPerOpSpec(weight), GAUSSIAN_NOISE)
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/RelativeBudgetPerOpSpecTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/RelativeBudgetPerOpSpecTest.kt
new file mode 100644
index 00000000..d683975b
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/RelativeBudgetPerOpSpecTest.kt
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget
+
+import com.google.common.truth.Truth.assertThat
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import kotlin.test.assertFailsWith
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/**
+ * Tests for [RelativeBudgetPerOpSpec]: validation of the weight and scaling of the weight with
+ * `times`.
+ */
+@RunWith(TestParameterInjector::class)
+class RelativeBudgetPerOpSpecTest {
+  /** A negative or zero weight must be rejected by the constructor. */
+  @Test
+  @TestParameters("{weight: -1.0}", "{weight: 0.0}")
+  fun create_invalidWeight_throws(weight: Double) {
+    assertFailsWith { RelativeBudgetPerOpSpec(weight) }
+  }
+
+  /** A positive weight is accepted and exposed unchanged through the property. */
+  @Test
+  fun create_validWeight_createsObjectChecksContents() {
+    val spec = RelativeBudgetPerOpSpec(1.0)
+
+    assertThat(spec).isNotNull()
+    assertThat(spec.weight).isEqualTo(1.0)
+  }
+
+  /** Multiplying a spec by a factor must yield a spec equal to one with the expected weight. */
+  @Test
+  @TestParameters(
+    "{initialWeight: 2.0, factor: 0.5, calculatedWeight: 1.0}",
+    "{initialWeight: 2.0, factor: 1, calculatedWeight: 2.0}",
+  )
+  fun times_validInput_hasCorrectCalculation(
+    initialWeight: Double,
+    factor: Double,
+    calculatedWeight: Double,
+  ) {
+    val scaledSpec = RelativeBudgetPerOpSpec(initialWeight).times(factor)
+
+    assertThat(scaledSpec).isEqualTo(RelativeBudgetPerOpSpec(calculatedWeight))
+  }
+
+  /** Scaling by zero or by a negative factor must fail. */
+  @Test
+  @TestParameters("{factor: 0.0}", "{factor: -1.0}")
+  fun times_invalidCalculatedWeight_throws(factor: Double) {
+    assertFailsWith { RelativeBudgetPerOpSpec(1.0).times(factor) }
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/TotalBudgetTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/TotalBudgetTest.kt
new file mode 100644
index 00000000..59b6fc37
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget/TotalBudgetTest.kt
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this
file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.core.budget
+
+import com.google.common.truth.Truth.assertThat
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import com.google.testing.junit.testparameterinjector.TestParameters
+import kotlin.test.assertFailsWith
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/** Tests for [TotalBudget]: validation and storage of the constructor arguments. */
+@RunWith(TestParameterInjector::class)
+class TotalBudgetTest {
+  /** A negative epsilon or a negative delta must be rejected by the constructor. */
+  @Test
+  @TestParameters("{epsilon: -1.0, delta: 0.5}", "{epsilon: 0.5, delta: -1.0}")
+  fun create_invalidParameters_throws(epsilon: Double, delta: Double) {
+    assertFailsWith { TotalBudget(epsilon, delta) }
+  }
+
+  /** Zero and positive values are accepted and exposed unchanged through the properties. */
+  @Test
+  @TestParameters(
+    "{epsilon: 0.5, delta: 0.5}",
+    "{epsilon: 0.0, delta: 0.5}",
+    "{epsilon: 0.5, delta: 0.0}",
+    "{epsilon: 0.0, delta: 0.0}",
+  )
+  fun create_validParameters_createsObjectChecksContents(epsilon: Double, delta: Double) {
+    val budget = TotalBudget(epsilon, delta)
+
+    assertThat(budget).isNotNull()
+    assertThat(budget.epsilon).isEqualTo(epsilon)
+    assertThat(budget.delta).isEqualTo(delta)
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel
new file mode 100644
index 00000000..14c22249
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/BUILD.bazel
@@ -0,0 +1,30 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_kotlin//kotlin:jvm.bzl", "kt_jvm_test") + +kt_jvm_test( + name = "local_framework_tests", + srcs = glob( + ["*.kt"], + ), + test_class = "com.google.privacy.differentialprivacy.pipelinedp4j.local.LocalFrameworkTests", + deps = [ + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections", + "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_encoders", + "@maven//:com_google_testparameterinjector_test_parameter_injector", + "@maven//:com_google_truth_truth", + "@maven//:junit_junit", + ], +) diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollectionTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollectionTest.kt new file mode 100644 index 00000000..9f609e16 --- /dev/null +++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalCollectionTest.kt @@ -0,0 +1,73 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.local
+
+import com.google.common.truth.Truth.assertThat
+import org.junit.Test
+import org.junit.runner.RunWith
+import org.junit.runners.JUnit4
+
+/** Tests for the transformations offered by [LocalCollection]. */
+@RunWith(JUnit4::class)
+class LocalCollectionTest {
+  // A per-instance helper rather than a constant, so it is private and named in lowerCamelCase.
+  private val encoderFactory = LocalEncoderFactory()
+
+  /** distinct() must drop repeated elements. */
+  @Test
+  fun distinct_removesDuplicates() {
+    val localCollection = LocalCollection(sequenceOf(1, 2, 1))
+
+    val result = localCollection.distinct("stageName")
+
+    assertThat(result.data.asIterable()).containsExactly(1, 2)
+  }
+
+  /** map() must apply the mapping function to every element. */
+  @Test
+  fun map_appliesMapFn() {
+    val localCollection = LocalCollection(sequenceOf(1))
+
+    val result: LocalCollection =
+      localCollection.map("Test", encoderFactory.strings()) { it.toString() } as LocalCollection
+
+    assertThat(result.data.asIterable()).containsExactly("1")
+  }
+
+  /** keyBy() must pair every element with the key computed for it. */
+  @Test
+  fun keyBy_keysCollection() {
+    val localCollection = LocalCollection(sequenceOf(1))
+
+    val result: LocalTable =
+      localCollection.keyBy("Test", encoderFactory.strings()) { it.toString() } as LocalTable
+
+    assertThat(result.data.asIterable()).containsExactly(Pair("1", 1))
+  }
+
+  /** mapToTable() must emit the key/value pair produced by the mapping function. */
+  @Test
+  fun mapToTable_appliesMapFn() {
+    val localCollection = LocalCollection(sequenceOf(1))
+
+    val result: LocalTable =
+      localCollection.mapToTable("Test", encoderFactory.strings(), encoderFactory.ints()) {
+        Pair(it.toString(), it)
+      } as LocalTable
+
+    assertThat(result.data.asIterable()).containsExactly(Pair("1", 1))
+  }
+}
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalFrameworkTests.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalFrameworkTests.kt
new file mode 100644
index 00000000..3a262647
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalFrameworkTests.kt
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.local
+
+import org.junit.runner.RunWith
+import org.junit.runners.Suite
+
+/** JUnit suite listing the local-framework test classes for Bazel; add new test classes here. */
+@RunWith(Suite::class)
+@Suite.SuiteClasses(value = [LocalCollectionTest::class, LocalTableTest::class])
+class LocalFrameworkTests
diff --git a/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTableTest.kt b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTableTest.kt
new file mode 100644
index 00000000..1153da33
--- /dev/null
+++ b/pipelinedp4j/tests/com/google/privacy/differentialprivacy/pipelinedp4j/local/LocalTableTest.kt
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.privacy.differentialprivacy.pipelinedp4j.local
+
+import com.google.common.truth.Truth.assertThat
+import com.google.testing.junit.testparameterinjector.TestParameter
+import com.google.testing.junit.testparameterinjector.TestParameterInjector
+import org.junit.Test
+import org.junit.runner.RunWith
+
+/**
+ * Unit tests for [LocalTable].
+ *
+ * The class runs with [TestParameterInjector] because `filterKeys_keepsAllowedKeys` takes a
+ * [TestParameter] argument.
+ */
+@RunWith(TestParameterInjector::class)
+class LocalTableTest {
+  // Supplies the encoder arguments required by the map*/flatMap* signatures.
+  private val encoderFactory = LocalEncoderFactory()
+
+  @Test
+  fun map_appliesMapFn() {
+    val localTable = LocalTable(sequenceOf(1 to 10))
+    val mapFn: (Int, Int) -> String = { k, v -> "${k}_$v" }
+
+    val result: LocalCollection<String> =
+      localTable.map("Test", encoderFactory.strings(), mapFn) as LocalCollection<String>
+
+    assertThat(result.data.asIterable()).containsExactly("1_10")
+  }
+
+  @Test
+  fun groupAndCombineValues_appliesCombiner() {
+    val localTable =
+      LocalTable(
+        sequenceOf(
+          "positive" to 1,
+          "positive" to 10,
+          "negative" to -1,
+          "negative" to -10,
+        )
+      )
+    val combineFn: (Int, Int) -> Int = { v1, v2 -> v1 + v2 }
+
+    val result = localTable.groupAndCombineValues("Test", combineFn) as LocalTable<String, Int>
+
+    assertThat(result.data.asIterable()).containsExactly("positive" to 11, "negative" to -11)
+  }
+
+  @Test
+  fun groupAndCombineValues_executesLazy() {
+    // The source sequence throws if any element is pulled before this flag is flipped.
+    var initialized = false
+    val localTable =
+      LocalTable(
+        (1..2).asSequence().map {
+          require(initialized) { "Not initialized" }
+          "key" to it
+        }
+      )
+    val combineFn: (Int, Int) -> Int = { v1, v2 -> v1 + v2 }
+
+    // Check that groupAndCombineValues() is lazy, i.e. it does not access elements of localTable.
+    val result = localTable.groupAndCombineValues("Test", combineFn) as LocalTable<String, Int>
+
+    // Check that when the input collection is initialized, it is safe to access the output
+    // elements.
+    initialized = true
+    assertThat(result.data.toList()).containsExactly("key" to 3)
+  }
+
+  @Test
+  fun groupByKey_groupsValues() {
+    val localTable = LocalTable(sequenceOf("positive" to 1, "positive" to 10, "negative" to -1))
+
+    val result = localTable.groupByKey("Test")
+
+    assertThat(result.data.toList())
+      .containsExactly("positive" to listOf(1, 10), "negative" to listOf(-1))
+  }
+
+  @Test
+  fun groupByKey_executesLazy() {
+    // The source sequence throws if any element is pulled before this flag is flipped.
+    var initialized = false
+    val localTable =
+      LocalTable(
+        (1..2).asSequence().map {
+          require(initialized) { "Not initialized" }
+          it to it
+        }
+      )
+
+    // Check that groupByKey() is lazy, i.e. it does not access elements of localTable.
+    val result = localTable.groupByKey("Test")
+
+    // Check that when the input collection is initialized, it is safe to access the output
+    // elements.
+    initialized = true
+    assertThat(result.data.toList()).containsExactly(1 to listOf(1), 2 to listOf(2))
+  }
+
+  @Test
+  fun keys_returnsKeys() {
+    val localTable = LocalTable(sequenceOf("key" to "value"))
+
+    val result: LocalCollection<String> = localTable.keys("Test") as LocalCollection<String>
+
+    assertThat(result.data.asIterable()).containsExactly("key")
+  }
+
+  @Test
+  fun values_returnsValues() {
+    val localTable = LocalTable(sequenceOf("key" to "value"))
+
+    val result: LocalCollection<String> = localTable.values("Test") as LocalCollection<String>
+
+    assertThat(result.data.asIterable()).containsExactly("value")
+  }
+
+  @Test
+  fun mapValues_appliesMapFn() {
+    val localTable = LocalTable(sequenceOf("one" to 1))
+    val mapFn: (String, Int) -> String = { k, v -> "${k}_$v" }
+
+    val result: LocalTable<String, String> =
+      localTable.mapValues("Test", encoderFactory.strings(), mapFn) as LocalTable<String, String>
+
+    assertThat(result.data.asIterable()).containsExactly("one" to "one_1")
+  }
+
+  @Test
+  fun mapToTable_appliesMapFn() {
+    val localTable = LocalTable(sequenceOf("one" to 1))
+    // Swaps key and value so the output table is keyed by the int.
+    val mapFn: (String, Int) -> Pair<Int, String> = { k, v -> v to k }
+
+    val result: LocalTable<Int, String> =
+      localTable.mapToTable("Test", encoderFactory.ints(), encoderFactory.strings(), mapFn)
+        as LocalTable<Int, String>
+
+    assertThat(result.data.asIterable()).containsExactly(1 to "one")
+  }
+
+  @Test
+  fun flatMapToTable_appliesMapFn() {
+    val localTable = LocalTable(sequenceOf("one" to 1))
+    // Emits two identical swapped pairs per input element.
+    val mapFn: (String, Int) -> Sequence<Pair<Int, String>> = { k, v ->
+      sequenceOf(v to k, v to k)
+    }
+
+    val result: LocalTable<Int, String> =
+      localTable.flatMapToTable("Test", encoderFactory.ints(), encoderFactory.strings(), mapFn)
+        as LocalTable<Int, String>
+
+    assertThat(result.data.asIterable()).containsExactly(1 to "one", 1 to "one")
+  }
+
+  @Test
+  fun filterValues_appliesPredicate() {
+    val localTable = LocalTable(sequenceOf("one" to 1, "two" to 2))
+    val predicate: (Int) -> Boolean = { v -> v == 1 }
+
+    val result: LocalTable<String, Int> =
+      localTable.filterValues("Test", predicate) as LocalTable<String, Int>
+
+    assertThat(result.data.asIterable()).containsExactly("one" to 1)
+  }
+
+  @Test
+  fun filterKeys_appliesPredicate() {
+    val localTable = LocalTable(sequenceOf("one" to 1, "two" to 2, "two" to -2))
+    val predicate: (String) -> Boolean = { k -> k == "two" }
+
+    val result = localTable.filterKeys("Test", predicate)
+
+    assertThat(result.data.asIterable()).containsExactly("two" to -2, "two" to 2)
+  }
+
+  @Test
+  fun filterKeys_keepsAllowedKeys(@TestParameter unbalancedKeys: Boolean) {
+    val localTable = LocalTable(sequenceOf("one" to 1, "two" to 2, "three" to 3, "two" to -2))
+    // "four" is allowed but absent from the table; "one" is present but not allowed.
+    val allowedKeys: LocalCollection<String> = LocalCollection(sequenceOf("three", "two", "four"))
+
+    val result: LocalTable<String, Int> =
+      localTable.filterKeys("Test", allowedKeys, unbalancedKeys) as LocalTable<String, Int>
+
+    assertThat(result.data.asIterable()).containsExactly("two" to 2, "two" to -2, "three" to 3)
+  }
+
+  @Test
+  fun flattenWith_flattensCollections() {
+    val localTable = LocalTable(sequenceOf("one" to 1))
+    val otherLocalTable: LocalTable<String, Int> = LocalTable(sequenceOf("two" to 2))
+
+    val result: LocalTable<String, Int> = localTable.flattenWith("Test", otherLocalTable)
+
+    assertThat(result.data.asIterable()).containsExactly("one" to 1, "two" to 2)
+  }
+
+  @Test
+  fun samplePerKey_samplesElements() {
+    // Keys "one" and "two" have five values each; key "three" has only two.
+    val localTable =
+      LocalTable(
+        sequenceOf(
+          "one" to 1,
+          "one" to 2,
+          "one" to 3,
+          "one" to 4,
+          "one" to 5,
+          "two" to 6,
+          "two" to 7,
+          "two" to 8,
+          "two" to 9,
+          "two" to 10,
+          "three" to 11,
+          "three" to 12,
+        )
+      )
+
+    val result = localTable.samplePerKey("Test", 3)
+
+    // Exactly one output row per key, and each row holds at most the requested 3 values.
+    val resultData = result.data.toList()
+    assertThat(resultData).hasSize(3)
+    val sampleSizeByKey = resultData.associate { (key, sampledValues) -> key to sampledValues.size }
+    assertThat(sampleSizeByKey).containsExactly("one", 3, "two", 3, "three", 2)
+  }
+}