diff --git a/privbayes-synthesizer/code/diffprivlib/__init__.py b/privbayes-synthesizer/code/diffprivlib/__init__.py new file mode 100644 index 0000000..c21d873 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/__init__.py @@ -0,0 +1,32 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Differential Privacy Library for Python +======================================= + +The IBM Differential Privacy Library is a library for writing, executing and experimenting with differential privacy. +The library includes basic differential privacy mechanisms, the building blocks of differential privacy; tools for +basic data analysis with differential privacy; and machine learning models that satisfy differential privacy.
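+
+A minimal usage sketch (illustrative only; the result is noisy and varies between runs, so
+the value shown is a placeholder):
+
+>>> import numpy as np
+>>> import diffprivlib as dp
+>>> X = np.random.random(100)
+>>> dp.tools.mean(X, epsilon=1.0, bounds=(0, 1))  # doctest: +SKIP
+0.51...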
+ +""" +from diffprivlib import mechanisms +from diffprivlib import models +from diffprivlib import tools +from diffprivlib.accountant import BudgetAccountant + +__version__ = '0.6.3' diff --git a/privbayes-synthesizer/code/diffprivlib/__pycache__/__init__.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..226f110 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/__pycache__/__init__.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/__pycache__/accountant.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/__pycache__/accountant.cpython-311.pyc new file mode 100644 index 0000000..8045fe8 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/__pycache__/accountant.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/__pycache__/utils.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000..03172d0 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/__pycache__/utils.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/__pycache__/validation.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/__pycache__/validation.cpython-311.pyc new file mode 100644 index 0000000..8f756f3 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/__pycache__/validation.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/accountant.py b/privbayes-synthesizer/code/diffprivlib/accountant.py new file mode 100644 index 0000000..d9ec66f --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/accountant.py @@ -0,0 +1,469 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2020 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Privacy budget accountant for differential privacy +""" +from numbers import Integral + +import numpy as np + +from diffprivlib.utils import Budget, BudgetError +from diffprivlib.validation import check_epsilon_delta + + +class BudgetAccountant: + """Privacy budget accountant for differential privacy. + + This class creates a privacy budget accountant to track privacy spend across queries and other data accesses. Once + initialised, the BudgetAccountant stores each privacy spend and iteratively updates the total budget spend, raising + an error when the budget ceiling (if specified) is exceeded. 
The accountant can be initialised without any maximum + budget, to enable users to track the total privacy spend of their actions without hindrance. + + Diffprivlib functions can make use of a BudgetAccountant in three different ways (see examples for more details): + + - Passed as an ``accountant`` parameter to the function (e.g., ``mean(..., accountant=acc)``) + - Set as the default using the ``set_default()`` method (all subsequent diffprivlib functions will use the + accountant by default) + - As a context manager using a ``with`` statement (the accountant is used for that block of code) + + Implements the accountant rules as given in [KOV17]_. + + Parameters + ---------- + epsilon : float, default: infinity + Epsilon budget ceiling of the accountant. + + delta : float, default: 1.0 + Delta budget ceiling of the accountant. + + slack : float, default: 0.0 + Slack allowed in delta spend. Greater slack may reduce the overall epsilon spend. + + spent_budget : list of tuples of the form (epsilon, delta), optional + List of tuples of pre-existing budget spends. Allows for a new accountant to be initialised with spends + extracted from a previous instance. + + Attributes + ---------- + epsilon : float + Epsilon budget ceiling of the accountant. + + delta : float + Delta budget ceiling of the accountant. + + slack : float + The accountant's slack. Can be modified at runtime, subject to the privacy budget not being exceeded. + + spent_budget : list of tuples of the form (epsilon, delta) + The list of privacy spends recorded by the accountant. Can be used in the initialisation of a new accountant. + + Examples + -------- + + A ``BudgetAccountant`` is typically passed to diffprivlib functions as an ``accountant`` parameter. If ``epsilon`` + and ``delta`` are not set, the accountant has an infinite budget by default, allowing you to track privacy spend + without imposing a hard limit. By allowing a ``slack`` in the budget calculation, the overall epsilon privacy spend + can be reduced (at the cost of extra delta spend). + + >>> import diffprivlib as dp + >>> from numpy.random import random + >>> X = random(100) + >>> acc = dp.BudgetAccountant(epsilon=1.5, delta=0) + >>> dp.tools.mean(X, bounds=(0, 1), accountant=acc) + 0.4547006207923884 + >>> acc.total() + (epsilon=1.0, delta=0) + >>> dp.tools.std(X, bounds=(0, 1), epsilon=0.25, accountant=acc) + 0.2630216611181259 + >>> acc.total() + (epsilon=1.25, delta=0) + + >>> acc2 = dp.BudgetAccountant() # infinite budget + >>> first_half = dp.tools.mean(X[:50], epsilon=0.25, bounds=(0, 1), accountant=acc2) + >>> last_half = dp.tools.mean(X[50:], epsilon=0.25, bounds=(0, 1), accountant=acc2) + >>> acc2.total() + (epsilon=0.5, delta=0) + >>> acc2.remaining() + (epsilon=inf, delta=1.0) + + >>> acc3 = dp.BudgetAccountant(slack=1e-3) + >>> for i in range(20): + ... dp.tools.mean(X, epsilon=0.05, bounds=(0, 1), accountant=acc3) + >>> acc3.total() # Slack has reduced the epsilon spend by almost 25% + (epsilon=0.7613352285668463, delta=0.001) + + Using ``set_default()``, an accountant is used by default in all diffprivlib functions in that script. Accountants + also act as context managers, allowing for use in a ``with`` statement. Passing an accountant as a parameter + overrides all other methods.
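+
+    Exceeding a finite budget ceiling raises a ``BudgetError``. A sketch continuing from ``acc`` above
+    (which has epsilon=0.25 of its 1.5 ceiling remaining; the error message follows from ``check()`` below):
+
+    >>> dp.tools.mean(X, epsilon=0.5, bounds=(0, 1), accountant=acc)  # doctest: +SKIP
+    Traceback (most recent call last):
+        ...
+    diffprivlib.utils.BudgetError: Privacy spend of (0.5,0) not permissible; will exceed remaining privacy budget. Use BudgetAccountant.remaining() to check remaining budget.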
+ + >>> acc4 = dp.BudgetAccountant() + >>> acc4.set_default() + BudgetAccountant() + >>> Y = random((100, 2)) - 0.5 + >>> clf = dp.models.PCA(1, centered=True, data_norm=1.4) + >>> clf.fit(Y) + PCA(accountant=BudgetAccountant(spent_budget=[(1.0, 0)]), centered=True, copy=True, data_norm=1.4, epsilon=1.0, + n_components=1, random_state=None, bounds=None, whiten=False) + >>> acc4.total() + (epsilon=1.0, delta=0) + + >>> with dp.BudgetAccountant() as acc5: + ... dp.tools.mean(Y, bounds=(0, 1), epsilon=1/3) + >>> acc5.total() + (epsilon=0.3333333333333333, delta=0) + + References + ---------- + .. [KOV17] Kairouz, Peter, Sewoong Oh, and Pramod Viswanath. "The composition theorem for differential privacy." + IEEE Transactions on Information Theory 63.6 (2017): 4037-4049. + + """ + _default = None + + def __init__(self, epsilon=float("inf"), delta=1.0, slack=0.0, spent_budget=None): + check_epsilon_delta(epsilon, delta) + self.__epsilon = epsilon + self.__min_epsilon = 0 if epsilon == float("inf") else epsilon * 1e-14 + self.__delta = delta + self.__spent_budget = [] + self.slack = slack + + if spent_budget is not None: + if not isinstance(spent_budget, list): + raise TypeError("spent_budget must be a list") + + for _epsilon, _delta in spent_budget: + self.spend(_epsilon, _delta) + + def __repr__(self, n_budget_max=5): + params = [] + if self.epsilon != float("inf"): + params.append(f"epsilon={self.epsilon}") + + if self.delta != 1: + params.append(f"delta={self.delta}") + + if self.slack > 0: + params.append(f"slack={self.slack}") + + if self.spent_budget: + if len(self.spent_budget) > n_budget_max: + params.append("spent_budget=" + str(self.spent_budget[:n_budget_max] + ["..."]).replace("'", "")) + else: + params.append("spent_budget=" + str(self.spent_budget)) + + return "BudgetAccountant(" + ", ".join(params) + ")" + + def __enter__(self): + self.old_default = self.pop_default() + self.set_default() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.pop_default() + + if self.old_default is not None: + self.old_default.set_default() + del self.old_default + + def __len__(self): + return len(self.spent_budget) + + @property + def slack(self): + """Slack parameter for composition. + """ + return self.__slack + + @slack.setter + def slack(self, slack): + if not 0 <= slack <= self.delta: + raise ValueError(f"Slack must be between 0 and delta ({self.delta}), inclusive. Got {slack}.") + + epsilon_spent, delta_spent = self.total(slack=slack) + + if self.epsilon < epsilon_spent or self.delta < delta_spent: + raise BudgetError(f"Privacy budget will be exceeded by changing slack to {slack}.") + + self.__slack = slack + + @property + def spent_budget(self): + """List of tuples of the form (epsilon, delta) of spent privacy budget. + """ + return self.__spent_budget.copy() + + @property + def epsilon(self): + """Epsilon privacy ceiling of the accountant. + """ + return self.__epsilon + + @property + def delta(self): + """Delta privacy ceiling of the accountant. + """ + return self.__delta + + def total(self, spent_budget=None, slack=None): + """Returns the total current privacy spend. + + `spent_budget` and `slack` can be specified as parameters, otherwise the class values will be used. + + Parameters + ---------- + spent_budget : list of tuples of the form (epsilon, delta), optional + List of tuples of budget spends. If not provided, the accountant's spends will be used. + + slack : float, optional + Slack in delta for composition. 
If not provided, the accountant's slack will be used. + + Returns + ------- + epsilon : float + Total epsilon spend. + + delta : float + Total delta spend. + + """ + if spent_budget is None: + spent_budget = self.spent_budget + else: + for epsilon, delta in spent_budget: + check_epsilon_delta(epsilon, delta) + + if slack is None: + slack = self.slack + elif not 0 <= slack <= self.delta: + raise ValueError(f"Slack must be between 0 and delta ({self.delta}), inclusive. Got {slack}.") + + epsilon_sum, epsilon_exp_sum, epsilon_sq_sum = 0, 0, 0 + + for epsilon, _ in spent_budget: + epsilon_sum += epsilon + epsilon_exp_sum += (1 - np.exp(-epsilon)) * epsilon / (1 + np.exp(-epsilon)) + epsilon_sq_sum += epsilon ** 2 + + total_epsilon_naive = epsilon_sum + total_delta = self.__total_delta_safe(spent_budget, slack) + + if slack == 0: + return Budget(total_epsilon_naive, total_delta) + + total_epsilon_drv = epsilon_exp_sum + np.sqrt(2 * epsilon_sq_sum * np.log(1 / slack)) + total_epsilon_kov = epsilon_exp_sum + np.sqrt(2 * epsilon_sq_sum * + np.log(np.exp(1) + np.sqrt(epsilon_sq_sum) / slack)) + + return Budget(min(total_epsilon_naive, total_epsilon_drv, total_epsilon_kov), total_delta) + + def check(self, epsilon, delta): + """Checks if the provided (epsilon,delta) can be spent without exceeding the accountant's budget ceiling. + + Parameters + ---------- + epsilon : float + Epsilon budget spend to check. + + delta : float + Delta budget spend to check. + + Returns + ------- + bool + True if the budget can be spent, otherwise a :class:`.BudgetError` is raised. + + Raises + ------ + BudgetError + If the specified budget spend will result in the budget ceiling being exceeded. + + """ + check_epsilon_delta(epsilon, delta) + if self.epsilon == float("inf") and self.delta == 1: + return True + + if 0 < epsilon < self.__min_epsilon: + raise ValueError(f"Epsilon must be at least {self.__min_epsilon} if non-zero, got {epsilon}.") + + spent_budget = self.spent_budget + [(epsilon, delta)] + + if Budget(self.epsilon, self.delta) >= self.total(spent_budget=spent_budget): + return True + + raise BudgetError(f"Privacy spend of ({epsilon},{delta}) not permissible; will exceed remaining privacy budget." + f" Use {self.__class__.__name__}.{self.remaining.__name__}() to check remaining budget.") + + def remaining(self, k=1): + """Calculates the budget that remains to be spent. + + Calculates the privacy budget that can be spent on `k` queries. Spending this budget on `k` queries will + match the budget ceiling, assuming no floating point errors. + + Parameters + ---------- + k : int, default: 1 + The number of queries for which to calculate the remaining budget. + + Returns + ------- + epsilon : float + Total epsilon spend remaining for `k` queries. + + delta : float + Total delta spend remaining for `k` queries. 
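+
+        A sketch under basic composition (``slack=0``), where the leftover epsilon is split evenly across
+        the `k` queries (illustrative; the binary search makes the values approximate):
+
+        >>> acc = BudgetAccountant(epsilon=1, delta=0)
+        >>> _ = acc.spend(0.4, 0)
+        >>> acc.remaining(k=2)  # doctest: +SKIP
+        (epsilon=0.3, delta=0)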
+ + """ + if not isinstance(k, Integral): + raise TypeError(f"k must be integer-valued, got {type(k)}.") + if k < 1: + raise ValueError(f"k must be at least 1, got {k}.") + + _, spent_delta = self.total() + delta = 1 - ((1 - self.delta) / (1 - spent_delta)) ** (1 / k) if spent_delta < 1.0 else 1.0 + # delta = 1 - np.exp((np.log(1 - self.delta) - np.log(1 - spent_delta)) / k) + + lower = 0 + upper = self.epsilon + old_interval_size = (upper - lower) * 2 + + while old_interval_size > upper - lower: + old_interval_size = upper - lower + mid = (upper + lower) / 2 + + spent_budget = self.spent_budget + [(mid, 0)] * k + x_0, _ = self.total(spent_budget=spent_budget) + + if x_0 >= self.epsilon: + upper = mid + if x_0 <= self.epsilon: + lower = mid + + epsilon = (upper + lower) / 2 + + return Budget(epsilon, delta) + + def spend(self, epsilon, delta): + """Spend the given privacy budget. + + Instructs the accountant to spend the given epsilon and delta privacy budget, while ensuring the target budget + is not exceeded. + + Parameters + ---------- + epsilon : float + Epsilon privacy budget to spend. + + delta : float + Delta privacy budget to spend. + + Returns + ------- + self : BudgetAccountant + + """ + self.check(epsilon, delta) + self.__spent_budget.append((epsilon, delta)) + return self + + @staticmethod + def __total_delta_safe(spent_budget, slack): + """ + Calculate total delta spend of `spent_budget`, with special consideration for floating point arithmetic. + Should yield greater precision, especially for a large number of budget spends with very small delta. + + Parameters + ---------- + spent_budget: list of tuples of the form (epsilon, delta) + List of budget spends, for which the total delta spend is to be calculated. + + slack: float + Delta slack parameter for composition of spends. + + Returns + ------- + float + Total delta spend. + + """ + delta_spend = [slack] + for _, delta in spent_budget: + delta_spend.append(delta) + delta_spend.sort() + + # (1 - a) * (1 - b) = 1 - (a + b - a * b) + prod = 0 + for delta in delta_spend: + prod += delta - prod * delta + + return prod + + @staticmethod + def load_default(accountant): + """Loads the default privacy budget accountant if none is supplied, otherwise checks that the supplied + accountant is a BudgetAccountant class. + + An accountant can be set as the default using the `set_default()` method. If no default has been set, a default + is created. + + Parameters + ---------- + accountant : BudgetAccountant or None + The supplied budget accountant. If None, the default accountant is returned. + + Returns + ------- + default : BudgetAccountant + Returns a working BudgetAccountant, either the supplied `accountant` or the existing default. + + """ + if accountant is None: + if BudgetAccountant._default is None: + BudgetAccountant._default = BudgetAccountant() + + return BudgetAccountant._default + + if not isinstance(accountant, BudgetAccountant): + raise TypeError(f"Accountant must be of type BudgetAccountant, got {type(accountant)}") + + return accountant + + def set_default(self): + """Sets the current accountant to be the default when running functions and queries with diffprivlib. + + Returns + ------- + self : BudgetAccountant + + """ + BudgetAccountant._default = self + return self + + @staticmethod + def pop_default(): + """Pops the default BudgetAccountant from the class and returns it to the user. + + Returns + ------- + default : BudgetAccountant + Returns the existing default BudgetAccountant. 
+ + """ + default = BudgetAccountant._default + BudgetAccountant._default = None + return default diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__init__.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/__init__.py new file mode 100644 index 0000000..dbcb1a1 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/__init__.py @@ -0,0 +1,34 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Basic mechanisms for achieving differential privacy, the basic building blocks of the library. +""" +from diffprivlib.mechanisms.base import DPMachine, DPMechanism, TruncationAndFoldingMixin + +from diffprivlib.mechanisms.binary import Binary +from diffprivlib.mechanisms.bingham import Bingham +from diffprivlib.mechanisms.exponential import Exponential, ExponentialCategorical, ExponentialHierarchical, \ + PermuteAndFlip +from diffprivlib.mechanisms.gaussian import Gaussian, GaussianAnalytic, GaussianDiscrete +from diffprivlib.mechanisms.geometric import Geometric, GeometricFolded, GeometricTruncated +from diffprivlib.mechanisms.laplace import Laplace, LaplaceBoundedDomain, LaplaceBoundedNoise, LaplaceFolded,\ + LaplaceTruncated +from diffprivlib.mechanisms.snapping import Snapping +from diffprivlib.mechanisms.staircase import Staircase +from diffprivlib.mechanisms.uniform import Uniform +from diffprivlib.mechanisms.vector import Vector diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/__init__.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..0628152 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/__init__.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/base.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000..65950e1 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/base.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/binary.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/binary.cpython-311.pyc new file mode 100644 index 0000000..055d158 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/binary.cpython-311.pyc differ diff --git 
a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/bingham.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/bingham.cpython-311.pyc new file mode 100644 index 0000000..94ca92d Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/bingham.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/exponential.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/exponential.cpython-311.pyc new file mode 100644 index 0000000..2c70606 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/exponential.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/gaussian.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/gaussian.cpython-311.pyc new file mode 100644 index 0000000..8c24fba Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/gaussian.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/geometric.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/geometric.cpython-311.pyc new file mode 100644 index 0000000..fb1ccd2 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/geometric.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/laplace.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/laplace.cpython-311.pyc new file mode 100644 index 0000000..cd99384 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/laplace.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/snapping.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/snapping.cpython-311.pyc new file mode 100644 index 0000000..21a6839 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/snapping.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/staircase.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/staircase.cpython-311.pyc new file mode 100644 index 0000000..288aab3 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/staircase.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/uniform.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/uniform.cpython-311.pyc new file mode 100644 index 0000000..30890a9 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/uniform.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/vector.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/vector.cpython-311.pyc new file mode 100644 index 0000000..031d90c Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/__pycache__/vector.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/base.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/base.py new file mode 100644 index 0000000..2554cb5 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/base.py @@ -0,0 +1,269 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby 
granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Base classes for differential privacy mechanisms. +""" +import abc +from copy import copy +import inspect +from numbers import Real + +from diffprivlib.utils import check_random_state + + +class DPMachine(abc.ABC): + """ + Parent class for :class:`.DPMechanism` and :class:`.DPTransformer`, providing and specifying basic functionality. + + """ + @abc.abstractmethod + def randomise(self, value): + """Randomise `value` with the mechanism. + + Parameters + ---------- + value : int or float or str or method + The value to be randomised. + + Returns + ------- + int or float or str or method + The randomised value, same type as `value`. + + """ + + def copy(self): + """Produces a copy of the class. + + Returns + ------- + self : class + Returns the copy. + + """ + return copy(self) + + +class DPMechanism(DPMachine, abc.ABC): + r"""Abstract base class for all mechanisms. Instantiated from :class:`.DPMachine`. + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + + delta : float + Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with + ``epsilon``. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + """ + def __init__(self, *, epsilon, delta, random_state=None): + self.epsilon, self.delta = self._check_epsilon_delta(epsilon, delta) + self.random_state = random_state + + self._rng = check_random_state(random_state, True) + + def __repr__(self): + attrs = inspect.getfullargspec(self.__class__).kwonlyargs + attr_output = [] + + for attr in attrs: + attr_output.append(attr + "=" + repr(self.__getattribute__(attr))) + + return str(self.__module__) + "." + str(self.__class__.__name__) + "(" + ", ".join(attr_output) + ")" + + @abc.abstractmethod + def randomise(self, value): + """Randomise `value` with the mechanism. + + Parameters + ---------- + value : int or float or str or method + The value to be randomised. + + Returns + ------- + int or float or str or method + The randomised value, same type as `value`. + + """ + + def bias(self, value): + """Returns the bias of the mechanism at a given `value`. + + Parameters + ---------- + value : int or float + The value at which the bias of the mechanism is sought. 
+ + Returns + ------- + bias : float or None + The bias of the mechanism at `value` if defined, `None` otherwise. + + """ + raise NotImplementedError + + def variance(self, value): + """Returns the variance of the mechanism at a given `value`. + + Parameters + ---------- + value : int or float + The value at which the variance of the mechanism is sought. + + Returns + ------- + variance : float or None + The variance of the mechanism at `value` if defined, `None` otherwise. + + """ + raise NotImplementedError + + def mse(self, value): + """Returns the mean squared error (MSE) of the mechanism at a given `value`. + + Parameters + ---------- + value : int or float + The value at which the MSE of the mechanism is sought. + + Returns + ------- + mse : float or None + The MSE of the mechanism at `value` if defined, `None` otherwise. + + """ + return self.variance(value) + (self.bias(value)) ** 2 + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not isinstance(epsilon, Real) or not isinstance(delta, Real): + raise TypeError("Epsilon and delta must be numeric") + + if epsilon < 0: + raise ValueError("Epsilon must be non-negative") + + if not 0 <= delta <= 1: + raise ValueError("Delta must be in [0, 1]") + + if epsilon + delta == 0: + raise ValueError("Epsilon and Delta cannot both be zero") + + return float(epsilon), float(delta) + + def _check_all(self, value): + del value + self._check_epsilon_delta(self.epsilon, self.delta) + + return True + + class TruncationAndFoldingMixin: # pylint: disable=too-few-public-methods + """Mixin for truncating or folding the outputs of a mechanism. Must be instantiated with a :class:`.DPMechanism`. + + Parameters + ---------- + lower : float + The lower bound of the mechanism. + + upper : float + The upper bound of the mechanism. + + """ + def __init__(self, *, lower, upper): + if not isinstance(self, DPMechanism): + raise TypeError("TruncationAndFoldingMixin must be implemented alongside a :class:`.DPMechanism`") + + self.lower, self.upper = self._check_bounds(lower, upper) + + @classmethod + def _check_bounds(cls, lower, upper): + """Performs a check on the bounds provided for the mechanism.""" + if not isinstance(lower, Real) or not isinstance(upper, Real): + raise TypeError("Bounds must be numeric") + + if lower > upper: + raise ValueError("Lower bound must not be greater than upper bound") + + return lower, upper + + def _check_all(self, value): + """Checks that all parameters of the mechanism have been initialised correctly""" + del value + self._check_bounds(self.lower, self.upper) + + return True + + def _truncate(self, value): + if value > self.upper: + return self.upper + if value < self.lower: + return self.lower + + return value + + def _fold(self, value): + if value < self.lower: + return self._fold(2 * self.lower - value) + if value > self.upper: + return self._fold(2 * self.upper - value) + + return value + + def bernoulli_neg_exp(gamma, random_state=None): + """Sample from Bernoulli(exp(-gamma)). + + Adapted from "The Discrete Gaussian for Differential Privacy", Canonne, Kamath, Steinke, 2020. + https://arxiv.org/pdf/2004.00010v2.pdf + + Parameters + ---------- + gamma : float + Parameter to sample from Bernoulli(exp(-gamma)). Must be non-negative. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + Returns + ------- + One sample from the Bernoulli(exp(-gamma)) distribution.
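+
+    For example (illustrative; each draw is random, but the sample mean should approach
+    exp(-1) ≈ 0.368):
+
+    >>> draws = [bernoulli_neg_exp(1.0, random_state=i) for i in range(1000)]
+    >>> sum(draws) / 1000  # doctest: +SKIP
+    0.36...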
+ + """ + if gamma < 0: + raise ValueError(f"Gamma must be non-negative, got {gamma}.") + + rng = check_random_state(random_state, True) + + while gamma > 1: + gamma -= 1 + if not bernoulli_neg_exp(1, rng): + return 0 + + counter = 1 + + while rng.random() <= gamma / counter: + counter += 1 + + return counter % 2 diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/binary.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/binary.py new file mode 100644 index 0000000..b574bcb --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/binary.py @@ -0,0 +1,119 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +The binary mechanism for differential privacy. + +""" +import numpy as np + +from diffprivlib.mechanisms.base import DPMechanism +from diffprivlib.utils import copy_docstring + + +class Binary(DPMechanism): + r"""The classic binary mechanism in differential privacy. + + Given a binary input value, the mechanism randomly decides to flip to the other binary value or not, in order to + satisfy differential privacy. + + Paper link: https://arxiv.org/pdf/1612.05568.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + + value0 : str + 0th binary label. + + value1 : str + 1st binary label. Cannot be the same as ``value0``. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + Notes + ----- + * The binary attributes, known as `labels`, must be specified as strings. If non-string labels are required (e.g. + integer-valued labels), a :class:`.DPTransformer` can be used (e.g. :class:`.IntToString`). + + """ + def __init__(self, *, epsilon, value0, value1, random_state=None): + super().__init__(epsilon=epsilon, delta=0.0, random_state=random_state) + self.value0, self.value1 = self._check_labels(value0, value1) + + @classmethod + def _check_labels(cls, value0, value1): + if not isinstance(value0, str) or not isinstance(value1, str): + raise TypeError("Binary labels must be strings. Use a DPTransformer (e.g. 
transformers.IntToString) for " + "non-string labels") + + if len(value0) * len(value1) == 0: + raise ValueError("Binary labels must be non-empty strings") + + if value0 == value1: + raise ValueError("Binary labels must not match") + + return value0, value1 + + def _check_all(self, value): + super()._check_all(value) + self._check_labels(self.value0, self.value1) + + if not isinstance(value, str): + raise TypeError("Value to be randomised must be a string") + + if value not in [self.value0, self.value1]: + raise ValueError(f"Value to be randomised is not in the domain {{\"{self.value0}\", \"{self.value1}\"}}, " + f"got \"{value}\".") + + return True + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + def randomise(self, value): + """Randomise `value` with the mechanism. + + Parameters + ---------- + value : str + The value to be randomised. + + Returns + ------- + str + The randomised value. + + """ + self._check_all(value) + + indicator = 0 if value == self.value0 else 1 + + unif_rv = self._rng.random() * (np.exp(self.epsilon) + 1) + + if unif_rv > np.exp(self.epsilon) + self.delta: + indicator = 1 - indicator + + return self.value1 if indicator else self.value0 diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/bingham.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/bingham.py new file mode 100644 index 0000000..8103502 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/bingham.py @@ -0,0 +1,152 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +The Bingham mechanism in differential privacy, for estimating the first eigenvector of a covariance matrix. +""" +import secrets +from numbers import Real + +import numpy as np + +from diffprivlib.mechanisms.base import DPMechanism +from diffprivlib.utils import copy_docstring + + +class Bingham(DPMechanism): + r""" + The Bingham mechanism in differential privacy. + + Used to estimate the first eigenvector (associated with the largest eigenvalue) of a covariance matrix. + + Paper link: http://eprints.whiterose.ac.uk/123206/7/simbingham8.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + sensitivity : float, default: 1 + The sensitivity of the mechanism. Must be in [0, ∞). 
+ + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + """ + def __init__(self, *, epsilon, sensitivity=1.0, random_state=None): + super().__init__(epsilon=epsilon, delta=0, random_state=random_state) + self.sensitivity = self._check_sensitivity(sensitivity) + + if isinstance(self._rng, secrets.SystemRandom): + self._rng = np.random.default_rng() + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not delta == 0: + raise ValueError("Delta must be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_sensitivity(cls, sensitivity): + if not isinstance(sensitivity, Real): + raise TypeError("Sensitivity must be numeric") + + if sensitivity < 0: + raise ValueError("Sensitivity must be non-negative") + + return float(sensitivity) + + def _check_all(self, value): + super()._check_all(value) + self._check_sensitivity(self.sensitivity) + + if not isinstance(value, np.ndarray): + raise TypeError(f"Value to be randomised must be a numpy array, got {type(value)}") + if value.ndim != 2: + raise ValueError(f"Array must be 2-dimensional, got {value.ndim} dimensions") + if value.shape[0] != value.shape[1]: + raise ValueError(f"Array must be square, got {value.shape[0]} x {value.shape[1]}") + if not np.allclose(value, value.T): + raise ValueError("Array must be symmetric, supplied array is not.") + + return True + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + def randomise(self, value): + """Randomise `value` with the mechanism. + + Parameters + ---------- + value : numpy array + The data to be randomised. + + Returns + ------- + numpy array + The randomised eigenvector. 
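+
+        A usage sketch (illustrative; the sampled direction is random but always unit-norm, and the
+        input must be a square symmetric array):
+
+        >>> import numpy as np
+        >>> rows = np.random.random((50, 3))
+        >>> cov = rows.T @ rows / 50  # symmetric, square input as required
+        >>> vec = Bingham(epsilon=1.0).randomise(cov)  # doctest: +SKIP
+        >>> float(np.linalg.norm(vec))  # doctest: +SKIP
+        1.0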
+ + """ + self._check_all(value) + + eigvals, eigvecs = np.linalg.eigh(value) + dims = value.shape[0] + + if dims == 1: + return np.ones((1, 1)) + if self.sensitivity / self.epsilon == 0: + return eigvecs[:, eigvals.argmax()] + + value_translated = self.epsilon * (eigvals.max() * np.eye(dims) - value) / 4 / self.sensitivity + translated_eigvals = np.linalg.eigvalsh(value_translated) + + left, right, mid = 1, dims, (1 + dims) / 2 + old_interval_size = (right - left) * 2 + + while right - left < old_interval_size: + old_interval_size = right - left + + mid = (right + left) / 2 + f_mid = np.array([1 / (mid + 2 * eig) for eig in translated_eigvals]).sum() + + if f_mid <= 1: + right = mid + + if f_mid >= 1: + left = mid + + b_const = mid + omega = np.eye(dims) + 2 * value_translated / b_const + omega_inv = np.linalg.inv(omega) + norm_const = np.exp(-(dims - b_const) / 2) * ((dims / b_const) ** (dims / 2)) + + while True: + rnd_vec = self._rng.multivariate_normal(np.zeros(dims), omega_inv / 4, size=4).sum(axis=0) + unit_vec = rnd_vec / np.linalg.norm(rnd_vec) + prob = np.exp(-unit_vec.dot(value_translated).dot(unit_vec)) / norm_const\ + / ((unit_vec.dot(omega).dot(unit_vec)) ** (dims / 2)) + + if self._rng.random() <= prob: + return unit_vec diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/exponential.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/exponential.py new file mode 100644 index 0000000..139114c --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/exponential.py @@ -0,0 +1,570 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Implementation of the standard exponential mechanism, and its derivative, the hierarchical mechanism. +""" +from numbers import Real + +import numpy as np + +from diffprivlib.mechanisms.base import DPMechanism, bernoulli_neg_exp +from diffprivlib.mechanisms.binary import Binary +from diffprivlib.utils import copy_docstring + + +class Exponential(DPMechanism): + r""" + The exponential mechanism for achieving differential privacy on candidate selection, as first proposed by McSherry + and Talwar. + + The exponential mechanism achieves differential privacy by randomly choosing a candidate subject to candidate + utility scores, with greater probability given to higher-utility candidates. + + Paper link: https://www.cs.drexel.edu/~greenie/privacy/mdviadp.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. 
Must be in (0, ∞]. + + sensitivity : float + The sensitivity in utility values to a change in a datapoint in the underlying dataset. + + utility : list + A list of non-negative utility values for each candidate. + + monotonic : bool, default: False + Specifies if the utility function is monotonic, i.e. that adding an individual to the underlying dataset can + only increase the values in `utility`. + + candidates : list, optional + An optional list of candidate labels. If omitted, the zero-indexed list [0, 1, ..., n] is used. + + measure : list, optional + An optional list of measures for each candidate. If omitted, a uniform measure is used. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + """ + def __init__(self, *, epsilon, sensitivity, utility, monotonic=False, candidates=None, measure=None, + random_state=None): + super().__init__(epsilon=epsilon, delta=0.0, random_state=random_state) + self.sensitivity = self._check_sensitivity(sensitivity) + self.utility, self.candidates, self.measure = self._check_utility_candidates_measure(utility, candidates, + measure) + self.monotonic = bool(monotonic) + self._probabilities = self._find_probabilities(self.epsilon, self.sensitivity, self.utility, self.monotonic, + self.measure) + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not delta == 0: + raise ValueError("Delta must be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_sensitivity(cls, sensitivity): + if not isinstance(sensitivity, Real): + raise TypeError("Sensitivity must be numeric") + + if sensitivity < 0: + raise ValueError("Sensitivity must be non-negative") + + return float(sensitivity) + + @classmethod + def _check_utility_candidates_measure(cls, utility, candidates, measure): + if not isinstance(utility, list): + raise TypeError(f"Utility must be a list, got a {type(utility)}.") + + if not all(isinstance(u, Real) for u in utility): + raise TypeError("Utility must be a list of real-valued numbers.") + + if len(utility) < 1: + raise ValueError("Utility must have at least one element.") + + if np.isinf(utility).any(): + raise ValueError("Utility must be a list of finite numbers.") + + if candidates is not None: + if not isinstance(candidates, list): + raise TypeError(f"Candidates must be a list, got a {type(candidates)}.") + + if len(candidates) != len(utility): + raise ValueError("List of candidates must be the same length as the list of utility values.") + + if measure is not None: + if not isinstance(measure, list): + raise TypeError(f"Measure must be a list, got a {type(measure)}.") + + if not all(isinstance(m, Real) for m in measure): + raise TypeError("Measure must be a list of real-valued numbers.") + + if np.isinf(measure).any(): + raise ValueError("Measure must be a list of finite numbers.") + + if len(measure) != len(utility): + raise ValueError("List of measures must be the same length as the list of utility values.") + + return utility, candidates, measure + + @classmethod + def _find_probabilities(cls, epsilon, sensitivity, utility, monotonic, measure): + scale = epsilon / sensitivity / (2 - monotonic) if sensitivity / epsilon > 0 else float("inf") + + # Set max utility to 0 to avoid overflow on high utility; will be normalised out before returning + utility = np.array(utility) - max(utility) + + if np.isinf(scale): + probabilities = np.isclose(utility,
0).astype(float) + else: + probabilities = np.exp(scale * utility) + + probabilities *= np.array(measure) if measure else 1 + probabilities /= probabilities.sum() + + return np.cumsum(probabilities) + + def _check_all(self, value): + super()._check_all(value) + self._check_sensitivity(self.sensitivity) + self._check_utility_candidates_measure(self.utility, self.candidates, self.measure) + + if value is not None: + raise ValueError(f"Value to be randomised must be None. Got: {value}.") + + return True + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + def randomise(self, value=None): + """Select a candidate with differential privacy. + + Parameters + ---------- + value : None + Ignored. + + Returns + ------- + int or other + The randomised candidate. + + """ + self._check_all(value) + + rand = self._rng.random() + + if np.any(rand <= self._probabilities): + idx = np.argmax(rand <= self._probabilities) + elif np.isclose(rand, self._probabilities[-1]): + idx = len(self._probabilities) - 1 + else: + raise RuntimeError("Can't find a candidate to return. " + f"Debugging info: Rand: {rand}, Probabilities: {self._probabilities}") + + return self.candidates[idx] if self.candidates else idx + + +class PermuteAndFlip(Exponential): + r""" + The permute and flip mechanism for achieving differential privacy on candidate selection, as first proposed by + McKenna and Sheldon. + + The permute and flip mechanism is an alternative to the exponential mechanism, and achieves differential privacy by + randomly choosing a candidate subject to candidate utility scores, with greater probability given to higher-utility + candidates. + + Paper link: https://arxiv.org/pdf/2010.12603.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + sensitivity : float + The sensitivity in utility values to a change in a datapoint in the underlying dataset. + + utility : list + A list of non-negative utility values for each candidate. + + monotonic : bool, default: False + Specifies if the utility function is monotonic, i.e. that adding an individual to the underlying dataset can + only increase the values in `utility`. + + candidates : list, optional + An optional list of candidate labels. If omitted, the zero-indexed list [0, 1, ..., n] is used. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
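+
+    A selection sketch (illustrative; higher-utility candidates are returned with higher
+    probability, so the output varies between runs):
+
+    >>> mech = PermuteAndFlip(epsilon=1.0, sensitivity=1.0, utility=[3.0, 1.0, 0.5],
+    ...                       candidates=["a", "b", "c"])
+    >>> mech.randomise()  # doctest: +SKIP
+    'a'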
+ + """ + def __init__(self, *, epsilon, sensitivity, utility, monotonic=False, candidates=None, random_state=None): + super().__init__(epsilon=epsilon, sensitivity=sensitivity, utility=utility, monotonic=monotonic, + candidates=candidates, measure=None, random_state=random_state) + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + @classmethod + def _find_probabilities(cls, epsilon, sensitivity, utility, monotonic, measure): + scale = epsilon / sensitivity / (2 - monotonic) if sensitivity / epsilon > 0 else float("inf") + + utility = np.array(utility) + utility -= max(utility) + + if np.isinf(scale): + log_probabilities = np.ones_like(utility) * (-float("inf")) + log_probabilities[utility == 0] = 0 + else: + log_probabilities = scale * utility + + return log_probabilities + + def randomise(self, value=None): + """Select a candidate with differential privacy. + + Parameters + ---------- + value : None + Ignored. + + Returns + ------- + int or other + The randomised candidate. + + """ + self._check_all(value) + + candidate_ids = list(range(len(self.utility))) + + while candidate_ids: + idx = candidate_ids[int(self._rng.random() * len(candidate_ids))] + candidate_ids.remove(idx) + + if bernoulli_neg_exp(-self._probabilities[idx], self._rng): + return self.candidates[idx] if self.candidates else idx + + raise RuntimeError(f"No value to return. Probabilities: {self._probabilities}.") + + +class ExponentialCategorical(DPMechanism): + r""" + The exponential mechanism for achieving differential privacy on categorical inputs, as first proposed by McSherry + and Talwar. + + The exponential mechanism achieves differential privacy by randomly choosing an output value for a given input + value, with greater probability given to values 'closer' to the input, as measured by a given utility function. + + Paper link: https://www.cs.drexel.edu/~greenie/privacy/mdviadp.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + utility_list : list of tuples + The utility list of the mechanism. Must be specified as a list of tuples, of the form ("value1", "value2", + utility), where each `value` is a string and `utility` is a strictly positive float. A `utility` must be + specified for every pair of values given in the `utility_list`. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
+ + """ + def __init__(self, *, epsilon, utility_list, random_state=None): + super().__init__(epsilon=epsilon, delta=0.0, random_state=random_state) + + self._balanced_tree = False + self._utility_values, self._sensitivity, self._domain_values = self._build_utility(utility_list) + self._check_utility_full(self._domain_values) + self._normalising_constant = self._build_normalising_constant() + + def _build_utility(self, utility_list): + if not isinstance(utility_list, list): + raise TypeError("Utility must be given in a list") + + self._normalising_constant = None + + utility_values = {} + domain_values = [] + sensitivity = 0 + + for _utility_sub_list in utility_list: + value1, value2, utility_value = _utility_sub_list + + if not isinstance(value1, str) or not isinstance(value2, str): + raise TypeError("Utility keys must be strings") + if not isinstance(utility_value, Real): + raise TypeError("Utility value must be a number") + if utility_value < 0.0: + raise ValueError("Utility values must be non-negative") + + sensitivity = max(sensitivity, utility_value) + if value1 not in domain_values: + domain_values.append(value1) + if value2 not in domain_values: + domain_values.append(value2) + + if value1 == value2: + continue + if value1 < value2: + utility_values[(value1, value2)] = utility_value + else: + utility_values[(value2, value1)] = utility_value + + self._utility_values = utility_values + self._sensitivity = sensitivity + self._domain_values = domain_values + + return utility_values, sensitivity, domain_values + + def _check_utility_full(self, domain_values): + missing = [] + + for val1 in domain_values: + for val2 in domain_values: + if val1 >= val2: + continue + + if (val1, val2) not in self._utility_values: + missing.append((val1, val2)) + + if missing: + raise ValueError(f"Utility values missing: {missing}") + + return True + + @property + def utility_list(self): + """Gets the utility list of the mechanism, in the same form as accepted by `.set_utility_list`. + + Returns + ------- + utility_list : list of tuples (str, str, float), or None + Returns a list of tuples of the form ("value1", "value2", utility), or `None` if the utility has not yet + been set. 
+ + """ + utility_list = [] + + for _key, _utility in self._utility_values.items(): + value1, value2 = _key + utility_list.append((value1, value2, _utility)) + + return utility_list + + def _build_normalising_constant(self, re_eval=False): + balanced_tree = True + first_constant_value = None + normalising_constant = {} + + for _base_leaf in self._domain_values: + constant_value = 0.0 + + for _target_leaf in self._domain_values: + constant_value += self._get_prob(_base_leaf, _target_leaf) + + normalising_constant[_base_leaf] = constant_value + + if first_constant_value is None: + first_constant_value = constant_value + elif not np.isclose(constant_value, first_constant_value): + balanced_tree = False + + # If the tree is balanced, we can eliminate the doubling factor + if balanced_tree and not re_eval: + self._balanced_tree = True + return self._build_normalising_constant(True) + + return normalising_constant + + def _get_utility(self, value1, value2): + if value1 == value2: + return 0 + + if value1 > value2: + return self._get_utility(value1=value2, value2=value1) + + return self._utility_values[(value1, value2)] + + def _get_prob(self, value1, value2): + if value1 == value2: + return 1.0 + + balancing_factor = 1 if self._balanced_tree else 2 + return np.exp(- self.epsilon * self._get_utility(value1, value2) / balancing_factor / self._sensitivity) + + def _check_all(self, value): + super()._check_all(value) + + if not isinstance(value, str): + raise TypeError("Value to be randomised must be a string") + + if value not in self._domain_values: + raise ValueError(f"Value \"{value}\" not in domain") + + return True + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not delta == 0: + raise ValueError("Delta must be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + @copy_docstring(Binary.randomise) + def randomise(self, value): + self._check_all(value) + + unif_rv = self._rng.random() * self._normalising_constant[value] + cum_prob = 0 + _target_value = None + + for _target_value in self._normalising_constant.keys(): + cum_prob += self._get_prob(value, _target_value) + + if unif_rv <= cum_prob: + return _target_value + + return _target_value + + +class ExponentialHierarchical(ExponentialCategorical): + r""" + Adaptation of the exponential mechanism to hierarchical data. Simplifies the process of specifying utility values, + as the values can be inferred from the hierarchy. + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + hierarchy : nested list of str + The hierarchy as specified as a nested list of string. Each string must be a leaf node, and each leaf node + must lie at the same depth in the hierarchy. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
+ + Examples + -------- + Example hierarchies: + + >>> flat_hierarchy = ["A", "B", "C", "D", "E"] + >>> nested_hierarchy = [["A"], ["B"], ["C"], ["D", "E"]] + + """ + def __init__(self, *, epsilon, hierarchy, random_state=None): + self.hierarchy = hierarchy + utility_list = self._build_utility_list(self._build_hierarchy(hierarchy)) + super().__init__(epsilon=epsilon, utility_list=utility_list, random_state=random_state) + self._list_hierarchy = None + + def _build_hierarchy(self, nested_list, parent_node=None): + if not isinstance(nested_list, list): + raise TypeError("Hierarchy must be a list") + + if parent_node is None: + parent_node = [] + + hierarchy = {} + + for _i, _value in enumerate(nested_list): + if isinstance(_value, str): + hierarchy[_value] = parent_node + [_i] + elif not isinstance(_value, list): + raise TypeError("All leaves of the hierarchy must be a string " + + "(see node " + str(parent_node + [_i]) + ")") + else: + hierarchy.update(self._build_hierarchy(_value, parent_node + [_i])) + + self._check_hierarchy_height(hierarchy) + + return hierarchy + + @staticmethod + def _check_hierarchy_height(hierarchy): + hierarchy_height = None + for _value, _hierarchy_locator in hierarchy.items(): + if hierarchy_height is None: + hierarchy_height = len(_hierarchy_locator) + elif len(_hierarchy_locator) != hierarchy_height: + raise ValueError( + f"Leaves of the hierarchy must all be at the same level (node {str(_hierarchy_locator)} is at " + f"level {len(_hierarchy_locator)} instead of hierarchy height {hierarchy_height})" + ) + + @staticmethod + def _build_utility_list(hierarchy): + if not isinstance(hierarchy, dict): + raise TypeError("Hierarchy for _build_utility_list must be a dict") + + utility_list = [] + hierarchy_height = None + + for _root_value, _root_hierarchy_locator in hierarchy.items(): + if hierarchy_height is None: + hierarchy_height = len(_root_hierarchy_locator) + + for _target_value, _target_hierarchy_locator in hierarchy.items(): + if _root_value >= _target_value: + continue + + i = 0 + while (i < len(_root_hierarchy_locator) and + _root_hierarchy_locator[i] == _target_hierarchy_locator[i]): + i += 1 + + utility_list.append([_root_value, _target_value, hierarchy_height - i]) + + return utility_list + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/gaussian.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/gaussian.py new file mode 100644 index 0000000..c57c7c0 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/gaussian.py @@ -0,0 +1,362 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +The classic Gaussian mechanism in differential privacy, and its derivatives. +""" +from math import erf +from numbers import Real, Integral + +import numpy as np + +from diffprivlib.mechanisms.base import DPMechanism, bernoulli_neg_exp +from diffprivlib.mechanisms.geometric import Geometric +from diffprivlib.mechanisms.laplace import Laplace +from diffprivlib.utils import copy_docstring + + +class Gaussian(DPMechanism): + r"""The Gaussian mechanism in differential privacy. + + First proposed by Dwork and Roth in "The algorithmic foundations of differential privacy" [DR14]_. Samples from the + Gaussian distribution are generated using two samples from `random.normalvariate` as detailed in [HB21b]_, to + prevent against reconstruction attacks due to limited floating point precision. + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, 1]. For ``epsilon > 1``, use + :class:`.GaussianAnalytic`. + + delta : float + Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 1]. + + sensitivity : float + The sensitivity of the mechanism. Must be in [0, ∞). + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + References + ---------- + .. [DR14] Dwork, Cynthia, and Aaron Roth. "The algorithmic foundations of differential privacy." Found. Trends + Theor. Comput. Sci. 9, no. 3-4 (2014): 211-407. + + .. [HB21b] Holohan, Naoise, and Stefano Braghin. "Secure Random Sampling in Differential Privacy." arXiv preprint + arXiv:2107.10138 (2021). + + """ + def __init__(self, *, epsilon, delta, sensitivity, random_state=None): + super().__init__(epsilon=epsilon, delta=delta, random_state=random_state) + self.sensitivity = self._check_sensitivity(sensitivity) + self._scale = np.sqrt(2 * np.log(1.25 / self.delta)) * self.sensitivity / self.epsilon + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if epsilon == 0 or delta == 0: + raise ValueError("Neither Epsilon nor Delta can be zero") + + if isinstance(epsilon, Real) and epsilon > 1.0: + raise ValueError("Epsilon cannot be greater than 1. 
If required, use GaussianAnalytic instead.") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_sensitivity(cls, sensitivity): + if not isinstance(sensitivity, Real): + raise TypeError("Sensitivity must be numeric") + + if sensitivity < 0: + raise ValueError("Sensitivity must be non-negative") + + return float(sensitivity) + + def _check_all(self, value): + super()._check_all(value) + self._check_sensitivity(self.sensitivity) + + if not isinstance(value, Real): + raise TypeError("Value to be randomised must be a number") + + return True + + @copy_docstring(Laplace.bias) + def bias(self, value): + return 0.0 + + @copy_docstring(Laplace.variance) + def variance(self, value): + self._check_all(0) + + return self._scale ** 2 + + @copy_docstring(Laplace.randomise) + def randomise(self, value): + self._check_all(value) + + try: + standard_normal = (self._rng.normalvariate(0, 1) + self._rng.normalvariate(0, 1)) / np.sqrt(2) + except AttributeError: # random_state is a np.random.RandomState + standard_normal = (self._rng.standard_normal() + self._rng.standard_normal()) / np.sqrt(2) + + return value + standard_normal * self._scale + + +class GaussianAnalytic(Gaussian): + r"""The analytic Gaussian mechanism in differential privacy. + + As first proposed by Balle and Wang in "Improving the Gaussian Mechanism for Differential Privacy: Analytical + Calibration and Optimal Denoising". + + Paper link: https://arxiv.org/pdf/1805.06530.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + delta : float + Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 1]. + + sensitivity : float + The sensitivity of the mechanism. Must be in [0, ∞). + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
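+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); unlike :class:`.Gaussian`, ``epsilon`` may exceed 1:
+
+    >>> from diffprivlib.mechanisms import GaussianAnalytic
+    >>> mech = GaussianAnalytic(epsilon=2.0, delta=1e-5, sensitivity=1.0, random_state=42)
+    >>> noisy = mech.randomise(0.5)  # a float perturbed with analytically-calibrated Gaussian noise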
+ + """ + def __init__(self, *, epsilon, delta, sensitivity, random_state=None): + super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state) + self._scale = self._find_scale() + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if epsilon == 0 or delta == 0: + raise ValueError("Neither Epsilon nor Delta can be zero") + + return DPMechanism._check_epsilon_delta(epsilon, delta) # pylint: disable=protected-access + + def _check_all(self, value): + super()._check_all(value) + + return True + + def _find_scale(self): + if self.sensitivity / self.epsilon == 0: + return 0.0 + + epsilon = self.epsilon + delta = self.delta + + def phi(val): + return (1 + erf(val / np.sqrt(2))) / 2 + + def b_plus(val): + return phi(np.sqrt(epsilon * val)) - np.exp(epsilon) * phi(- np.sqrt(epsilon * (val + 2))) - delta + + def b_minus(val): + return phi(- np.sqrt(epsilon * val)) - np.exp(epsilon) * phi(- np.sqrt(epsilon * (val + 2))) - delta + + delta_0 = b_plus(0) + + if delta_0 < 0: + target_func = b_plus + else: + target_func = b_minus + + # Find the starting interval by doubling the initial size until the target_func sign changes, as suggested + # in the paper + left = 0 + right = 1 + + while target_func(left) * target_func(right) > 0: + left = right + right *= 2 + + # Binary search code copied from mechanisms.LaplaceBoundedDomain + old_interval_size = (right - left) * 2 + + while old_interval_size > right - left: + old_interval_size = right - left + middle = (right + left) / 2 + + if target_func(middle) * target_func(left) <= 0: + right = middle + if target_func(middle) * target_func(right) <= 0: + left = middle + + alpha = np.sqrt(1 + (left + right) / 4) + (-1 if delta_0 < 0 else 1) * np.sqrt((left + right) / 4) + + return alpha * self.sensitivity / np.sqrt(2 * self.epsilon) + + +class GaussianDiscrete(DPMechanism): + r"""The Discrete Gaussian mechanism in differential privacy. + + As proposed by Canonne, Kamath and Steinke, re-purposed for approximate :math:`(\epsilon,\delta)`-differential + privacy. + + Paper link: https://arxiv.org/pdf/2004.00010.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + delta : float + Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 1]. + + sensitivity : int, default: 1 + The sensitivity of the mechanism. Must be in [0, ∞). + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
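+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); values and sensitivity must be integers:
+
+    >>> from diffprivlib.mechanisms import GaussianDiscrete
+    >>> mech = GaussianDiscrete(epsilon=1.0, delta=1e-5, random_state=42)
+    >>> noisy = mech.randomise(10)  # an integer perturbed with discrete Gaussian noise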
+ + """ + def __init__(self, *, epsilon, delta, sensitivity=1, random_state=None): + super().__init__(epsilon=epsilon, delta=delta, random_state=random_state) + self.sensitivity = self._check_sensitivity(sensitivity) + self._scale = self._find_scale() + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if epsilon == 0 or delta == 0: + raise ValueError("Neither Epsilon nor Delta can be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_sensitivity(cls, sensitivity): + if not isinstance(sensitivity, Integral): + raise TypeError("Sensitivity must be an integer") + + if sensitivity < 0: + raise ValueError("Sensitivity must be non-negative") + + return sensitivity + + def _check_all(self, value): + super()._check_all(value) + self._check_sensitivity(self.sensitivity) + + if not isinstance(value, Integral): + raise TypeError("Value to be randomised must be an integer") + + return True + + @copy_docstring(Laplace.bias) + def bias(self, value): + return 0.0 + + @copy_docstring(Laplace.variance) + def variance(self, value): + raise NotImplementedError + + @copy_docstring(Geometric.randomise) + def randomise(self, value): + self._check_all(value) + + if self._scale == 0: + return value + + tau = 1 / (1 + np.floor(self._scale)) + sigma2 = self._scale ** 2 + + while True: + geom_x = 0 + while bernoulli_neg_exp(tau, self._rng): + geom_x += 1 + + bern_b = self._rng.random() < 0.5 + if bern_b and not geom_x: + continue + + lap_y = int((1 - 2 * bern_b) * geom_x) + bern_c = bernoulli_neg_exp((abs(lap_y) - tau * sigma2) ** 2 / 2 / sigma2, self._rng) + if bern_c: + return value + lap_y + + def _find_scale(self): + """Determine the scale of the mechanism's distribution given epsilon and delta. + """ + if self.sensitivity / self.epsilon == 0: + return 0 + + def objective(sigma, epsilon_, delta_, sensitivity_): + """Function for which we are seeking its root. """ + idx_0 = int(np.floor(epsilon_ * sigma ** 2 / sensitivity_ - sensitivity_ / 2)) + idx_1 = int(np.floor(epsilon_ * sigma ** 2 / sensitivity_ + sensitivity_ / 2)) + idx = 1 + + lhs, rhs, denom = float(idx_0 < 0), 0, 1 + _term, diff = 1, 1 + + while _term > 0 and diff > 0: + _term = np.exp(-idx ** 2 / 2 / sigma ** 2) + + if idx > idx_0: + lhs += _term + + if idx_0 < -idx: + lhs += _term + + if idx > idx_1: + diff = -rhs + rhs += _term + diff += rhs + + denom += 2 * _term + idx += 1 + if idx > 1e6: + raise ValueError("Infinite sum not converging, aborting. 
Try changing the epsilon and/or delta.")
+
+            return (lhs - np.exp(epsilon_) * rhs) / denom - delta_
+
+        epsilon = self.epsilon
+        delta = self.delta
+        sensitivity = self.sensitivity
+
+        # Begin by locating the root within an interval [2**i, 2**(i+1)]
+        guess_0 = 1
+        f_0 = objective(guess_0, epsilon, delta, sensitivity)
+        pwr = 1 if f_0 > 0 else -1
+        guess_1 = 2 ** pwr
+        f_1 = objective(guess_1, epsilon, delta, sensitivity)
+
+        while f_0 * f_1 > 0:
+            guess_0 *= 2 ** pwr
+            guess_1 *= 2 ** pwr
+
+            f_0 = f_1
+            f_1 = objective(guess_1, epsilon, delta, sensitivity)
+
+        # Find the root (sigma) using the bisection method
+        while not np.isclose(guess_0, guess_1, atol=1e-12, rtol=1e-6):
+            guess_mid = (guess_0 + guess_1) / 2
+            f_mid = objective(guess_mid, epsilon, delta, sensitivity)
+
+            if f_mid * f_0 <= 0:
+                f_1 = f_mid
+                guess_1 = guess_mid
+            if f_mid * f_1 <= 0:
+                f_0 = f_mid
+                guess_0 = guess_mid
+
+        return (guess_0 + guess_1) / 2
diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/geometric.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/geometric.py
new file mode 100644
index 0000000..27e87c6
--- /dev/null
+++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/geometric.py
@@ -0,0 +1,233 @@
+# MIT License
+#
+# Copyright (C) IBM Corporation 2019
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+The classic geometric mechanism for differential privacy, and its derivatives.
+"""
+from numbers import Integral
+
+import numpy as np
+
+from diffprivlib.mechanisms.base import DPMechanism, TruncationAndFoldingMixin
+from diffprivlib.utils import copy_docstring
+
+
+class Geometric(DPMechanism):
+    r"""
+    The classic geometric mechanism for differential privacy, as first proposed by Ghosh, Roughgarden and Sundararajan.
+    Extended to allow for non-unity sensitivity.
+
+    Paper link: https://arxiv.org/pdf/0811.2841.pdf
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞].
+
+    sensitivity : int, default: 1
+        The sensitivity of the mechanism. Must be an integer in [0, ∞).
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
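+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); the mechanism operates on integers:
+
+    >>> from diffprivlib.mechanisms import Geometric
+    >>> mech = Geometric(epsilon=0.5, sensitivity=1, random_state=42)
+    >>> noisy = mech.randomise(100)  # an integer in the vicinity of 100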
+
+    """
+    def __init__(self, *, epsilon, sensitivity=1, random_state=None):
+        super().__init__(epsilon=epsilon, delta=0.0, random_state=random_state)
+        self.sensitivity = self._check_sensitivity(sensitivity)
+        self._scale = - self.epsilon / self.sensitivity if self.sensitivity > 0 else - float("inf")
+
+    @classmethod
+    def _check_sensitivity(cls, sensitivity):
+        if not isinstance(sensitivity, Integral):
+            raise TypeError("Sensitivity must be an integer")
+
+        if sensitivity < 0:
+            raise ValueError("Sensitivity must be non-negative")
+
+        return sensitivity
+
+    def _check_all(self, value):
+        super()._check_all(value)
+        self._check_sensitivity(self.sensitivity)
+
+        if not isinstance(value, Integral):
+            raise TypeError("Value to be randomised must be an integer")
+
+        return True
+
+    @classmethod
+    def _check_epsilon_delta(cls, epsilon, delta):
+        if not delta == 0:
+            raise ValueError("Delta must be zero")
+
+        return super()._check_epsilon_delta(epsilon, delta)
+
+    @copy_docstring(DPMechanism.bias)
+    def bias(self, value):
+        return 0.0
+
+    @copy_docstring(DPMechanism.variance)
+    def variance(self, value):
+        self._check_all(value)
+
+        leading_factor = (1 - np.exp(self._scale)) / (1 + np.exp(self._scale))
+        geom_series = np.exp(self._scale) / (1 - np.exp(self._scale))
+
+        return 2 * leading_factor * (geom_series + 3 * (geom_series ** 2) + 2 * (geom_series ** 3))
+
+    def randomise(self, value):
+        """Randomise `value` with the mechanism.
+
+        Parameters
+        ----------
+        value : int
+            The value to be randomised.
+
+        Returns
+        -------
+        int
+            The randomised value.
+
+        """
+        self._check_all(value)
+
+        # Need to account for overlap of 0-value between distributions of different sign
+        unif_rv = self._rng.random() - 0.5
+        unif_rv *= 1 + np.exp(self._scale)
+        sgn = -1 if unif_rv < 0 else 1
+
+        # Use formula for geometric distribution, with ratio of exp(-epsilon/sensitivity)
+        return int(np.round(value + sgn * np.floor(np.log(sgn * unif_rv) / self._scale)))
+
+
+class GeometricTruncated(Geometric, TruncationAndFoldingMixin):
+    r"""
+    The truncated geometric mechanism, where values that fall outside a pre-described range are mapped back to the
+    closest point within the range.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞].
+
+    sensitivity : int, default: 1
+        The sensitivity of the mechanism. Must be an integer in [0, ∞).
+
+    lower : int
+        The lower bound of the mechanism.
+
+    upper : int
+        The upper bound of the mechanism.
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
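+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); outputs are clamped to ``[lower, upper]``:
+
+    >>> from diffprivlib.mechanisms import GeometricTruncated
+    >>> mech = GeometricTruncated(epsilon=0.5, lower=0, upper=120, random_state=42)
+    >>> noisy = mech.randomise(100)  # an integer in [0, 120]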
+
+    """
+    def __init__(self, *, epsilon, sensitivity=1, lower, upper, random_state=None):
+        super().__init__(epsilon=epsilon, sensitivity=sensitivity, random_state=random_state)
+        TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper)
+
+    @classmethod
+    def _check_bounds(cls, lower, upper):
+        if not isinstance(lower, Integral) and abs(lower) != float("inf"):
+            raise TypeError(f"Lower bound must be integer-valued, got {lower}")
+        if not isinstance(upper, Integral) and abs(upper) != float("inf"):
+            raise TypeError(f"Upper bound must be integer-valued, got {upper}")
+
+        return super()._check_bounds(lower, upper)
+
+    @copy_docstring(DPMechanism.bias)
+    def bias(self, value):
+        raise NotImplementedError
+
+    @copy_docstring(DPMechanism.variance)
+    def variance(self, value):
+        raise NotImplementedError
+
+    def _check_all(self, value):
+        super()._check_all(value)
+        TruncationAndFoldingMixin._check_all(self, value)
+
+        return True
+
+    @copy_docstring(Geometric.randomise)
+    def randomise(self, value):
+        self._check_all(value)
+
+        noisy_value = super().randomise(value)
+        return int(np.round(self._truncate(noisy_value)))
+
+
+class GeometricFolded(Geometric, TruncationAndFoldingMixin):
+    r"""
+    The folded geometric mechanism, where values outside a pre-described range are folded back toward the domain around
+    the closest point within the domain. Half-integer bounds are permitted.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞].
+
+    sensitivity : int, default: 1
+        The sensitivity of the mechanism. Must be an integer in [0, ∞).
+
+    lower : int or float
+        The lower bound of the mechanism. Must be integer or half-integer-valued.
+
+    upper : int or float
+        The upper bound of the mechanism. Must be integer or half-integer-valued.
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
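+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); half-integer bounds are permitted:
+
+    >>> from diffprivlib.mechanisms import GeometricFolded
+    >>> mech = GeometricFolded(epsilon=0.5, lower=0.5, upper=99.5, random_state=42)
+    >>> noisy = mech.randomise(50)  # folded back into the domain if the noisy value lands outside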
+
+    """
+    def __init__(self, *, epsilon, sensitivity=1, lower, upper, random_state=None):
+        super().__init__(epsilon=epsilon, sensitivity=sensitivity, random_state=random_state)
+        TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper)
+
+    @classmethod
+    def _check_bounds(cls, lower, upper):
+        if not np.isclose(2 * lower, np.round(2 * lower)) or not np.isclose(2 * upper, np.round(2 * upper)):
+            raise ValueError("Bounds must be integer or half-integer floats")
+
+        return super()._check_bounds(lower, upper)
+
+    def _fold(self, value):
+        return super()._fold(int(np.round(value)))
+
+    @copy_docstring(DPMechanism.bias)
+    def bias(self, value):
+        raise NotImplementedError
+
+    @copy_docstring(DPMechanism.variance)
+    def variance(self, value):
+        raise NotImplementedError
+
+    def _check_all(self, value):
+        super()._check_all(value)
+        TruncationAndFoldingMixin._check_all(self, value)
+
+        return True
+
+    @copy_docstring(Geometric.randomise)
+    def randomise(self, value):
+        self._check_all(value)
+
+        noisy_value = super().randomise(value)
+        return int(np.round(self._fold(noisy_value)))
diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/laplace.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/laplace.py
new file mode 100644
index 0000000..dba20fc
--- /dev/null
+++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/laplace.py
@@ -0,0 +1,1440 @@
+# MIT License
+#
+# Copyright (C) IBM Corporation 2019
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+The classic Laplace mechanism in differential privacy, and its derivatives.
+"""
+from numbers import Real
+
+import numpy as np
+
+from diffprivlib.mechanisms.base import DPMechanism, TruncationAndFoldingMixin
+from diffprivlib.utils import copy_docstring
+
+
+class Laplace(DPMechanism):
+    r"""
+    The classical Laplace mechanism in differential privacy.
+
+    First proposed by Dwork, McSherry, Nissim and Smith [DMNS16]_, with support for (relaxed)
+    :math:`(\epsilon,\delta)`-differential privacy [HLM15]_.
+
+    Samples from the Laplace distribution are generated using 4 uniform variates, as detailed in [HB21]_, to prevent
+    against reconstruction attacks due to limited floating point precision.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞].
+
+    delta : float, default: 0.0
+        Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with
+        ``epsilon``.
+
+    sensitivity : float
+        The sensitivity of the mechanism. Must be in [0, ∞).
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
+
+    References
+    ----------
+    .. [DMNS16] Dwork, Cynthia, Frank McSherry, Kobbi Nissim, and Adam Smith. "Calibrating noise to sensitivity in
+        private data analysis." Journal of Privacy and Confidentiality 7, no. 3 (2016): 17-51.
+
+    .. [HLM15] Holohan, Naoise, Douglas J. Leith, and Oliver Mason. "Differential privacy in metric spaces: Numerical,
+        categorical and functional data under the one roof." Information Sciences 305 (2015): 256-268.
+
+    .. [HB21] Holohan, Naoise, and Stefano Braghin. "Secure Random Sampling in Differential Privacy." arXiv preprint
+        arXiv:2107.10138 (2021).
+
+    """
+    def __init__(self, *, epsilon, delta=0.0, sensitivity, random_state=None):
+        super().__init__(epsilon=epsilon, delta=delta, random_state=random_state)
+        self.sensitivity = self._check_sensitivity(sensitivity)
+        self._scale = None
+
+    @classmethod
+    def _check_sensitivity(cls, sensitivity):
+        if not isinstance(sensitivity, Real):
+            raise TypeError("Sensitivity must be numeric")
+
+        if sensitivity < 0:
+            raise ValueError("Sensitivity must be non-negative")
+
+        return float(sensitivity)
+
+    def _check_all(self, value):
+        super()._check_all(value)
+        self._check_sensitivity(self.sensitivity)
+
+        if not isinstance(value, Real):
+            raise TypeError("Value to be randomised must be a number")
+
+        return True
+
+    def bias(self, value):
+        """Returns the bias of the mechanism at a given `value`.
+
+        Parameters
+        ----------
+        value : int or float
+            The value at which the bias of the mechanism is sought.
+
+        Returns
+        -------
+        bias : float or None
+            The bias of the mechanism at `value`.
+
+        """
+        return 0.0
+
+    def variance(self, value):
+        """Returns the variance of the mechanism at a given `value`.
+
+        Parameters
+        ----------
+        value : float
+            The value at which the variance of the mechanism is sought.
+
+        Returns
+        -------
+        variance : float
+            The variance of the mechanism at `value`.
+
+        """
+        self._check_all(0)
+
+        return 2 * (self.sensitivity / (self.epsilon - np.log(1 - self.delta))) ** 2
+
+    @staticmethod
+    def _laplace_sampler(unif1, unif2, unif3, unif4):
+        return np.log(1 - unif1) * np.cos(np.pi * unif2) + np.log(1 - unif3) * np.cos(np.pi * unif4)
+
+    def nonnegativity(self, array, rho=0.0001, T=30):
+        """Enforces non-negativity on an array of noisy values, gradually redistributing negative mass.
+
+        In each round, the absolute sum of the negative entries is divided evenly among the positive entries
+        (the "height"), that height is subtracted from each positive entry, and the negative entries are set to
+        zero. The loop stops after `T` rounds or once the absolute sum of negative entries falls below `rho`;
+        any remaining negative entries are then set to zero.
+
+        Parameters
+        ----------
+        array : list or array-like
+            The input array of noisy values.
+        rho : float, default: 0.0001
+            The threshold for the absolute sum of negative entries.
+        T : int, default: 30
+            The maximum number of rounds to perform.
+
+        Returns
+        -------
+        array-like
+            The modified array with non-negative values.
+
+        """
+        round_count = 0
+
+        while round_count < T and abs(sum(x for x in array if x < 0)) > rho:
+            round_count += 1
+
+            negative_sum = sum(abs(x) for x in array if x < 0)
+            positive_count = sum(1 for x in array if x > 0)
+
+            try:
+                height = negative_sum / positive_count
+            except ZeroDivisionError:
+                height = 0
+
+            # Subtract the redistributed mass from the positive entries and zero out the negative ones
+            for i in range(len(array)):
+                if array[i] > 0:
+                    array[i] -= height
+                elif array[i] < 0:
+                    array[i] = 0
+
+        # Set any remaining negative values to 0
+        for i in range(len(array)):
+            if array[i] < 0:
+                array[i] = 0
+
+        return array
+
+    def randomise(self, values):
+        """Randomise `values` with the mechanism.
+
+        Parameters
+        ----------
+        values : list or array-like
+            The values to be randomised.
+
+        Returns
+        -------
+        array-like
+            The array of randomised values, with non-negativity enforced.
+
+        """
+        for value in values:
+            self._check_all(value)
+
+        result_values = []
+
+        for value in values:
+            scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta))
+            standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(),
+                                                     self._rng.random())
+            result_values.append(value - scale * standard_laplace)
+
+        # The noisy values are post-processed to enforce non-negativity
+        return self.nonnegativity(result_values)
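+
+
+# A minimal usage sketch of the list-based ``Laplace.randomise`` above (illustrative values; assumes the class is
+# exposed at ``diffprivlib.mechanisms``):
+#
+#     >>> from diffprivlib.mechanisms import Laplace
+#     >>> mech = Laplace(epsilon=1.0, sensitivity=1.0)
+#     >>> noisy = mech.randomise([3.0, 0.0, 7.0])  # Laplace noise added, then non-negativity enforced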
+
+
+class LaplaceTruncated(Laplace, TruncationAndFoldingMixin):
+    r"""
+    The truncated Laplace mechanism, where values outside a pre-described domain are mapped to the closest point
+    within the domain.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞].
+
+    delta : float, default: 0.0
+        Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with
+        ``epsilon``.
+
+    sensitivity : float
+        The sensitivity of the mechanism. Must be in [0, ∞).
+
+    lower : float
+        The lower bound of the mechanism.
+
+    upper : float
+        The upper bound of the mechanism.
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
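+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed). In this modified version, ``randomise`` accepts a list
+    of values and enforces non-negativity instead of truncating to ``[lower, upper]``:
+
+    >>> from diffprivlib.mechanisms import LaplaceTruncated
+    >>> mech = LaplaceTruncated(epsilon=1.0, sensitivity=1.0, lower=0.0, upper=100.0, random_state=42)
+    >>> noisy = mech.randomise([10.0, 25.0, 60.0])  # a list of non-negative noisy values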
+
+    """
+    def __init__(self, *, epsilon, delta=0.0, sensitivity, lower, upper, random_state=None):
+        super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state)
+        TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper)
+
+    @copy_docstring(Laplace.bias)
+    def bias(self, value):
+        self._check_all(value)
+
+        shape = self.sensitivity / self.epsilon
+        return shape / 2 * (np.exp((self.lower - value) / shape) - np.exp((value - self.upper) / shape))
+
+    @copy_docstring(Laplace.variance)
+    def variance(self, value):
+        self._check_all(value)
+
+        shape = self.sensitivity / self.epsilon
+
+        variance = value ** 2 + shape * (self.lower * np.exp((self.lower - value) / shape)
+                                         - self.upper * np.exp((value - self.upper) / shape))
+        variance += (shape ** 2) * (2 - np.exp((self.lower - value) / shape)
+                                    - np.exp((value - self.upper) / shape))
+
+        variance -= (self.bias(value) + value) ** 2
+
+        return variance
+
+    def _check_all(self, value):
+        Laplace._check_all(self, value)
+        TruncationAndFoldingMixin._check_all(self, value)
+
+        return True
+
+    def randomise(self, values):
+        """Randomise a list of values with the mechanism.
+
+        Parameters
+        ----------
+        values : list or array-like
+            The values to be randomised.
+
+        Returns
+        -------
+        array-like
+            The array of randomised values.
+
+        """
+        for value in values:
+            self._check_all(value)
+
+        # Note: in this modified version the noisy values are returned after non-negativity enforcement in the
+        # superclass, without truncation to [lower, upper].
+        return super().randomise(values)
+
+
+class LaplaceFolded(Laplace, TruncationAndFoldingMixin):
+    r"""
+    The folded Laplace mechanism, where values outside a pre-described domain are folded around the domain until they
+    fall within.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞].
+
+    delta : float, default: 0.0
+        Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with
+        ``epsilon``.
+
+    sensitivity : float
+        The sensitivity of the mechanism. Must be in [0, ∞).
+
+    lower : float
+        The lower bound of the mechanism.
+
+    upper : float
+        The upper bound of the mechanism.
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
+
+    """
+    def __init__(self, *, epsilon, delta=0.0, sensitivity, lower, upper, random_state=None):
+        super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state)
+        TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper)
+
+    @copy_docstring(Laplace.bias)
+    def bias(self, value):
+        self._check_all(value)
+
+        shape = self.sensitivity / self.epsilon
+
+        bias = shape * (np.exp((self.lower + self.upper - 2 * value) / shape) - 1)
+        bias /= np.exp((self.lower - value) / shape) + np.exp((self.upper - value) / shape)
+
+        return bias
+
+    @copy_docstring(DPMechanism.variance)
+    def variance(self, value):
+        raise NotImplementedError
+
+    def _check_all(self, value):
+        super()._check_all(value)
+        TruncationAndFoldingMixin._check_all(self, value)
+
+        return True
+
+    @copy_docstring(Laplace.randomise)
+    def randomise(self, values):
+        # Adapted to the list-based ``randomise`` of the modified Laplace class above, which returns a list of
+        # noisy values; each noisy value is then folded back into the domain.
+        for value in values:
+            self._check_all(value)
+
+        noisy_values = super().randomise(values)
+        return [self._fold(noisy_value) for noisy_value in noisy_values]
+
+
+class LaplaceBoundedDomain(LaplaceTruncated):
+    r"""
+    The bounded Laplace mechanism on a bounded domain. The mechanism draws values directly from the domain using
+    rejection sampling, without any post-processing [HABM20]_.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞].
+
+    delta : float, default: 0.0
+        Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with
+        ``epsilon``.
+
+    sensitivity : float
+        The sensitivity of the mechanism. Must be in [0, ∞).
+
+    lower : float
+        The lower bound of the mechanism.
+
+    upper : float
+        The upper bound of the mechanism.
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
+
+    References
+    ----------
+    .. [HABM20] Holohan, Naoise, Spiros Antonatos, Stefano Braghin, and Pól Mac Aonghusa. "The Bounded Laplace Mechanism
+        in Differential Privacy." Journal of Privacy and Confidentiality 10, no. 1 (2020).
+
+    """
+    def _find_scale(self):
+        eps = self.epsilon
+        delta = self.delta
+        diam = self.upper - self.lower
+        delta_q = self.sensitivity
+
+        def _delta_c(shape):
+            if shape == 0:
+                return 2.0
+            return (2 - np.exp(- delta_q / shape) - np.exp(- (diam - delta_q) / shape)) / (1 - np.exp(- diam / shape))
+
+        def _f(shape):
+            return delta_q / (eps - np.log(_delta_c(shape)) - np.log(1 - delta))
+
+        left = delta_q / (eps - np.log(1 - delta))
+        right = _f(left)
+        old_interval_size = (right - left) * 2
+
+        while old_interval_size > right - left:
+            old_interval_size = right - left
+            middle = (right + left) / 2
+
+            if _f(middle) >= middle:
+                left = middle
+            if _f(middle) <= middle:
+                right = middle
+
+        return (right + left) / 2
+
+    def effective_epsilon(self):
+        r"""Gets the effective epsilon of the mechanism, only for strict :math:`\epsilon`-differential privacy.
+        Returns ``None`` if :math:`\delta` is non-zero.
+
+        Returns
+        -------
+        float
+            The effective :math:`\epsilon` parameter of the mechanism. Returns ``None`` if `delta` is non-zero.
+
+        """
+        if self._scale is None:
+            self._scale = self._find_scale()
+
+        if self.delta > 0.0:
+            return None
+
+        return self.sensitivity / self._scale
+
+    @copy_docstring(Laplace.bias)
+    def bias(self, value):
+        self._check_all(value)
+
+        if self._scale is None:
+            self._scale = self._find_scale()
+
+        bias = (self._scale - self.lower + value) / 2 * np.exp((self.lower - value) / self._scale) \
+            - (self._scale + self.upper - value) / 2 * np.exp((value - self.upper) / self._scale)
+        bias /= 1 - np.exp((self.lower - value) / self._scale) / 2 \
+            - np.exp((value - self.upper) / self._scale) / 2
+
+        return bias
+
+    @copy_docstring(Laplace.variance)
+    def variance(self, value):
+        self._check_all(value)
+
+        if self._scale is None:
+            self._scale = self._find_scale()
+
+        variance = value**2
+        variance -= (np.exp((self.lower - value) / self._scale) * (self.lower ** 2)
+                     + np.exp((value - self.upper) / self._scale) * (self.upper ** 2)) / 2
+        variance += self._scale * (self.lower * np.exp((self.lower - value) / self._scale)
+                                   - self.upper * np.exp((value - self.upper) / self._scale))
+        variance += (self._scale ** 2) * (2 - np.exp((self.lower - value) / self._scale)
+                                          - np.exp((value - self.upper) / self._scale))
+        variance /= 1 - (np.exp(-(value - self.lower) / self._scale)
+                         + np.exp(-(self.upper - value) / self._scale)) / 2
+
+        variance -= (self.bias(value) + value) ** 2
+
+        return variance
+
+    @copy_docstring(Laplace.randomise)
+    def randomise(self, value):
+        self._check_all(value)
+
+        if self._scale is None:
+            self._scale = self._find_scale()
+
+        value = max(min(value, self.upper), self.lower)
+        if np.isnan(value):
+            return float("nan")
+
+        samples = 1
+
+        while True:
+            try:
+                unif = self._rng.random(4 * samples)
+            except TypeError:  # rng is secrets.SystemRandom
+                unif = [self._rng.random() for _ in range(4 * samples)]
+            noisy = value + self._scale * self._laplace_sampler(*np.array(unif).reshape(4, -1))
+
+            if ((noisy >= self.lower) & (noisy <= self.upper)).any():
+                idx = np.argmax((noisy >= self.lower) & (noisy <= self.upper))
+                return noisy[idx]
+            samples = min(100000, samples * 2)
+
+
+class LaplaceBoundedNoise(Laplace):
+    r"""
+    The Laplace mechanism with bounded noise, only applicable for approximate differential privacy (delta > 0)
+    [GDGK18]_.
+
+    Epsilon must be strictly positive, `epsilon` > 0. `delta` must be strictly in the interval (0, 0.5).
+    - For zero `epsilon`, use :class:`.Uniform`.
+    - For zero `delta`, use :class:`.Laplace`.
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞].
+
+    delta : float
+        Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 0.5).
+
+    sensitivity : float
+        The sensitivity of the mechanism. Must be in [0, ∞).
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
+
+    References
+    ----------
+    .. [GDGK18] Geng, Quan, Wei Ding, Ruiqi Guo, and Sanjiv Kumar. "Truncated Laplacian Mechanism for Approximate
+        Differential Privacy." arXiv preprint arXiv:1810.00877v1 (2018).
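+
+    Examples
+    --------
+    A minimal usage sketch (the import path is assumed); the added noise is bounded, so outputs stay within a
+    fixed distance of the input:
+
+    >>> from diffprivlib.mechanisms import LaplaceBoundedNoise
+    >>> mech = LaplaceBoundedNoise(epsilon=1.0, delta=0.1, sensitivity=1.0, random_state=42)
+    >>> noisy = mech.randomise(0.0)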
+
+    """
+    def __init__(self, *, epsilon, delta, sensitivity, random_state=None):
+        super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state)
+        self._noise_bound = None
+
+    @classmethod
+    def _check_epsilon_delta(cls, epsilon, delta):
+        if epsilon == 0:
+            raise ValueError("Epsilon must be strictly positive. For zero epsilon, use :class:`.Uniform`.")
+
+        if isinstance(delta, Real) and not 0 < delta < 0.5:
+            raise ValueError("Delta must be strictly in the interval (0,0.5). For zero delta, use :class:`.Laplace`.")
+
+        return super()._check_epsilon_delta(epsilon, delta)
+
+    @copy_docstring(Laplace.bias)
+    def bias(self, value):
+        return 0.0
+
+    @copy_docstring(DPMechanism.variance)
+    def variance(self, value):
+        raise NotImplementedError
+
+    @copy_docstring(Laplace.randomise)
+    def randomise(self, value):
+        self._check_all(value)
+
+        if self._scale is None or self._noise_bound is None:
+            self._scale = self.sensitivity / self.epsilon
+            self._noise_bound = 0 if self._scale == 0 else \
+                self._scale * np.log(1 + (np.exp(self.epsilon) - 1) / 2 / self.delta)
+
+        if np.isnan(value):
+            return float("nan")
+
+        samples = 1
+
+        while True:
+            try:
+                unif = self._rng.random(4 * samples)
+            except TypeError:  # rng is secrets.SystemRandom
+                unif = [self._rng.random() for _ in range(4 * samples)]
+            noisy = self._scale * self._laplace_sampler(*np.array(unif).reshape(4, -1))
+
+            if ((noisy >= - self._noise_bound) & (noisy <= self._noise_bound)).any():
+                idx = np.argmax((noisy >= - self._noise_bound) & (noisy <= self._noise_bound))
+                return value + noisy[idx]
+            samples = min(100000, samples * 2)
+ +# Samples from the Laplace distribution are generated using 4 uniform variates, as detailed in [HB21]_, to prevent +# against reconstruction attacks due to limited floating point precision. + +# Parameters +# ---------- +# epsilon : float +# Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + +# delta : float, default: 0.0 +# Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with +# ``epsilon``. + +# sensitivity : float +# The sensitivity of the mechanism. Must be in [0, ∞). + +# random_state : int or RandomState, optional +# Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, +# ``random_state`` has to be fixed to an integer. + +# References +# ---------- +# .. [DMNS16] Dwork, Cynthia, Frank McSherry, Kobbi Nissim, and Adam Smith. "Calibrating noise to sensitivity in +# private data analysis." Journal of Privacy and Confidentiality 7, no. 3 (2016): 17-51. + +# .. [HLM15] Holohan, Naoise, Douglas J. Leith, and Oliver Mason. "Differential privacy in metric spaces: Numerical, +# categorical and functional data under the one roof." Information Sciences 305 (2015): 256-268. + +# .. [HB21] Holohan, Naoise, and Stefano Braghin. "Secure Random Sampling in Differential Privacy." arXiv preprint +# arXiv:2107.10138 (2021). + +# """ +# def __init__(self, *, epsilon, delta=0.0, sensitivity, random_state=None): +# super().__init__(epsilon=epsilon, delta=delta, random_state=random_state) +# self.sensitivity = self._check_sensitivity(sensitivity) +# self._scale = None + +# @classmethod +# def _check_sensitivity(cls, sensitivity): +# if not isinstance(sensitivity, Real): +# raise TypeError("Sensitivity must be numeric") + +# if sensitivity < 0: +# raise ValueError("Sensitivity must be non-negative") + +# return float(sensitivity) + +# def _check_all(self, value): +# super()._check_all(value) +# self._check_sensitivity(self.sensitivity) + +# if not isinstance(value, Real): +# raise TypeError("Value to be randomised must be a number") + +# return True + +# def bias(self, value): +# """Returns the bias of the mechanism at a given `value`. + +# Parameters +# ---------- +# value : int or float +# The value at which the bias of the mechanism is sought. + +# Returns +# ------- +# bias : float or None +# The bias of the mechanism at `value`. + +# """ +# return 0.0 + +# def variance(self, value): +# """Returns the variance of the mechanism at a given `value`. + +# Parameters +# ---------- +# value : float +# The value at which the variance of the mechanism is sought. + +# Returns +# ------- +# bias : float +# The variance of the mechanism at `value`. + +# """ +# self._check_all(0) + +# return 2 * (self.sensitivity / (self.epsilon - np.log(1 - self.delta))) ** 2 + +# @staticmethod +# def _laplace_sampler(unif1, unif2, unif3, unif4): +# return np.log(1 - unif1) * np.cos(np.pi * unif2) + np.log(1 - unif3) * np.cos(np.pi * unif4) + +# # def randomise(self, value): +# # """Randomise `value` with the mechanism. + +# # Parameters +# # ---------- +# # value : float +# # The value to be randomised. + +# # Returns +# # ------- +# # float +# # The randomised value. 
+ +# # """ +# # self._check_all(value) +# # print('[super.randomize] Inside the laplace class randomize, and Value: {}'.format(value)) + +# # scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta)) +# # print('[super.randomize] Inside the laplace class randomize, and Scale: {}'.format(scale)) +# # standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), +# # self._rng.random()) +# # print('[super.randomize] Inside the laplace class randomize, and Standard Laplace: {}'.format(standard_laplace)) +# # print('[super.randomize] returns the new randomized value as: {}'.format(value - scale * standard_laplace)) +# # noisy_value = value - scale * standard_laplace +# # return value - scale * standard_laplace + +# def nonnegativity(self, array, rho=0.0001, T=30): +# """Enforces non-negativity on an array of integers, gradually reducing negative values. + +# Args: +# array: The input array of integers. +# rho: The threshold for absolute sum of negative numbers (default: 0.0001). +# T: The maximum number of rounds to perform (default: 30). + +# Returns: +# The modified array with non-negative values. +# """ + +# round_count = 0 +# print("\n\n----START---- [non-negativity] Initial array before non-negativity enforcement is:\n", array,"\n") +# while round_count < T and abs(sum(x for x in array if x < 0)) > rho: +# print(f"\n\n==============================[non-negativity] Inside the non-negativity enforcement loop. Round count is {round_count} :====================================") +# round_count += 1 +# for i in range(len(array)): +# if array[i] < 0: +# print(f"\n[non-negativity] --- negative value found at position {i}:", array[i]) +# else: +# print(f"\n[non-negativity] +++ positive value found at position {i}:", array[i]) +# print("\n\n") +# negative_sum = sum(abs(x) for x in array if x < 0) +# print("\n[non-negativity] Absolute Sum of negative numbers above is:", negative_sum) +# positive_count = sum(1 for x in array if x > 0) # Count positive values using generator expression +# print("\n[non-negativity] Count of positive numbers above is:", positive_count) + +# try: +# height = negative_sum / positive_count # Handle potential division by zero +# except ZeroDivisionError: +# height = 0 + +# print(f"\n[non-negativity] Height for round no.{round_count} is:", height) +# print("\n\n") +# for i in range(len(array)): +# if array[i] > 0: +# print(f"\n[non-negativity] +++ positive value at position {i}:", array[i]) +# array[i] -= height +# print(f"\n[non-negativity] +++ new value at position {i} after subtraction of height :", array[i]) +# elif array[i] < 0: +# print(f"\n[non-negativity] --- negative value at position {i}:", array[i]) +# array[i] = 0 +# print(f"\n[non-negativity] --- new value at position {i} after setting to 0 :", array[i]) + + +# print("\n\n[non-negativity] One of the 3 main conditions(mostly, No negative numbers found..!!) is not satisfied, so we are setting any of the remaining negative values to 0.\n") + +# # Set any remaining negative values to 0 +# for i in range(len(array)): +# if array[i] < 0: +# print(f"\n[non-negativity] --- previous negative value at position {i}:", array[i]) +# array[i] = 0 +# print(f"\n[non-negativity] --- new value at position {i} after setting to 0 :", array[i]) + +# print("\n----END---- [non-negativity] Final array after non-negativity enforcement is:\n", array,"\n") +# return array + + +# def randomise(self, values): +# """Randomise `values` with the mechanism. 
+ +# Parameters +# ---------- +# values : list or array-like +# The values to be randomised. + +# Returns +# ------- +# array-like +# The array of randomised values. + +# """ +# result_values = [] +# for value in values: +# self._check_all(value) + +# print('[super.randomize] Inside the laplace class randomize, and Values: {}'.format(values)) + +# for value in values: +# scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta)) +# print('[super.randomize] Inside the laplace class randomize, and Scale: {}'.format(scale)) +# standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), +# self._rng.random()) +# print('[super.randomize] Inside the laplace class randomize, and Standard Laplace: {}'.format(standard_laplace)) +# noisy_value = value - scale * standard_laplace +# print('[super.randomize] returns the new randomized value as: {}'.format(noisy_value)) +# result_values.append(noisy_value) + +# print('\n[super.randomize] returns the new randomized values as: {}'.format(result_values)) +# #return result_values +# print("\n[super.randomize] The noisy values are sent for non-negativity enforcing..!") +# noisy_values = self.nonnegativity(result_values) +# return noisy_values + + + + + +# # def randomise(self, values, threshold=0.3, T=30): +# # """Randomise a list of values with the mechanism while enforcing non-negativity. + +# # Parameters +# # ---------- +# # values : list or array-like +# # The values to be randomised. +# # threshold : float, optional +# # The threshold for stopping the non-negativity loop. Default is 0.3. +# # T : int, optional +# # The maximum number of rounds for non-negativity. Default is 30. + +# # Returns +# # ------- +# # array-like +# # The array of randomised values. + +# # """ +# # # Check all values in the list +# # for value in values: +# # self._check_all(value) + +# # print('[super.randomize] Inside the laplace class randomize, and Values: {}'.format(values)) + +# # scale = self.sensitivity / (self.epsilon - np.log(1 - self.delta)) +# # print('[super.randomize] Inside the laplace class randomize, and Scale: {}'.format(scale)) + +# # sum_negative = 0 # Variable to store the sum of negative numbers +# # num_positive = 0 # Variable to count the number of positive numbers +# # T_counter = 0 # Variable to count the number of rounds of non-negativity + +# # while True: +# # randomized_values = [] # List to store the randomised values + +# # for value in values: +# # standard_laplace = self._laplace_sampler(self._rng.random(), self._rng.random(), self._rng.random(), +# # self._rng.random()) +# # randomized_value = value - scale * standard_laplace + +# # if randomized_value < 0: +# # sum_negative += abs(randomized_value) +# # num_positive += 1 +# # randomized_value = 0 # Set negative value to 0 for now + +# # randomized_values.append(randomized_value) + +# # if sum_negative / num_positive > threshold or T_counter >= T: +# # return np.maximum(0, randomized_values) # If threshold is exceeded or T rounds reached, return values with negative values set to 0 +# # else: +# # print("[super.randomize] SUM_negative: {}, NUM_positive: {}".format(sum_negative, num_positive)) +# # values = np.maximum(0, randomized_values - sum_negative / num_positive) # Adjust values for non-negativity +# # sum_negative = 0 # Reset sum_negative for the next round +# # num_positive = 0 # Reset num_positive for the next round +# # T_counter += 1 +# # return randomized_values + + + +# class LaplaceTruncated(Laplace, TruncationAndFoldingMixin): +# 
r""" +# The truncated Laplace mechanism, where values outside a pre-described domain are mapped to the closest point +# within the domain. + +# Parameters +# ---------- +# epsilon : float +# Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + +# delta : float, default: 0.0 +# Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with +# ``epsilon``. + +# sensitivity : float +# The sensitivity of the mechanism. Must be in [0, ∞). + +# lower : float +# The lower bound of the mechanism. + +# upper : float +# The upper bound of the mechanism. + +# random_state : int or RandomState, optional +# Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, +# ``random_state`` has to be fixed to an integer. + +# """ +# def __init__(self, *, epsilon, delta=0.0, sensitivity, lower, upper, random_state=None): +# super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state) +# TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper) + +# @copy_docstring(Laplace.bias) +# def bias(self, value): +# self._check_all(value) + +# shape = self.sensitivity / self.epsilon +# print('@@@@ Inside the laplace truncated class, and Shape: {}'.format(shape)) +# print('@@@@ Inside the laplace truncated class, and Value: {}'.format(value)) +# print('@@@@ Inside the laplace truncated class, and Lower: {}'.format(self.lower)) +# print('@@@@ Inside the laplace truncated class, and Upper: {}'.format(self.upper)) +# print('@@@@ Inside the laplace truncated class, and returns ----: {}',shape / 2 * (np.exp((self.lower - value) / shape) - np.exp((value - self.upper) / shape))) +# return shape / 2 * (np.exp((self.lower - value) / shape) - np.exp((value - self.upper) / shape)) + +# @copy_docstring(Laplace.variance) +# def variance(self, value): +# self._check_all(value) + +# shape = self.sensitivity / self.epsilon + +# variance = value ** 2 + shape * (self.lower * np.exp((self.lower - value) / shape) +# - self.upper * np.exp((value - self.upper) / shape)) +# variance += (shape ** 2) * (2 - np.exp((self.lower - value) / shape) +# - np.exp((value - self.upper) / shape)) + +# variance -= (self.bias(value) + value) ** 2 + +# return variance + +# def _check_all(self, value): +# Laplace._check_all(self, value) +# TruncationAndFoldingMixin._check_all(self, value) + +# return True + +# @copy_docstring(Laplace.randomise) +# # def randomise(self, value): +# # print('----START(randomize)---- Inside the laplace truncated randomize class, and original value is: {}'.format(value)) +# # self._check_all(value) +# # #print('@@@@ Inside the laplace truncated randomize class, and Value: {}'.format(value)) +# # noisy_value = super().randomise(value) +# # print('----END(randomize)---- The noisy value returned after randomizing the original value is: {}'.format(noisy_value)) +# # print('@@@@ Inside the laplace truncated randomize class, and returns-------------->>>>>>>: ',self._truncate(noisy_value)) +# # return self._truncate(noisy_value) + + +# def randomise(self, values): +# """Randomise a list of values with the mechanism. + +# Parameters +# ---------- +# values : list or array-like +# The values to be randomised. + +# Returns +# ------- +# array-like +# The array of randomised values. 
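As a quick numeric check on the truncated-Laplace bias formula above (illustrative values, not library code): near the lower bound, truncation can only push mass upward, so the bias is positive; it vanishes at the midpoint of a symmetric domain and turns negative near the upper bound.

import numpy as np

epsilon, sensitivity, lower, upper = 1.0, 1.0, 0.0, 10.0
shape = sensitivity / epsilon
for value in (0.5, 5.0, 9.5):
    bias = shape / 2 * (np.exp((lower - value) / shape) - np.exp((value - upper) / shape))
    print(value, bias)  # ~0.30, ~0.0, ~-0.30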
+ +# """ +# print('----START(randomize)---- Inside the laplace truncated randomize class, and original values are: {}'.format(values)) + +# # Ensure all values in the list pass the check +# for value in values: +# self._check_all(value) + +# # Send the entire list of values for randomization to the superclass +# noisy_values = super().randomise(values) + +# print('----END(randomize)---- The noisy values returned after randomizing the original values are: {}'.format(noisy_values)) +# #print('@@@@ Inside the laplace truncated randomize class, and returns-------------->>>>>>>: ', self._truncate(noisy_values)) + +# return noisy_values +# #return self._truncate(noisy_values) + + + +# class LaplaceFolded(Laplace, TruncationAndFoldingMixin): +# r""" +# The folded Laplace mechanism, where values outside a pre-described domain are folded around the domain until they +# fall within. + +# Parameters +# ---------- +# epsilon : float +# Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + +# delta : float, default: 0.0 +# Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with +# ``epsilon``. + +# sensitivity : float +# The sensitivity of the mechanism. Must be in [0, ∞). + +# lower : float +# The lower bound of the mechanism. + +# upper : float +# The upper bound of the mechanism. + +# random_state : int or RandomState, optional +# Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, +# ``random_state`` has to be fixed to an integer. + +# """ +# def __init__(self, *, epsilon, delta=0.0, sensitivity, lower, upper, random_state=None): +# super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state) +# TruncationAndFoldingMixin.__init__(self, lower=lower, upper=upper) + +# @copy_docstring(Laplace.bias) +# def bias(self, value): +# self._check_all(value) + +# shape = self.sensitivity / self.epsilon + +# bias = shape * (np.exp((self.lower + self.upper - 2 * value) / shape) - 1) +# bias /= np.exp((self.lower - value) / shape) + np.exp((self.upper - value) / shape) + +# return bias + +# @copy_docstring(DPMechanism.variance) +# def variance(self, value): +# raise NotImplementedError + +# def _check_all(self, value): +# super()._check_all(value) +# TruncationAndFoldingMixin._check_all(self, value) + +# return True + +# @copy_docstring(Laplace.randomise) +# def randomise(self, value): +# self._check_all(value) + +# noisy_value = super().randomise(value) +# return self._fold(noisy_value) + + + +# class LaplaceBoundedDomain(LaplaceTruncated): +# r""" +# The bounded Laplace mechanism on a bounded domain. The mechanism draws values directly from the domain using +# rejection sampling, without any post-processing [HABM20]_. + +# Parameters +# ---------- +# epsilon : float +# Privacy parameter :math:`\epsilon` for the mechanism. Must be in [0, ∞]. + +# delta : float, default: 0.0 +# Privacy parameter :math:`\delta` for the mechanism. Must be in [0, 1]. Cannot be simultaneously zero with +# ``epsilon``. + +# sensitivity : float +# The sensitivity of the mechanism. Must be in [0, ∞). + +# lower : float +# The lower bound of the mechanism. + +# upper : float +# The upper bound of the mechanism. + +# random_state : int or RandomState, optional +# Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, +# ``random_state`` has to be fixed to an integer. + +# References +# ---------- +# .. 
[HABM20] Holohan, Naoise, Spiros Antonatos, Stefano Braghin, and Pól Mac Aonghusa. "The Bounded Laplace Mechanism +# in Differential Privacy." Journal of Privacy and Confidentiality 10, no. 1 (2020). + +# """ +# def _find_scale(self): +# eps = self.epsilon +# delta = self.delta +# diam = self.upper - self.lower +# delta_q = self.sensitivity + +# def _delta_c(shape): +# if shape == 0: +# return 2.0 +# return (2 - np.exp(- delta_q / shape) - np.exp(- (diam - delta_q) / shape)) / (1 - np.exp(- diam / shape)) + +# def _f(shape): +# return delta_q / (eps - np.log(_delta_c(shape)) - np.log(1 - delta)) + +# left = delta_q / (eps - np.log(1 - delta)) +# right = _f(left) +# old_interval_size = (right - left) * 2 + +# while old_interval_size > right - left: +# old_interval_size = right - left +# middle = (right + left) / 2 + +# if _f(middle) >= middle: +# left = middle +# if _f(middle) <= middle: +# right = middle + +# return (right + left) / 2 + +# def effective_epsilon(self): +# r"""Gets the effective epsilon of the mechanism, only for strict :math:`\epsilon`-differential privacy. Returns +# ``None`` if :math:`\delta` is non-zero. + +# Returns +# ------- +# float +# The effective :math:`\epsilon` parameter of the mechanism. Returns ``None`` if `delta` is non-zero. + +# """ +# if self._scale is None: +# self._scale = self._find_scale() + +# if self.delta > 0.0: +# return None + +# return self.sensitivity / self._scale + +# @copy_docstring(Laplace.bias) +# def bias(self, value): +# self._check_all(value) + +# if self._scale is None: +# self._scale = self._find_scale() + +# bias = (self._scale - self.lower + value) / 2 * np.exp((self.lower - value) / self._scale) \ +# - (self._scale + self.upper - value) / 2 * np.exp((value - self.upper) / self._scale) +# bias /= 1 - np.exp((self.lower - value) / self._scale) / 2 \ +# - np.exp((value - self.upper) / self._scale) / 2 + +# return bias + +# @copy_docstring(Laplace.variance) +# def variance(self, value): +# self._check_all(value) + +# if self._scale is None: +# self._scale = self._find_scale() + +# variance = value**2 +# variance -= (np.exp((self.lower - value) / self._scale) * (self.lower ** 2) +# + np.exp((value - self.upper) / self._scale) * (self.upper ** 2)) / 2 +# variance += self._scale * (self.lower * np.exp((self.lower - value) / self._scale) +# - self.upper * np.exp((value - self.upper) / self._scale)) +# variance += (self._scale ** 2) * (2 - np.exp((self.lower - value) / self._scale) +# - np.exp((value - self.upper) / self._scale)) +# variance /= 1 - (np.exp(-(value - self.lower) / self._scale) +# + np.exp(-(self.upper - value) / self._scale)) / 2 + +# variance -= (self.bias(value) + value) ** 2 + +# return variance + +# @copy_docstring(Laplace.randomise) +# def randomise(self, value): +# self._check_all(value) + +# if self._scale is None: +# self._scale = self._find_scale() + +# value = max(min(value, self.upper), self.lower) +# if np.isnan(value): +# return float("nan") + +# samples = 1 + +# while True: +# try: +# unif = self._rng.random(4 * samples) +# except TypeError: # rng is secrets.SystemRandom +# unif = [self._rng.random() for _ in range(4 * samples)] +# noisy = value + self._scale * self._laplace_sampler(*np.array(unif).reshape(4, -1)) + +# if ((noisy >= self.lower) & (noisy <= self.upper)).any(): +# idx = np.argmax((noisy >= self.lower) & (noisy <= self.upper)) +# return noisy[idx] +# samples = min(100000, samples * 2) + + +# class LaplaceBoundedNoise(Laplace): +# r""" +# The Laplace mechanism with bounded noise, only applicable for 
approximate differential privacy (delta > 0) +# [GDGK18]_. + +# Epsilon must be strictly positive, `epsilon` > 0. `delta` must be strictly in the interval (0, 0.5). +# - For zero `epsilon`, use :class:`.Uniform`. +# - For zero `delta`, use :class:`.Laplace`. + +# Parameters +# ---------- +# epsilon : float +# Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + +# delta : float +# Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 0.5). + +# sensitivity : float +# The sensitivity of the mechanism. Must be in [0, ∞). + +# random_state : int or RandomState, optional +# Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, +# ``random_state`` has to be fixed to an integer. + +# References +# ---------- +# .. [GDGK18] Geng, Quan, Wei Ding, Ruiqi Guo, and Sanjiv Kumar. "Truncated Laplacian Mechanism for Approximate +# Differential Privacy." arXiv preprint arXiv:1810.00877v1 (2018). + +# """ +# def __init__(self, *, epsilon, delta, sensitivity, random_state=None): +# super().__init__(epsilon=epsilon, delta=delta, sensitivity=sensitivity, random_state=random_state) +# self._noise_bound = None + +# @classmethod +# def _check_epsilon_delta(cls, epsilon, delta): +# if epsilon == 0: +# raise ValueError("Epsilon must be strictly positive. For zero epsilon, use :class:`.Uniform`.") + +# if isinstance(delta, Real) and not 0 < delta < 0.5: +# raise ValueError("Delta must be strictly in the interval (0,0.5). For zero delta, use :class:`.Laplace`.") + +# return super()._check_epsilon_delta(epsilon, delta) + +# @copy_docstring(Laplace.bias) +# def bias(self, value): +# return 0.0 + +# @copy_docstring(DPMechanism.variance) +# def variance(self, value): +# raise NotImplementedError + +# @copy_docstring(Laplace.randomise) +# def randomise(self, value): +# self._check_all(value) + +# if self._scale is None or self._noise_bound is None: +# self._scale = self.sensitivity / self.epsilon +# self._noise_bound = 0 if self._scale == 0 else \ +# self._scale * np.log(1 + (np.exp(self.epsilon) - 1) / 2 / self.delta) + +# if np.isnan(value): +# return float("nan") + +# samples = 1 + +# while True: +# try: +# unif = self._rng.random(4 * samples) +# except TypeError: # rng is secrets.SystemRandom +# unif = [self._rng.random() for _ in range(4 * samples)] +# noisy = self._scale * self._laplace_sampler(*np.array(unif).reshape(4, -1)) + +# if ((noisy >= - self._noise_bound) & (noisy <= self._noise_bound)).any(): +# idx = np.argmax((noisy >= - self._noise_bound) & (noisy <= self._noise_bound)) +# return value + noisy[idx] +# samples = min(100000, samples * 2) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/snapping.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/snapping.py new file mode 100644 index 0000000..393f3e7 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/snapping.py @@ -0,0 +1,250 @@ +""" +The Snapping mechanism in differential privacy, which eliminates a weakness to floating point errors in the classic +Laplace mechanism with standard Laplace sampling. +""" +import struct + +import numpy as np +try: + from crlibm import log_rn # pylint: disable=no-name-in-module +except ModuleNotFoundError: + log_rn = np.log + +from diffprivlib.mechanisms import LaplaceTruncated + + +class Snapping(LaplaceTruncated): + r""" + The Snapping mechanism for differential privacy. + + First proposed by Ilya Mironov [Mir12]_. 
+ + It eliminates a vulnerability stemming from the representation of reals as floating-point numbers in implementations + of the classic Laplace mechanism and its variants which use the inverse CDF of the Laplace distribution to sample + it. It causes a high degree of reduction in the granularity of the output. + + For the most faithful implementation of the mechanism, the ``crlibm`` package should be installed. + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in [:math:`2 \eta`, ∞], where :math:`\eta` is the + machine epsilon of the floating point type. + + sensitivity : float + The sensitivity of the mechanism. Must be in [0, ∞). + + lower : float + The lower bound of the mechanism. + + upper : float + The upper bound of the mechanism. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + References + ---------- + .. [Mir12] Mironov, Ilya. "On significance of the least significant bits for differential privacy." Proceedings of + the 2012 ACM conference on Computer and communications security (2012). + + """ + def __init__(self, *, epsilon, sensitivity, lower, upper, random_state=None): + super().__init__(epsilon=epsilon, sensitivity=sensitivity, delta=0.0, lower=lower, upper=upper, + random_state=random_state) + self._bound = self._scale_bound() + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + epsilon, delta = super()._check_epsilon_delta(epsilon, delta) + + machine_epsilon = np.finfo(float).epsneg + if epsilon <= 2 * machine_epsilon: + raise ValueError("Epsilon must be at least as large as twice the machine epsilon for the floating point " + "type, as the effective epsilon must be non-negative") + + return epsilon, delta + + def _scale_bound(self): + """ + Scales the lower and upper bounds to be proportionate to sensitivity 1, and symmetrical about 0. + For sensitivity 0, only centres the bound, as scaling up and down is not defined. + + Returns + ------- + float + A symmetric bound around 0 scaled to sensitivity 1 + + """ + if self.sensitivity == 0: + return (self.upper - self.lower) / 2.0 + return (self.upper - self.lower) / 2.0 / self.sensitivity + + def _truncate(self, value): + if value > self._bound: + return self._bound + if value < -self._bound: + return -self._bound + + return value + + def bias(self, value): + raise NotImplementedError + + def variance(self, value): + raise NotImplementedError + + def effective_epsilon(self): + r""" + Returns the effective value used in the Snapping mechanism to give the required :math:`\epsilon`-DP, based on + the bounds and the machine epsilon. + Based on section 5.2 of [Mir12]_. 
+ + Returns + ------- + float + The effective value of :math:`\epsilon` + + """ + machine_epsilon = np.finfo(float).epsneg + return (self.epsilon - 2 * machine_epsilon) / (1 + 12 * self._bound * machine_epsilon) + + def _scale_and_offset_value(self, value): + """ + Centre value around 0 with symmetric bound and scale to sensitivity 1 + + Parameters + ---------- + value : float + value to be scaled + Returns + ------- + float + value offset to be centered on 0 and scaled to sensitivity 1 + + """ + value_scaled = value / self.sensitivity + return value_scaled - self._bound - (self.lower / self.sensitivity) + + def _reverse_scale_and_offset_value(self, value): + return (value + self._bound) * self.sensitivity + self.lower + + @staticmethod + def _get_nearest_power_of_2(x): + def float_to_bits(d): + s = struct.pack('>d', d) + return struct.unpack('>q', s)[0] + + def bits_to_float(b): + s = struct.pack('>q', b) + return struct.unpack('>d', s)[0] + + bits = float_to_bits(x) + mantissa_size = np.finfo(float).nmant + if bits % (1 << mantissa_size) == 0: + return x + return bits_to_float(((bits >> mantissa_size) + 1) << mantissa_size) + + def _round_to_nearest_power_of_2(self, value, lambda_): + """ Performs the rounding step from [Mir12]_ with ties resolved towards +∞ + + Parameters + ---------- + value : float + Value to be rounded + + Returns + ------- + float + Rounded value + + """ + if self.epsilon == float('inf'): # infinitely small rounding + return value + remainder = value % lambda_ + if remainder > lambda_ / 2: + return value - remainder + lambda_ + if remainder == lambda_ / 2: + return value + remainder + return value - remainder + + def _uniform_sampler(self): + """ + Uniformly sample the full domain of floating-point numbers between (0, 1), rather than only multiples of 2^-53. + A uniform distribution over D ∩ (0, 1) can be generated by independently sampling an exponent + from the geometric distribution with parameter .5 and a significand by drawing a uniform string from + {0, 1}^52 [Mir12]_ + + Based on code recipe in Python standard library documentation [Py21]_. + + Returns + ------- + float + A value sampled from float in (0, 1) with probability proportional to the size of the infinite-precision + real interval each float represents + + References + ---------- + .. [Py21] The Python Standard Library. "random — Generate pseudo-random numbers", 2021 + https://docs.python.org/3/library/random.html#recipes + + """ + mantissa_size = np.finfo(float).nmant + mantissa = 1 << mantissa_size | self._getrandbits(mantissa_size) + exponent = -(mantissa_size + 1) + x = 0 + while not x: + x = self._getrandbits(32) + exponent += x.bit_length() - 32 + return np.ldexp(mantissa, exponent) + + def _getrandbits(self, bits): + try: + return self._rng.getrandbits(bits) + except AttributeError: + return self._rng.randint(0, 2 ** bits) + + @staticmethod + def _laplace_sampler(unif_bit, unif): + r""" + Laplace inverse CDF random sampling implementation which uses full domain uniform sampling and exact log + implementation from crlibm (if installed), as mentioned in [Mir12]_. + Outputs a random value scaled according to privacy budget and sensitivity 1, as bounds and input are scaled to + sensitivity 1 before Laplacian noise is added. + + Returns + ------- + float + Random value from Laplace distribution scaled according to :math:`\epsilon` + + """ + laplace = (-1) ** unif_bit * log_rn(unif) + return laplace + + def randomise(self, value): + """Randomise `value` with the mechanism. 
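The rounding this method applies snaps the noisy value to the nearest multiple of lambda, the smallest power of two at least as large as the Laplace scale. The bit-level computation can be tried standalone; a sketch mirroring _get_nearest_power_of_2 above, valid for positive finite doubles:

import struct

import numpy as np

def nearest_power_of_2(x):
    bits = struct.unpack('>q', struct.pack('>d', x))[0]
    mantissa_size = np.finfo(float).nmant  # 52 for IEEE-754 doubles
    if bits % (1 << mantissa_size) == 0:
        return x  # mantissa is zero: x is already a power of two
    # Zero the mantissa and increment the exponent: the next power of two up.
    return struct.unpack('>d', struct.pack('>q', ((bits >> mantissa_size) + 1) << mantissa_size))[0]

print(nearest_power_of_2(0.7))  # 1.0
print(nearest_power_of_2(0.5))  # 0.5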
+ + Parameters + ---------- + value : float + The value to be randomised. + + Returns + ------- + float + The randomised value. + + """ + self._check_all(value) + if self.sensitivity == 0: + return self._truncate(value) + + value_scaled_offset = self._scale_and_offset_value(value) + value_clamped = self._truncate(value_scaled_offset) + + scale = 1.0 / self.effective_epsilon() # everything is already scaled to sensitivity 1 + lambda_ = self._get_nearest_power_of_2(scale) + laplace = scale * self._laplace_sampler(self._getrandbits(1), self._uniform_sampler()) + value_rounded = self._round_to_nearest_power_of_2(value_clamped + laplace, lambda_) + return self._reverse_scale_and_offset_value(self._truncate(value_rounded)) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/staircase.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/staircase.py new file mode 100644 index 0000000..be29429 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/staircase.py @@ -0,0 +1,107 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +The staircase mechanism in differential privacy. +""" +import secrets +from numbers import Real + +import numpy as np + +from diffprivlib.mechanisms.laplace import Laplace +from diffprivlib.utils import copy_docstring + + +class Staircase(Laplace): + r""" + The staircase mechanism in differential privacy. + + The staircase mechanism is an optimisation of the classical Laplace Mechanism (:class:`.Laplace`), described as a + "geometric mixture of uniform random variables". + Paper link: https://arxiv.org/pdf/1212.1186.pdf + + Parameters + ---------- + epsilon : float + Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞]. + + sensitivity : float + The sensitivity of the mechanism. Must be in [0, ∞). + + gamma : float, default: 1 / (1 + exp(epsilon/2)) + Value of the tuning parameter gamma for the mechanism. Must be in [0, 1]. + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
+ + """ + def __init__(self, *, epsilon, sensitivity, gamma=None, random_state=None): + super().__init__(epsilon=epsilon, delta=0, sensitivity=sensitivity, random_state=random_state) + self.gamma = self._check_gamma(gamma, epsilon=self.epsilon) + + if isinstance(self._rng, secrets.SystemRandom): + self._rng = np.random.default_rng() + + @classmethod + def _check_gamma(cls, gamma, epsilon=None): + if gamma is None and epsilon is not None: + gamma = 1 / (1 + np.exp(epsilon / 2)) + + if not isinstance(gamma, Real): + raise TypeError("Gamma must be numeric") + if not 0.0 <= gamma <= 1.0: + raise ValueError("Gamma must be in [0,1]") + + return float(gamma) + + @copy_docstring(Laplace._check_all) + def _check_all(self, value): + super()._check_all(value) + self._check_gamma(self.gamma) + + return True + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not delta == 0: + raise ValueError("Delta must be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @copy_docstring(Laplace.bias) + def bias(self, value): + return 0.0 + + @copy_docstring(Laplace.variance) + def variance(self, value): + raise NotImplementedError + + @copy_docstring(Laplace.randomise) + def randomise(self, value): + self._check_all(value) + + sign = -1 if self._rng.random() < 0.5 else 1 + geometric_rv = self._rng.geometric(1 - np.exp(- self.epsilon)) - 1 + unif_rv = self._rng.random() + binary_rv = 0 if self._rng.random() < self.gamma / (self.gamma + + (1 - self.gamma) * np.exp(- self.epsilon)) else 1 + + return value + sign * ((1 - binary_rv) * ((geometric_rv + self.gamma * unif_rv) * self.sensitivity) + + binary_rv * ((geometric_rv + self.gamma + (1 - self.gamma) * unif_rv) * + self.sensitivity)) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__init__.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__init__.py new file mode 100644 index 0000000..5831369 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__init__.py @@ -0,0 +1,32 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Transform wrappers for differential privacy mechanisms to extend their use to alternative data types. + +Notes +----- +The naming convention for new transforms is to describe the `pre-transform` action, i.e. the action performed on the +data to be ingested by the mechanism. For transforms without a `pre-transform`, the `post-transform` action should be +described. 
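Following this convention, a typical chain puts the pre-transform closest to the data. A hedged usage sketch, assuming the keyword API of the Geometric mechanism exported elsewhere in the library:

from diffprivlib.mechanisms import Geometric
from diffprivlib.mechanisms.transforms import StringToInt

# StringToInt pre-transforms "7" -> 7 for the integer-valued Geometric
# mechanism, then post-transforms the noisy integer back to a string.
mech = StringToInt(Geometric(epsilon=1.0, sensitivity=1))
noisy = mech.randomise("7")  # e.g. "6"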
+ +""" +from diffprivlib.mechanisms.transforms.base import DPTransformer + +from diffprivlib.mechanisms.transforms.roundedinteger import RoundedInteger +from diffprivlib.mechanisms.transforms.stringtoint import StringToInt +from diffprivlib.mechanisms.transforms.inttostring import IntToString diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/__init__.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..1bfd310 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/__init__.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/base.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000..aa2abcb Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/base.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/inttostring.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/inttostring.cpython-311.pyc new file mode 100644 index 0000000..eee503b Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/inttostring.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/roundedinteger.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/roundedinteger.cpython-311.pyc new file mode 100644 index 0000000..ce2d2d9 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/roundedinteger.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/stringtoint.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/stringtoint.cpython-311.pyc new file mode 100644 index 0000000..2d0284f Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/__pycache__/stringtoint.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/base.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/base.py new file mode 100644 index 0000000..34b1a71 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/base.py @@ -0,0 +1,89 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Core utilities for DP transformers. +""" +from diffprivlib.mechanisms.base import DPMachine + + +class DPTransformer(DPMachine): + """ + Base class for DP transformers. DP Transformers are simple wrappers for DP Mechanisms to allow mechanisms to be + used with data types and structures outside their scope. + + A :class:`.DPTransformer` must be initiated with a :class:`.DPMachine` (either another :class:`.DPTransformer`, or a + :class:`.DPMechanism`). This allows many instances of :class:`.DPTransformer` to be chained together, but the chain + must terminate with a :class:`.DPMechanism`. + + """ + def __init__(self, parent): + if not isinstance(parent, DPMachine): + raise TypeError("Data transformer must take a DPMachine as input") + + self.parent = parent + + def pre_transform(self, value): + """Performs no transformation on the input data, and is ingested by the mechanism as-is. + + Parameters + ---------- + value : float or string + Input value to be transformed. + + Returns + ------- + float or string + Transformed input value + """ + return value + + def post_transform(self, value): + """Performs no transformation on the output of the mechanism, and is returned as-is. + + Parameters + ---------- + value : float or string + Mechanism output to be transformed. + + Returns + ------- + float or string + Transformed output value. + + """ + return value + + def randomise(self, value): + """ + Randomise the given value using the :class:`.DPMachine`. + + Parameters + ---------- + value : float or string + Value to be randomised. + + Returns + ------- + float or string + Randomised value, same type as `value`. + + """ + transformed_value = self.pre_transform(value) + noisy_value = self.parent.randomise(transformed_value) + output_value = self.post_transform(noisy_value) + return output_value diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/inttostring.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/inttostring.py new file mode 100644 index 0000000..00350fc --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/inttostring.py @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
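Returning to the DPTransformer base class above: chains of transformers are legal so long as they terminate in a mechanism, and the base class itself is a pass-through. A sketch, assuming the upstream scalar Laplace.randomise rather than this fork's list-based variant:

from diffprivlib.mechanisms import Laplace
from diffprivlib.mechanisms.transforms import DPTransformer

# The base transformer changes nothing, so this chain behaves like the bare mechanism.
mech = DPTransformer(DPTransformer(Laplace(epsilon=1.0, sensitivity=1.0)))
noisy = mech.randomise(5.0)

# A non-DPMachine parent is rejected at construction:
# DPTransformer(5)  # raises TypeError("Data transformer must take a DPMachine as input")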
+""" +IntToString DP transformer, for using integer-valued data with string-valued mechanisms. +""" +from diffprivlib.mechanisms.transforms.base import DPTransformer + + +class IntToString(DPTransformer): + """ + IntToString DP transformer, for using integer-valued data with string-valued mechanisms. + + Useful when using integer-valued data with :class:`.Binary` or :class:`.Exponential`. + """ + def pre_transform(self, value): + """Transforms the input to be string-valued for ingestion by the mechanism. + + Parameters + ---------- + value : float or string + Input value to be transformed. + + Returns + ------- + string + Transformed input value + + """ + return str(value) + + def post_transform(self, value): + """Transforms the output of the mechanism to be integer-valued. + + Parameters + ---------- + value : float or string + Mechanism output to be transformed. + + Returns + ------- + int + Transformed output value. + + """ + return int(value) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/roundedinteger.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/roundedinteger.py new file mode 100644 index 0000000..7717d01 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/roundedinteger.py @@ -0,0 +1,42 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Rounded integer transformer. Rounds the output of the given mechanism to the nearest integer. +""" +from diffprivlib.mechanisms.transforms.base import DPTransformer + + +class RoundedInteger(DPTransformer): + """ + Rounded integer transform. Rounds the (float) output of the given mechanism to the nearest integer. + """ + def post_transform(self, value): + """Transforms the (float) output of the mechanism to be a rounded integer. + + Parameters + ---------- + value : float + Mechanism output to be transformed. + + Returns + ------- + int + Transformed output value. 
+ + """ + return int(round(value)) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/stringtoint.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/stringtoint.py new file mode 100644 index 0000000..e3d4224 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/transforms/stringtoint.py @@ -0,0 +1,61 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +StringToInt DP transformer, for using string-valued data with integer-valued mechanisms. +""" +from diffprivlib.mechanisms.transforms.base import DPTransformer + + +class StringToInt(DPTransformer): + """ + StringToInt DP transformer, for using string-valued data with integer-valued mechanisms. + + Useful when using ordered, string-valued data with :class:`.Geometric`. + """ + + def pre_transform(self, value): + """Transforms the input to be integer-valued for ingestion by the mechanism. + + Parameters + ---------- + value : float or string + Input value to be transformed. + + Returns + ------- + int + Transformed input value + + """ + return int(value) + + def post_transform(self, value): + """Transforms the output of the mechanism to be string-valued. + + Parameters + ---------- + value : float or string + Mechanism output to be transformed. + + Returns + ------- + string + Transformed output value. + + """ + return str(value) diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/uniform.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/uniform.py new file mode 100644 index 0000000..f68cd1d --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/uniform.py @@ -0,0 +1,98 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +The uniform mechanism in differential privacy. +""" +from numbers import Real + +from diffprivlib.mechanisms.base import DPMechanism +from diffprivlib.mechanisms.laplace import Laplace +from diffprivlib.utils import copy_docstring + + +class Uniform(DPMechanism): + r""" + The Uniform mechanism in differential privacy. + + This emerges as a special case of the :class:`.LaplaceBoundedNoise` mechanism when epsilon = 0. + Paper link: https://arxiv.org/pdf/1810.00877.pdf + + Parameters + ---------- + delta : float + Privacy parameter :math:`\delta` for the mechanism. Must be in (0, 0.5]. + + sensitivity : float + The sensitivity of the mechanism. Must be in [0, ∞). + + random_state : int or RandomState, optional + Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + """ + def __init__(self, *, delta, sensitivity, random_state=None): + super().__init__(epsilon=0.0, delta=delta, random_state=random_state) + self.sensitivity = self._check_sensitivity(sensitivity) + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not epsilon == 0: + raise ValueError("Epsilon must be strictly zero.") + + if not 0 < delta <= 0.5: + raise ValueError("Delta must be in the half-open interval (0, 0.5]") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_sensitivity(cls, sensitivity): + if not isinstance(sensitivity, Real): + raise TypeError("Sensitivity must be numeric") + + if sensitivity < 0: + raise ValueError("Sensitivity must be non-negative") + + return float(sensitivity) + + @copy_docstring(Laplace.bias) + def bias(self, value): + return 0.0 + + @copy_docstring(Laplace.variance) + def variance(self, value): + self._check_all(value) + + return (self.sensitivity / self.delta) ** 2 / 12 + + def _check_all(self, value): + super()._check_all(value) + self._check_sensitivity(self.sensitivity) + + if not isinstance(value, Real): + raise TypeError("Value to be randomised must be a number") + + return True + + @copy_docstring(Laplace.randomise) + def randomise(self, value): + self._check_all(value) + + unif_rv = 2 * self._rng.random() - 1 + unif_rv *= self.sensitivity / self.delta / 2 + + return value + unif_rv diff --git a/privbayes-synthesizer/code/diffprivlib/mechanisms/vector.py b/privbayes-synthesizer/code/diffprivlib/mechanisms/vector.py new file mode 100644 index 0000000..d59847a --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/mechanisms/vector.py @@ -0,0 +1,184 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above 
copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+The vector mechanism in differential privacy, for producing perturbed objectives
+"""
+from numbers import Real
+
+import numpy as np
+
+from diffprivlib.mechanisms.base import DPMechanism
+from diffprivlib.utils import copy_docstring
+
+
+class Vector(DPMechanism):
+    r"""
+    The vector mechanism in differential privacy.
+
+    The vector mechanism is used when perturbing convex objective functions.
+    Full paper: http://www.jmlr.org/papers/volume12/chaudhuri11a/chaudhuri11a.pdf
+
+    Parameters
+    ----------
+    epsilon : float
+        Privacy parameter :math:`\epsilon` for the mechanism. Must be in (0, ∞].
+
+    function_sensitivity : float
+        The function sensitivity of the mechanism. Must be in [0, ∞).
+
+    data_sensitivity : float, default: 1.0
+        The data sensitivity of the mechanism. Must be in [0, ∞).
+
+    dimension : int
+        Function input dimension. This dimension relates to the size of the input vector of the function being
+        considered by the mechanism. This corresponds to the size of the random vector produced by the mechanism.
+        Must be in [1, ∞).
+
+    alpha : float, default: 0.01
+        Regularisation parameter. Must be in (0, ∞).
+
+    random_state : int or RandomState, optional
+        Controls the randomness of the mechanism. To obtain a deterministic behaviour during randomisation,
+        ``random_state`` has to be fixed to an integer.
+ + """ + def __init__(self, *, epsilon, function_sensitivity, data_sensitivity=1.0, dimension, alpha=0.01, + random_state=None): + super().__init__(epsilon=epsilon, delta=0.0, random_state=random_state) + self.function_sensitivity, self.data_sensitivity = self._check_sensitivity(function_sensitivity, + data_sensitivity) + self.dimension = self._check_dimension(dimension) + self.alpha = self._check_alpha(alpha) + + @classmethod + def _check_epsilon_delta(cls, epsilon, delta): + if not delta == 0: + raise ValueError("Delta must be zero") + + return super()._check_epsilon_delta(epsilon, delta) + + @classmethod + def _check_alpha(cls, alpha): + if not isinstance(alpha, Real): + raise TypeError("Alpha must be numeric") + + if alpha <= 0: + raise ValueError("Alpha must be strictly positive") + + return alpha + + @classmethod + def _check_dimension(cls, vector_dim): + if not isinstance(vector_dim, Real) or not np.isclose(vector_dim, int(vector_dim)): + raise TypeError("d must be integer-valued") + if int(vector_dim) < 1: + raise ValueError("d must be strictly positive") + + return int(vector_dim) + + @classmethod + def _check_sensitivity(cls, function_sensitivity, data_sensitivity): + if not isinstance(function_sensitivity, Real) or not isinstance(data_sensitivity, Real): + raise TypeError("Sensitivities must be numeric") + + if function_sensitivity < 0 or data_sensitivity < 0: + raise ValueError("Sensitivities must be non-negative") + + return function_sensitivity, data_sensitivity + + def _check_all(self, value): + super()._check_all(value) + self._check_alpha(self.alpha) + self._check_sensitivity(self.function_sensitivity, self.data_sensitivity) + self._check_dimension(self.dimension) + + if not callable(value): + raise TypeError("Value to be randomised must be a function") + + return True + + @copy_docstring(DPMechanism.bias) + def bias(self, value): + raise NotImplementedError + + @copy_docstring(DPMechanism.variance) + def variance(self, value): + raise NotImplementedError + + def randomise(self, value): + """Randomise `value` with the mechanism. + + If `value` is a method of two outputs, they are taken as `f` and `fprime` (i.e., its gradient), and both are + perturbed accordingly. + + Parameters + ---------- + value : method + The function to be randomised. + + Returns + ------- + method + The randomised method. 
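A hedged usage sketch for the method described above; the toy objective and parameter values are illustrative only:

import numpy as np
from diffprivlib.mechanisms import Vector

def loss(w):
    return 0.5 * np.dot(w, w)  # scalar objective, so the perturbed function returns a scalar

mech = Vector(epsilon=1.0, function_sensitivity=0.25, data_sensitivity=1.0,
              dimension=3, alpha=0.01, random_state=42)
noisy_loss = mech.randomise(loss)
print(noisy_loss(np.ones(3)))  # objective value plus the random linear (and quadratic) perturbation

If `loss` instead returned a (value, gradient) tuple, the perturbed function would return the noisy pair, with the gradient shifted by the same random vector.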
+ + """ + self._check_all(value) + + epsilon_p = self.epsilon - 2 * np.log(1 + self.function_sensitivity * self.data_sensitivity / + (0.5 * self.alpha)) + delta = 0 + + if epsilon_p <= 0: + delta = (self.function_sensitivity * self.data_sensitivity / (np.exp(self.epsilon / 4) - 1) + - 0.5 * self.alpha) + epsilon_p = self.epsilon / 2 + + scale = self.data_sensitivity * 2 / epsilon_p + + try: + normed_noisy_vector = self._rng.standard_normal((self.dimension, 4)).sum(axis=1) / 2 + noisy_norm = self._rng.gamma(self.dimension / 4, scale, 4).sum() + except AttributeError: # rng is secrets.SystemRandom + normed_noisy_vector = np.reshape([self._rng.normalvariate(0, 1) for _ in range(self.dimension * 4)], + (-1, 4)).sum(axis=1) / 2 + noisy_norm = sum(self._rng.gammavariate(self.dimension / 4, scale) for _ in range(4)) if scale > 0 else 0.0 + + norm = np.linalg.norm(normed_noisy_vector, 2) + normed_noisy_vector = normed_noisy_vector / norm * noisy_norm + + def output_func(*args): + input_vec = args[0] + + func = value(*args) + + if isinstance(func, tuple): + func, grad = func + else: + grad = None + + func += np.dot(normed_noisy_vector, input_vec) + func += 0.5 * delta * np.dot(input_vec, input_vec) + + if grad is not None: + grad += normed_noisy_vector + delta * input_vec + + return func, grad + + return func + + return output_func diff --git a/privbayes-synthesizer/code/diffprivlib/models/__init__.py b/privbayes-synthesizer/code/diffprivlib/models/__init__.py new file mode 100644 index 0000000..6feb569 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/__init__.py @@ -0,0 +1,27 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+""" +Machine learning models with differential privacy +""" +from diffprivlib.models.naive_bayes import GaussianNB +from diffprivlib.models.k_means import KMeans +from diffprivlib.models.linear_regression import LinearRegression +from diffprivlib.models.logistic_regression import LogisticRegression +from diffprivlib.models.pca import PCA +from diffprivlib.models.standard_scaler import StandardScaler +from diffprivlib.models.forest import RandomForestClassifier, DecisionTreeClassifier diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/__init__.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..34e1c93 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/forest.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/forest.cpython-311.pyc new file mode 100644 index 0000000..8409c16 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/forest.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/k_means.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/k_means.cpython-311.pyc new file mode 100644 index 0000000..fa6dc89 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/k_means.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/linear_regression.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/linear_regression.cpython-311.pyc new file mode 100644 index 0000000..6cda42b Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/linear_regression.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/logistic_regression.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/logistic_regression.cpython-311.pyc new file mode 100644 index 0000000..1ced9f6 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/logistic_regression.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/naive_bayes.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/naive_bayes.cpython-311.pyc new file mode 100644 index 0000000..747074e Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/naive_bayes.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/pca.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/pca.cpython-311.pyc new file mode 100644 index 0000000..106248c Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/pca.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/standard_scaler.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/standard_scaler.cpython-311.pyc new file mode 100644 index 0000000..fbb6c76 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/standard_scaler.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/__pycache__/utils.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000..0f26ee5 Binary files /dev/null and 
b/privbayes-synthesizer/code/diffprivlib/models/__pycache__/utils.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/models/forest.py b/privbayes-synthesizer/code/diffprivlib/models/forest.py new file mode 100644 index 0000000..b99e670 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/forest.py @@ -0,0 +1,637 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2021 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Random Forest Classifier with Differential Privacy +""" +from collections import namedtuple +import warnings + +from joblib import Parallel, delayed +import numpy as np +from sklearn.exceptions import DataConversionWarning +from sklearn.tree._tree import Tree, DOUBLE, DTYPE, NODE_DTYPE  # pylint: disable=no-name-in-module +from sklearn.ensemble._forest import RandomForestClassifier as skRandomForestClassifier, _parallel_build_trees +from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.utils import PrivacyLeakWarning, check_random_state +from diffprivlib.mechanisms import PermuteAndFlip +from diffprivlib.validation import DiffprivlibMixin + +MAX_INT = np.iinfo(np.int32).max + + +class RandomForestClassifier(skRandomForestClassifier, DiffprivlibMixin):  # pylint: disable=too-many-ancestors + r"""Random Forest Classifier with differential privacy. + + This class implements Differentially Private Random Decision Forests using [1]. + :math:`\epsilon`-Differential privacy is achieved by constructing decision trees via a random splitting criterion and + applying the :class:`.PermuteAndFlip` mechanism to determine a noisy label. + + Parameters + ---------- + n_estimators : int, default: 10 + The number of trees in the forest. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max).  `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature.  If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + classes : array-like of shape (n_classes,) + Array of classes to be trained on.  If not provided, the classes will be read from the data when ``.fit()`` is + first called, resulting in a :class:`.PrivacyLeakWarning`. + + n_jobs : int, default: 1 + Number of CPU cores used when parallelising over trees.
``-1`` means using all processors. + + verbose : int, default: 0 + Set to any positive number for verbosity. + + random_state : int or RandomState, optional + Controls both the randomness of the shuffling of the samples used when building trees (if ``shuffle=True``) and + the training of the differentially-private :class:`.DecisionTreeClassifier` instances used to construct the forest. + To obtain a deterministic behaviour during randomisation, ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + max_depth : int, default: 5 + The maximum depth of the tree.  A larger depth translates to an exponential increase in memory usage. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, + otherwise, just fit a whole new forest. + + shuffle : bool, default=False + When set to ``True``, shuffles the datapoints to be trained on trees at random.  In diffprivlib, each datapoint + is used to train exactly one tree.  When set to ``False``, datapoints are assigned to trees in order. + + Attributes + ---------- + base_estimator_ : DecisionTreeClassifier + The child estimator template used to create the collection of fitted sub-estimators. + + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The class labels. + + n_classes_ : int or list + The number of classes. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`.  Defined only when `X` has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from diffprivlib.models import RandomForestClassifier + >>> X, y = make_classification(n_samples=1000, n_features=4, + ...                            n_informative=2, n_redundant=0, + ...                            random_state=0, shuffle=False) + >>> clf = RandomForestClassifier(n_estimators=100, random_state=0) + >>> clf.fit(X, y) + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + + References + ---------- + [1] Sam Fletcher, Md Zahidul Islam.
"Differentially Private Random Decision Forests using Smooth Sensitivity" + https://arxiv.org/abs/1606.03572 + + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + skRandomForestClassifier, "n_estimators", "n_jobs", "verbose", "random_state", "warm_start") + + def __init__(self, n_estimators=10, *, epsilon=1.0, bounds=None, classes=None, n_jobs=1, verbose=0, accountant=None, + random_state=None, max_depth=5, warm_start=False, shuffle=False, **unused_args): + super().__init__( + n_estimators=n_estimators, + criterion=None, + bootstrap=False, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start) + self.epsilon = epsilon + self.bounds = bounds + self.classes = classes + self.max_depth = max_depth + self.shuffle = shuffle + self.accountant = BudgetAccountant.load_default(accountant) + + # Todo: Remove when scikit-learn v1.2 is a min requirement + if hasattr(self, "estimator"): + self.estimator = DecisionTreeClassifier() + else: + self.base_estimator = DecisionTreeClassifier() + self.estimator_params = ("max_depth", "epsilon", "bounds", "classes") + + self._warn_unused_args(unused_args) + + def fit(self, X, y, sample_weight=None): + """ + Build a forest of trees from the training set (X, y). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in regression). + + sample_weight : ignored + Ignored by diffprivlib. Present for consistency with sklearn API. + + Returns + ------- + self : object + Fitted estimator. + """ + self._validate_params() + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + # Validate or convert input data + X, y = self._validate_data(X, y, multi_output=False, dtype=DTYPE) + + if self.bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + self.bounds = self._check_bounds(self.bounds, shape=X.shape[1]) + X = self._clip_to_bounds(X, self.bounds) + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warnings.warn("A column-vector y was passed when a 1d array was expected. Please change the shape of y to " + "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + self.n_outputs_ = y.shape[1] + + if self.classes is None: + warnings.warn("Classes have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. 
To ensure differential privacy and no additional " + "privacy leakage, specify the prediction classes of the model.", PrivacyLeakWarning) + self.classes = np.unique(y) + self.classes_ = np.ravel(self.classes) + self.n_classes_ = len(self.classes_) + + # y, expanded_class_weight = self._validate_y_class_weight(y) + y = np.searchsorted(self.classes_, y) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + # Check parameters + self._validate_estimator() + + random_state = check_random_state(self.random_state) + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free allocated memory, if any + self.estimators_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError(f"n_estimators={self.n_estimators} must be larger than or equal to len(estimators_)=" + f"{len(self.estimators_)} when warm_start==True") + if n_more_estimators == 0: + warnings.warn("Warm-start fitting without increasing n_estimators does not fit new trees.") + return self + + if self.warm_start and len(self.estimators_) > 0: + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + trees = [ + self._make_estimator(append=False, random_state=random_state) + for _ in range(n_more_estimators) + ] + + # Split samples between trees as evenly as possible (randomly if shuffle==True) + n_samples = X.shape[0] + tree_idxs = random_state.permutation(n_samples) if self.shuffle else np.arange(n_samples) + tree_idxs = (tree_idxs // (n_samples / n_more_estimators)).astype(int) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + # Todo: Remove when scikit-learn v1.1 is a min requirement + try: + trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")( + delayed(_parallel_build_trees)( + tree=t, + bootstrap=False, + X=X[tree_idxs == i], + y=y[tree_idxs == i], + sample_weight=None, + tree_idx=i, + n_trees=len(trees), + verbose=self.verbose, + ) + for i, t in enumerate(trees) + ) + except TypeError: + trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads")( + delayed(_parallel_build_trees)( + tree=t, + forest=self, + X=X[tree_idxs == i], + y=y[tree_idxs == i], + sample_weight=None, + tree_idx=i, + n_trees=len(trees), + verbose=self.verbose, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + + self.accountant.spend(self.epsilon, 0) + + return self + + +class DecisionTreeClassifier(skDecisionTreeClassifier, DiffprivlibMixin): + r"""Decision Tree Classifier with differential privacy. + + This class implements the base differentially private decision tree classifier + for the Random Forest classifier algorithm.  Not meant to be used separately. + + Parameters + ---------- + max_depth : int, default: 5 + The maximum depth of the tree. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max). 
`min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature.  If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + classes : array-like of shape (n_classes,), optional + Array of class labels.  If not provided, the classes will be read from the data when ``.fit()`` is first called, + resulting in a :class:`.PrivacyLeakWarning`. + + random_state : int or RandomState, optional + Controls the randomness of the estimator.  At each split, the feature to split on is chosen randomly, as is the + threshold at which to split.  The classification label at each leaf is then randomised, subject to differential + privacy constraints.  To obtain a deterministic behaviour during randomisation, ``random_state`` has to be fixed + to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + n_features_in_ : int + The number of features when fit is performed. + + n_classes_ : int + The number of classes. + + classes_ : array of shape (n_classes,) + The class labels. + + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + skDecisionTreeClassifier, "max_depth", "random_state") + + def __init__(self, max_depth=5, *, epsilon=1, bounds=None, classes=None, random_state=None, accountant=None, + **unused_args): + # Todo: Remove when scikit-learn v1.0 is a min requirement + try: + super().__init__(  # pylint: disable=unexpected-keyword-arg + criterion=None, + splitter=None, + max_depth=max_depth, + min_samples_split=None, + min_samples_leaf=None, + min_weight_fraction_leaf=None, + max_features=None, + random_state=random_state, + max_leaf_nodes=None, + min_impurity_decrease=None, + min_impurity_split=None + ) + except TypeError: + super().__init__( + criterion=None, + splitter=None, + max_depth=max_depth, + min_samples_split=None, + min_samples_leaf=None, + min_weight_fraction_leaf=None, + max_features=None, + random_state=random_state, + max_leaf_nodes=None, + min_impurity_decrease=None + ) + self.epsilon = epsilon + self.bounds = bounds + self.classes = classes + self.accountant = BudgetAccountant.load_default(accountant) + + self._warn_unused_args(unused_args) + + def fit(self, X, y, sample_weight=None, check_input=True): + """Build a differentially-private decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples.  Internally, it will be converted to ``dtype=np.float32``. + + y : array-like of shape (n_samples,) + The target values (class labels) as integers or strings. + + sample_weight : ignored + Ignored by diffprivlib.  Present for consistency with sklearn API. + + check_input : bool, default=True + Allows bypassing several input checks.  Don't use this parameter unless you know what you are doing. + + Returns + ------- + self : DecisionTreeClassifier + Fitted estimator. + """ + self._validate_params() + random_state = check_random_state(self.random_state) + + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + if check_input: + X, y = self._validate_data(X, y, multi_output=False) + self.n_outputs_ = 1 + + if self.bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. 
To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + self.bounds = self._check_bounds(self.bounds, shape=X.shape[1]) + X = self._clip_to_bounds(X, self.bounds) + + if self.classes is None: + warnings.warn("Classes have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify the prediction classes of the model.", PrivacyLeakWarning) + self.classes = np.unique(y) + self.classes_ = np.ravel(self.classes) + self.n_classes_ = len(self.classes_) + self.n_features_in_ = X.shape[1] + + # Build and fit the _FittingTree + fitting_tree = _FittingTree(self.max_depth, self.n_features_in_, self.classes_, self.epsilon, self.bounds, + random_state) + fitting_tree.build() + fitting_tree.fit(X, y) + + # Load params from _FittingTree into sklearn.Tree + d = fitting_tree.__getstate__() + tree = Tree(self.n_features_in_, np.array([self.n_classes_]), self.n_outputs_) + tree.__setstate__(d) + self.tree_ = tree + + self.accountant.spend(self.epsilon, 0) + + return self + + @property + def n_features_(self): + return self.n_features_in_ + + def _more_tags(self): + return {} + + +class _FittingTree(DiffprivlibMixin): + r"""Array-based representation of a binary decision tree, trained with differential privacy. + + This tree mimics the architecture of the corresponding Tree from ``sklearn.tree._tree``, but without many methods given + in Tree.  The purpose of _FittingTree is to fit the parameters of the model, and have those parameters passed to + Tree (using _FittingTree.__getstate__() and Tree.__setstate__()), to be used for prediction. + + Parameters + ---------- + max_depth : int + The maximum depth of the tree. + + n_features : int + The number of features of the training dataset. + + classes : array-like of shape (n_classes,) + The classes of the training dataset. + + epsilon : float + Privacy parameter :math:`\epsilon`. + + bounds : tuple + Bounds of the data, provided as a tuple of the form (min, max).  `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. + + random_state : RandomState + Controls the randomness of the building and training process: the feature to split at each node, the threshold + to split at and the randomisation of the label at each leaf.
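For intuition, the two randomised ingredients described above can be exercised directly with diffprivlib's public mechanisms. The following is a minimal editor's sketch, not part of the diff: the feature count, bounds and leaf counts are hypothetical, and the :class:`.PermuteAndFlip` call mirrors its use in ``_FittingTree.fit`` below.

import numpy as np
from diffprivlib.mechanisms import PermuteAndFlip

rng = np.random.RandomState(0)
lower, upper = np.zeros(4), np.ones(4)      # hypothetical per-feature bounds

# Decision node: a uniformly random feature, split at a uniformly random threshold
feature = rng.randint(4)
threshold = rng.uniform(lower[feature], upper[feature])

# Leaf node: noisy argmax over (hypothetical) per-class counts via PermuteAndFlip
counts = [12, 3, 0]
mech = PermuteAndFlip(epsilon=1.0, sensitivity=1, monotonic=True, utility=counts,
                      random_state=rng)
noisy_label = mech.randomise()              # index of the privately selected class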
+ + """ + _TREE_LEAF = -1 + _TREE_UNDEFINED = -2 + StackNode = namedtuple("StackNode", ["parent", "is_left", "depth", "bounds"]) + + def __init__(self, max_depth, n_features, classes, epsilon, bounds, random_state): + self.node_count = 0 + self.nodes = [] + self.max_depth = max_depth + self.n_features = n_features + self.classes = classes + self.epsilon = epsilon + self.bounds = bounds + self.random_state = random_state + + def __getstate__(self): + """Get state of _FittingTree to feed into __setstate__ of sklearn.Tree""" + d = {"max_depth": self.max_depth, + "node_count": self.node_count, + "nodes": np.array([tuple(node) for node in self.nodes], dtype=NODE_DTYPE), + "values": self.values_} + return d + + def build(self): + """Build the decision tree using random feature selection and random thresholding.""" + stack = [self.StackNode(parent=self._TREE_UNDEFINED, is_left=False, depth=0, bounds=self.bounds)] + + while stack: + parent, is_left, depth, bounds = stack.pop() + node_id = self.node_count + bounds_lower, bounds_upper = self._check_bounds(bounds, shape=self.n_features) + + # Update parent node with its child + if parent != self._TREE_UNDEFINED: + if is_left: + self.nodes[parent].left_child = node_id + else: + self.nodes[parent].right_child = node_id + + # Check if we have a leaf node, then add it + if depth >= self.max_depth: + node = _Node(node_id, self._TREE_UNDEFINED, self._TREE_UNDEFINED) + node.left_child = self._TREE_LEAF + node.right_child = self._TREE_LEAF + + self.nodes.append(node) + self.node_count += 1 + continue + + # We have a decision node, so pick feature and threshold + feature = self.random_state.randint(self.n_features) + threshold = self.random_state.uniform(bounds_lower[feature], bounds_upper[feature]) + + left_bounds_upper = bounds_upper.copy() + left_bounds_upper[feature] = threshold + right_bounds_lower = bounds_lower.copy() + right_bounds_lower[feature] = threshold + + self.nodes.append(_Node(node_id, feature, threshold)) + self.node_count += 1 + + stack.append(self.StackNode(parent=node_id, is_left=True, depth=depth+1, + bounds=(bounds_lower, left_bounds_upper))) + stack.append(self.StackNode(parent=node_id, is_left=False, depth=depth+1, + bounds=(right_bounds_lower, bounds_upper))) + + return self + + def fit(self, X, y): + """Fit the tree to the given training data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and n_features is the number of features. + + y : array-like, shape (n_samples,) + Target vector relative to X. 
+ + """ + if not self.nodes: + raise ValueError("Fitting Tree must be built before calling fit().") + + leaves = self.apply(X) + unique_leaves = np.unique(leaves) + values = np.zeros(shape=(self.node_count, 1, len(self.classes))) + + # Populate value of real leaves + for leaf in unique_leaves: + idxs = (leaves == leaf) + leaf_y = y[idxs] + + counts = [np.sum(leaf_y == cls) for cls in self.classes] + mech = PermuteAndFlip(epsilon=self.epsilon, sensitivity=1, monotonic=True, utility=counts, + random_state=self.random_state) + values[leaf, 0, mech.randomise()] = 1 + + # Populate value of empty leaves + for node in self.nodes: + if values[node.node_id].sum() or node.left_child != self._TREE_LEAF: + continue + + values[node.node_id, 0, self.random_state.randint(len(self.classes))] = 1 + + self.values_ = values + + return self + + def apply(self, X): + """Finds the terminal region (=leaf node) for each sample in X.""" + n_samples = X.shape[0] + out = np.zeros((n_samples,), dtype=int) + out_ptr = out.data + + for i in range(n_samples): + node = self.nodes[0] + + while node.left_child != self._TREE_LEAF: + if X[i, node.feature] <= node.threshold: + node = self.nodes[node.left_child] + else: + node = self.nodes[node.right_child] + + out_ptr[i] = node.node_id + + return out + + +class _Node: + """Base storage structure for the nodes in a _FittingTree object.""" + def __init__(self, node_id, feature, threshold): + self.feature = feature + self.threshold = threshold + self.left_child = -1 + self.right_child = -1 + self.node_id = node_id + + def __iter__(self): + """Defines parameters needed to populate NODE_DTYPE for Tree.__setstate__ using tuple(_Node).""" + yield self.left_child + yield self.right_child + yield self.feature + yield self.threshold + yield 0.0 # Impurity + yield 0 # n_node_samples + yield 0.0 # weighted_n_node_samples + + # remove branch when scikit-learn v1.3 is min requirement + if len(NODE_DTYPE) > 7: + yield False diff --git a/privbayes-synthesizer/code/diffprivlib/models/k_means.py b/privbayes-synthesizer/code/diffprivlib/models/k_means.py new file mode 100644 index 0000000..9e075bc --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/k_means.py @@ -0,0 +1,294 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +K-means clustering algorithm satisfying differential privacy. 
+""" +import warnings + +import numpy as np +import sklearn.cluster as sk_cluster + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricFolded +from diffprivlib.utils import PrivacyLeakWarning, check_random_state +from diffprivlib.validation import DiffprivlibMixin + + +class KMeans(sk_cluster.KMeans, DiffprivlibMixin): + r"""K-Means clustering with differential privacy. + + Implements the DPLloyd approach presented in [SCL16]_, leveraging the :class:`sklearn.cluster.KMeans` class for full + integration with Scikit Learn. + + Parameters + ---------- + n_clusters : int, default: 8 + The number of clusters to form as well as the number of centroids to generate. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers. If the algorithm stops before fully converging, these will not be consistent + with ``labels_``. + + labels_ : + Labels of each point + + inertia_ : float + Sum of squared distances of samples to their closest cluster center. + + n_iter_ : int + Number of iterations run. + + References + ---------- + .. [SCL16] Su, Dong, Jianneng Cao, Ninghui Li, Elisa Bertino, and Hongxia Jin. "Differentially private k-means + clustering." In Proceedings of the sixth ACM conference on data and application security and privacy, pp. 26-37. + ACM, 2016. + + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + sk_cluster.KMeans, "n_clusters", "random_state") + + def __init__(self, n_clusters=8, *, epsilon=1.0, bounds=None, random_state=None, accountant=None, **unused_args): + super().__init__(n_clusters=n_clusters, random_state=random_state) + + self.epsilon = epsilon + self.bounds = bounds + self.accountant = BudgetAccountant.load_default(accountant) + + self._warn_unused_args(unused_args) + + self.cluster_centers_ = None + self.bounds_processed = None + self.labels_ = None + self.inertia_ = None + self.n_iter_ = None + self._n_threads = 1 + + def fit(self, X, y=None, sample_weight=None): + """Computes k-means clustering with differential privacy. + + Parameters + ---------- + X : array-like, shape=(n_samples, n_features) + Training instances to cluster. + + y : Ignored + not used, present here for API consistency by convention. + + sample_weight : ignored + Ignored by diffprivlib. Present for consistency with sklearn API. 
+ + Returns + ------- + self : class + + """ + self._validate_params() + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + del y + + random_state = check_random_state(self.random_state) + + X = self._validate_data(X, accept_sparse=False, dtype=[np.float64, np.float32]) + n_samples, n_dims = X.shape + + if n_samples < self.n_clusters: + raise ValueError(f"n_samples={n_samples} should be >= n_clusters={self.n_clusters}") + + iters = self._calc_iters(n_dims, n_samples) + + if self.bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify `bounds` for each dimension.", PrivacyLeakWarning) + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + + self.bounds = self._check_bounds(self.bounds, n_dims, min_separation=1e-5) + X = self._clip_to_bounds(X, self.bounds) + + centers = self._init_centers(n_dims, random_state=random_state) + labels = None + distances = None + + # Run _update_centers first to ensure consistency of `labels` and `centers`, since convergence unlikely + for _ in range(-1, iters): + if labels is not None: + centers = self._update_centers(X, centers=centers, labels=labels, dims=n_dims, total_iters=iters, + random_state=random_state) + + distances, labels = self._distances_labels(X, centers) + + self.cluster_centers_ = centers + self.labels_ = labels + self.inertia_ = distances[np.arange(len(labels)), labels].sum() + self.n_iter_ = iters + + self.accountant.spend(self.epsilon, 0) + + return self + + def _init_centers(self, dims, random_state): + if self.bounds_processed is None: + bounds_processed = np.zeros(shape=(dims, 2)) + + for dim in range(dims): + lower = self.bounds[0][dim] + upper = self.bounds[1][dim] + + bounds_processed[dim, :] = [upper - lower, lower] + + self.bounds_processed = bounds_processed + + cluster_proximity = np.min(self.bounds_processed[:, 0]) / 2.0 + + while cluster_proximity > 0: + centers = np.zeros(shape=(self.n_clusters, dims)) + cluster, retry = 0, 0 + + while retry < 100: + if cluster >= self.n_clusters: + break + + temp_center = random_state.random(dims) * (self.bounds_processed[:, 0] - 2 * cluster_proximity) + \ + self.bounds_processed[:, 1] + cluster_proximity + + if cluster == 0: + centers[0, :] = temp_center + cluster += 1 + continue + + min_distance = ((centers[:cluster, :] - temp_center) ** 2).sum(axis=1).min() + + if np.sqrt(min_distance) >= 2 * cluster_proximity: + centers[cluster, :] = temp_center + cluster += 1 + retry = 0 + else: + retry += 1 + + if cluster >= self.n_clusters: + return centers + + cluster_proximity /= 2.0 + + return None + + def _distances_labels(self, X, centers): + distances = np.zeros((X.shape[0], self.n_clusters)) + + for cluster in range(self.n_clusters): + distances[:, cluster] = ((X - centers[cluster, :]) ** 2).sum(axis=1) + + labels = np.argmin(distances, axis=1) + return distances, labels + + def _update_centers(self, X, centers, labels, dims, total_iters, random_state): + """Updates the centers of the KMeans algorithm for the current iteration, while satisfying differential + privacy. 
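A stand-alone sketch of one cluster's centre update (an editor's illustration, not part of the diff): the mechanism signatures mirror those used in the body of this method, while the cluster points, bounds and per-mechanism epsilons are hypothetical.

import numpy as np
from diffprivlib.mechanisms import GeometricFolded, LaplaceBoundedDomain

X_cluster = np.random.random((20, 2))    # hypothetical points assigned to one cluster
lower, upper = np.zeros(2), np.ones(2)   # hypothetical per-dimension bounds

# Integer-valued noise on the count of points in the cluster
geo = GeometricFolded(epsilon=0.5, sensitivity=1, lower=0.5, upper=float("inf"))
noisy_count = geo.randomise(X_cluster.shape[0])

# Real-valued, domain-bounded noise on the per-dimension sums
noisy_sum = np.zeros(2)
for i in range(2):
    lap = LaplaceBoundedDomain(epsilon=0.5, sensitivity=upper[i] - lower[i],
                               lower=noisy_count * lower[i], upper=noisy_count * upper[i])
    noisy_sum[i] = lap.randomise(X_cluster[:, i].sum())

centre = noisy_sum / noisy_count         # the cluster's updated centre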
+ + Differential privacy is satisfied by adding (integer-valued, using :class:`.GeometricFolded`) random noise to + the count of nearest neighbours to the previous cluster centers, and adding (real-valued, using + :class:`.LaplaceBoundedDomain`) random noise to the sum of values per dimension. + + """ + epsilon_0, epsilon_i = self._split_epsilon(dims, total_iters) + geometric_mech = GeometricFolded(epsilon=epsilon_0, sensitivity=1, lower=0.5, upper=float("inf"), + random_state=random_state) + + for cluster in range(self.n_clusters): + if cluster not in labels: + continue + + cluster_count = sum(labels == cluster) + noisy_count = geometric_mech.randomise(cluster_count) + + cluster_sum = np.sum(X[labels == cluster], axis=0) + noisy_sum = np.zeros_like(cluster_sum) + + for i in range(dims): + laplace_mech = LaplaceBoundedDomain(epsilon=epsilon_i, + sensitivity=self.bounds[1][i] - self.bounds[0][i], + lower=noisy_count * self.bounds[0][i], + upper=noisy_count * self.bounds[1][i], random_state=random_state) + noisy_sum[i] = laplace_mech.randomise(cluster_sum[i]) + + centers[cluster, :] = noisy_sum / noisy_count + + return centers + + def _split_epsilon(self, dims, total_iters, rho=0.225): + """Split epsilon between sum perturbation and count perturbation, as proposed by Su et al. + + Parameters + ---------- + dims : int + Number of dimensions to split `epsilon` across. + + total_iters : int + Total number of iterations to split `epsilon` across. + + rho : float, default: 0.225 + Coordinate normalisation factor. + + Returns + ------- + epsilon_0 : float + The epsilon value for satisfying differential privacy on the count of a cluster. + + epsilon_i : float + The epsilon value for satisfying differential privacy on each dimension of the center of a cluster. + + """ + epsilon_i = 1 + epsilon_0 = np.cbrt(4 * dims * rho ** 2) + + normaliser = self.epsilon / total_iters / (epsilon_i * dims + epsilon_0) + + return epsilon_i * normaliser, epsilon_0 * normaliser + + def _calc_iters(self, n_dims, n_samples, rho=0.225): + """Calculate the number of iterations to allow for the KMeans algorithm.""" + + epsilon_m = np.sqrt(500 * (self.n_clusters ** 3) / (n_samples ** 2) * + (n_dims + np.cbrt(4 * n_dims * (rho ** 2))) ** 3) + + iters = max(min(self.epsilon / epsilon_m, 7), 2) + + return int(iters) diff --git a/privbayes-synthesizer/code/diffprivlib/models/linear_regression.py b/privbayes-synthesizer/code/diffprivlib/models/linear_regression.py new file mode 100644 index 0000000..5178c69 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/linear_regression.py @@ -0,0 +1,318 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# New BSD License +# +# Copyright (c) 2007–2019 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. +# b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# c. Neither the name of the Scikit-learn Developers nor the names of its contributors may be used to endorse or +# promote products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
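Before the module body, a hedged sketch of the functional-mechanism idea implemented by ``_construct_regression_obj`` below: the squared-error objective is expanded into monomial coefficients, and each coefficient is perturbed with (folded) Laplace noise before the objective is minimised. The mechanism signatures follow the code below; the data, bounds and budget split are hypothetical, assuming features and targets in [0, 1].

import numpy as np
from diffprivlib.mechanisms import Laplace, LaplaceFolded

X = np.random.random((30, 2))   # hypothetical features in [0, 1]
y = np.random.random(30)        # hypothetical targets in [0, 1]

# Sufficient statistics (monomial coefficients) of the least-squares objective
coef0 = (y ** 2).sum()
coef1 = X.T @ y
coef2 = X.T @ X

# Budget split across the 1 + d + d(d+1)/2 coefficients (d = 2 features, 1 target)
eps = 1.0 / (1 + 2 + 3)

noisy0 = LaplaceFolded(epsilon=eps, sensitivity=1.0, lower=0,
                       upper=float("inf")).randomise(coef0)
noisy1 = np.array([Laplace(epsilon=eps, sensitivity=1.0).randomise(c) for c in coef1])
# The degree-2 coefficients are perturbed likewise: LaplaceFolded on the diagonal,
# Laplace off the diagonal, with symmetry enforced afterwards.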
+# +""" +Linear Regression with differential privacy +""" +import warnings + +import numpy as np +import sklearn.linear_model as sk_lr +from scipy.optimize import minimize +from sklearn.utils import check_array +from sklearn.utils.validation import FLOAT_DTYPES + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import Laplace, LaplaceFolded +from diffprivlib.tools import mean +from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning, check_random_state +from diffprivlib.validation import check_bounds, clip_to_bounds, DiffprivlibMixin + + +# noinspection PyPep8Naming +def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None, copy=True, check_input=True, + random_state=None, **unused_args): + warn_unused_args(unused_args) + + random_state = check_random_state(random_state) + + if check_input: + X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES) + elif copy: + X = X.copy(order='K') + + y = np.asarray(y, dtype=X.dtype) + X_scale = np.ones(X.shape[1], dtype=X.dtype) + + if fit_intercept: + bounds_X = check_bounds(bounds_X, X.shape[1]) + bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1) + + X = clip_to_bounds(X, bounds_X) + y = clip_to_bounds(y, bounds_y) + + X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon, random_state=random_state, + accountant=BudgetAccountant()) + X -= X_offset + y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon, random_state=random_state, + accountant=BudgetAccountant()) + y = y - y_offset + else: + X_offset = np.zeros(X.shape[1], dtype=X.dtype) + if y.ndim == 1: + y_offset = X.dtype.type(0) + else: + y_offset = np.zeros(y.shape[1], dtype=X.dtype) + + return X, y, X_offset, y_offset, X_scale + + +def _construct_regression_obj(X, y, bounds_X, bounds_y, epsilon, alpha, random_state): + if y.ndim == 1: + y = y.reshape(-1, 1) + + n_features = X.shape[1] + n_targets = y.shape[1] + + local_epsilon = epsilon / (1 + n_targets * n_features + n_features * (n_features + 1) / 2) + coefs = ((y ** 2).sum(axis=0), np.einsum('ij,ik->jk', X, y), np.einsum('ij,ik', X, X)) + + del X, y + + def get_max_sensitivity(y_lower, y_upper, x_lower, x_upper): + corners = [y_lower * x_lower, y_lower * x_upper, y_upper * x_lower, y_upper * x_upper] + return np.max(corners) - np.min(corners) + + # Randomise 0th-degree monomial coefficients + mono_coef_0 = np.zeros(n_targets) + + for i in range(n_targets): + sensitivity = np.abs([bounds_y[0][i], bounds_y[1][i]]).max() ** 2 + mech = LaplaceFolded(epsilon=local_epsilon, sensitivity=sensitivity, lower=0, upper=float("inf"), + random_state=random_state) + mono_coef_0[i] = mech.randomise(coefs[0][i]) + + # Randomise 1st-degree monomial coefficients + mono_coef_1 = np.zeros((n_features, n_targets)) + + for i in range(n_targets): + for j in range(n_features): + sensitivity = get_max_sensitivity(bounds_y[0][i], bounds_y[1][i], bounds_X[0][j], bounds_X[1][j]) + mech = Laplace(epsilon=local_epsilon, sensitivity=sensitivity, random_state=random_state) + mono_coef_1[j, i] = mech.randomise(coefs[1][j, i]) + + # Randomise 2nd-degree monomial coefficients + mono_coef_2 = np.zeros((n_features, n_features)) + + for i in range(n_features): + sensitivity = np.max(np.abs([bounds_X[0][i], bounds_X[0][i]])) ** 2 + mech = LaplaceFolded(epsilon=local_epsilon, sensitivity=sensitivity, lower=0, upper=float("inf"), + random_state=random_state) + mono_coef_2[i, i] = mech.randomise(coefs[2][i, i]) + + for j in range(i + 1, n_features): + sensitivity = 
get_max_sensitivity(bounds_X[0][i], bounds_X[1][i], bounds_X[0][j], bounds_X[1][j]) + mech = Laplace(epsilon=local_epsilon, sensitivity=sensitivity, random_state=random_state) + mono_coef_2[i, j] = mech.randomise(coefs[2][i, j]) + mono_coef_2[j, i] = mono_coef_2[i, j] # Enforce symmetry + + del coefs + noisy_coefs = (mono_coef_0, mono_coef_1, mono_coef_2) + + def obj(idx): + def inner_obj(omega): + func = noisy_coefs[0][idx] + func -= 2 * np.dot(noisy_coefs[1][:, idx], omega) + func += np.multiply(noisy_coefs[2], np.tensordot(omega, omega, axes=0)).sum() + func += alpha * (omega ** 2).sum() + + grad = - 2 * noisy_coefs[1][:, idx] + 2 * np.matmul(noisy_coefs[2], omega) + 2 * omega * alpha + + return func, grad + + return inner_obj + + output = tuple(obj(i) for i in range(n_targets)) + + return output, noisy_coefs + + +# noinspection PyPep8Naming,PyAttributeOutsideInit +class LinearRegression(sk_lr.LinearRegression, DiffprivlibMixin): + r""" + Ordinary least squares Linear Regression with differential privacy. + + LinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares + between the observed targets in the dataset, and the targets predicted by the linear approximation. Differential + privacy is guaranteed with respect to the training sample. + + Differential privacy is achieved by adding noise to the coefficients of the objective function, taking inspiration + from [ZZX12]_. + + Parameters + ---------- + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds_X : tuple + Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + bounds_y : tuple + Same as `bounds_X`, but for the training label set `y`. + + fit_intercept : bool, default: True + Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default: True + If True, X will be copied; else, it may be overwritten. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the linear regression problem. If multiple targets are passed during the fit (y 2D), + this is a 2D array of shape (n_targets, n_features), while if only one target is passed, this is a 1D array of + length n_features. + + intercept_ : float or array of shape of (n_targets,) + Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. + + References + ---------- + .. [ZZX12] Zhang, Jun, Zhenjie Zhang, Xiaokui Xiao, Yin Yang, and Marianne Winslett. "Functional mechanism: + regression analysis under differential privacy." arXiv preprint arXiv:1208.0219 (2012). 
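As a usage illustration (an editor's sketch, not part of the diff): ``bounds_X`` and ``bounds_y`` should be supplied to avoid a :class:`.PrivacyLeakWarning`; the data and bounds here are hypothetical.

import numpy as np
from diffprivlib.models import LinearRegression

X = np.random.random((50, 2))          # hypothetical features in [0, 1]
y = X @ np.array([1.0, -0.5]) + 0.1    # hypothetical targets, all within [-0.4, 1.1]
reg = LinearRegression(epsilon=1.0, bounds_X=(0, 1), bounds_y=(-0.5, 1.5))
reg.fit(X, y)
print(reg.coef_, reg.intercept_)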
+ + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + sk_lr.LinearRegression, "fit_intercept", "copy_X") + + def __init__(self, *, epsilon=1.0, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True, random_state=None, + accountant=None, **unused_args): + super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=None) + + self.epsilon = epsilon + self.bounds_X = bounds_X + self.bounds_y = bounds_y + self.random_state = random_state + self.accountant = BudgetAccountant.load_default(accountant) + + self._warn_unused_args(unused_args) + + def fit(self, X, y, sample_weight=None): + """ + Fit linear model. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Training data + + y : array_like, shape (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary + + sample_weight : ignored + Ignored by diffprivlib. Present for consistency with sklearn API. + + Returns + ------- + self : returns an instance of self. + + """ + self._validate_params() + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + random_state = check_random_state(self.random_state) + + X, y = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=True) + + if self.bounds_X is None or self.bounds_y is None: + warnings.warn( + "Bounds parameters haven't been specified, so falling back to determining bounds from the " + "data.\n" + "This will result in additional privacy leakage. To ensure differential privacy with no " + "additional privacy loss, specify `bounds_X` and `bounds_y`.", + PrivacyLeakWarning) + + if self.bounds_X is None: + self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0)) + if self.bounds_y is None: + self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0)) + + # pylint: disable=no-member + self.bounds_X = self._check_bounds(self.bounds_X, X.shape[1]) + self.bounds_y = self._check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1) + + n_features = X.shape[1] + n_targets = y.shape[1] if y.ndim > 1 else 1 + epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0 + + X, y, X_offset, y_offset, X_scale = self._preprocess_data( + X, y, fit_intercept=self.fit_intercept, bounds_X=self.bounds_X, bounds_y=self.bounds_y, + epsilon=self.epsilon * epsilon_intercept_scale, copy=self.copy_X, random_state=random_state) + + bounds_X = (self.bounds_X[0] - X_offset, self.bounds_X[1] - X_offset) + bounds_y = (self.bounds_y[0] - y_offset, self.bounds_y[1] - y_offset) + + objs, obj_coefs = _construct_regression_obj( + X, y, bounds_X, bounds_y, epsilon=self.epsilon * (1 - epsilon_intercept_scale), alpha=0, + random_state=random_state) + coef = np.zeros((n_features, n_targets)) + + for i, obj in enumerate(objs): + opt_result = minimize(obj, np.zeros(n_features), jac=True) + coef[:, i] = opt_result.x + + self.coef_ = coef.T + self._obj_coefs = obj_coefs + + if y.ndim == 1: + self.coef_ = np.ravel(self.coef_) + self._set_intercept(X_offset, y_offset, X_scale) + + self.accountant.spend(self.epsilon, 0) + + return self + + _preprocess_data = staticmethod(_preprocess_data) diff --git a/privbayes-synthesizer/code/diffprivlib/models/logistic_regression.py b/privbayes-synthesizer/code/diffprivlib/models/logistic_regression.py new file mode 100644 index 0000000..112ea9f --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/logistic_regression.py @@ -0,0 +1,425 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# New BSD License +# +# Copyright (c) 2007–2019 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. +# b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# c. Neither the name of the Scikit-learn Developers nor the names of its contributors may be used to endorse or +# promote products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +""" +Logistic Regression classifier satisfying differential privacy. 
+""" +import numbers +import warnings + +import numpy as np +from joblib import delayed, Parallel +from scipy import optimize +from sklearn.exceptions import ConvergenceWarning +from sklearn import linear_model +from sklearn.utils import check_array, check_consistent_length +from sklearn.utils.multiclass import check_classification_targets + +# todo: Remove when sklearn v1.1.0 is min requirement +try: + from sklearn.linear_model._linear_loss import LinearModelLoss + from sklearn._loss import HalfBinomialLoss + SKL_LOSS_MODULE = True +except (ModuleNotFoundError, ImportError): + from sklearn.linear_model._logistic import _logistic_loss_and_grad + SKL_LOSS_MODULE = False + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import Vector +from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args, check_random_state +from diffprivlib.validation import DiffprivlibMixin + + +class LogisticRegression(linear_model.LogisticRegression, DiffprivlibMixin): + r"""Logistic Regression (aka logit, MaxEnt) classifier with differential privacy. + + This class implements regularised logistic regression using :ref:`Scipy's L-BFGS-B algorithm + `. :math:`\epsilon`-Differential privacy is achieved relative to the maximum norm + of the data, as determined by `data_norm`, by the :class:`.Vector` mechanism, which adds a Laplace-distributed + random vector to the objective. Adapted from the work presented in [CMS11]_. + + This class is a child of :obj:`sklearn.linear_model.LogisticRegression`, with amendments to allow for the + implementation of differential privacy. Some parameters of `Scikit Learn`'s model have therefore had to be fixed, + including: + + - The only permitted solver is 'lbfgs'. Specifying the ``solver`` option will result in a warning. + - Consequently, the only permitted penalty is 'l2'. Specifying the ``penalty`` option will result in a warning. + - In the multiclass case, only the one-vs-rest (OvR) scheme is permitted. Specifying the ``multi_class`` option + will result in a warning. + + Parameters + ---------- + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + data_norm : float, optional + The max l2 norm of any row of the data. This defines the spread of data that will be protected by + differential privacy. + + If not specified, the max norm is taken from the data when ``.fit()`` is first called, but will result in a + :class:`.PrivacyLeakWarning`, as it reveals information about the data. To preserve differential privacy fully, + `data_norm` should be selected independently of the data, i.e. with domain knowledge. + + tol : float, default: 1e-4 + Tolerance for stopping criteria. + + C : float, default: 1.0 + Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values + specify stronger regularization. + + fit_intercept : bool, default: True + Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. + + max_iter : int, default: 100 + Maximum number of iterations taken for the solver to converge. For smaller `epsilon` (more noise), `max_iter` + may need to be increased. + + verbose : int, default: 0 + Set to any positive number for verbosity. + + warm_start : bool, default: False + When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase + the previous solution. + + n_jobs : int, optional + Number of CPU cores used when parallelising over classes. ``None`` means 1 unless in a context. 
``-1`` means + using all processors. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + classes_ : array, shape (n_classes, ) + A list of class labels known to the classifier. + + coef_ : array, shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + `coef_` is of shape (1, n_features) when the given problem is binary. + + intercept_ : array, shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. `intercept_` is of shape (1,) when the + given problem is binary. + + n_iter_ : array, shape (n_classes,) or (1, ) + Actual number of iterations for all classes. If binary, it returns only 1 element. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from diffprivlib.models import LogisticRegression + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(data_norm=12, epsilon=2).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + >>> clf.predict_proba(X[:2, :]) + array([[7.35362932e-01, 2.16667422e-14, 2.64637068e-01], + [9.08384378e-01, 3.47767052e-13, 9.16156215e-02]]) + >>> clf.score(X, y) + 0.5266666666666666 + + See also + -------- + sklearn.linear_model.LogisticRegression : The implementation of logistic regression in scikit-learn, upon which this + implementation is built. + .Vector : The mechanism used by the model to achieve differential privacy. + + References + ---------- + .. [CMS11] Chaudhuri, Kamalika, Claire Monteleoni, and Anand D. Sarwate. "Differentially private empirical risk + minimization." Journal of Machine Learning Research 12, no. Mar (2011): 1069-1109. + + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + linear_model.LogisticRegression, "tol", "C", "fit_intercept", "max_iter", "verbose", "warm_start", "n_jobs", + "random_state") + + def __init__(self, *, epsilon=1.0, data_norm=None, tol=1e-4, C=1.0, fit_intercept=True, max_iter=100, verbose=0, + warm_start=False, n_jobs=None, random_state=None, accountant=None, **unused_args): + super().__init__(penalty='l2', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=1.0, + class_weight=None, random_state=random_state, solver='lbfgs', max_iter=max_iter, + multi_class='ovr', verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) + self.epsilon = epsilon + self.data_norm = data_norm + self.classes_ = None + self.accountant = BudgetAccountant.load_default(accountant) + + self._warn_unused_args(unused_args) + + # noinspection PyAttributeOutsideInit + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and n_features is the number of features. + + y : array-like, shape (n_samples,) + Target vector relative to X. + + sample_weight : ignored + Ignored by diffprivlib. Present for consistency with sklearn API. 
+ + Returns + ------- + self : class + + """ + self._validate_params() + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + random_state = check_random_state(self.random_state) + + # Todo: Remove when scikit-learn v1.2 is a min requirement + if not isinstance(self.C, numbers.Real) or self.C < 0: + raise ValueError(f"Penalty term must be positive; got (C={self.C})") + if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0: + raise ValueError(f"Maximum number of iterations must be positive; got (max_iter={self.max_iter})") + if not isinstance(self.tol, numbers.Real) or self.tol < 0: + raise ValueError(f"Tolerance for stopping criteria must be positive; got (tol={self.tol})") + + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=float, order="C", + accept_large_sparse=True) + check_classification_targets(y) + self.classes_ = np.unique(y) + _, n_features = X.shape + + if self.data_norm is None: + warnings.warn("Data norm has not been specified and will be calculated on the data provided.  This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning) + self.data_norm = np.linalg.norm(X, axis=1).max() + + X = self._clip_to_norm(X, self.data_norm) + + n_classes = len(self.classes_) + classes_ = self.classes_ + if n_classes < 2: + raise ValueError("This solver needs samples of at least 2 classes in the data, but the data contains only " + f"one class: {classes_[0]}") + + if len(self.classes_) == 2: + n_classes = 1 + classes_ = classes_[1:] + + if self.warm_start: + warm_start_coef = getattr(self, 'coef_', None) + else: + warm_start_coef = None + if warm_start_coef is not None and self.fit_intercept: + warm_start_coef = np.append(warm_start_coef, self.intercept_[:, np.newaxis], axis=1) + + self.coef_ = [] + self.intercept_ = np.zeros(n_classes) + + if warm_start_coef is None: + warm_start_coef = [None] * n_classes + + path_func = delayed(_logistic_regression_path) + + fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer='processes')( + path_func(X, y, epsilon=self.epsilon / n_classes, data_norm=self.data_norm, pos_class=class_, Cs=[self.C], + fit_intercept=self.fit_intercept, max_iter=self.max_iter, tol=self.tol, verbose=self.verbose, + coef=warm_start_coef_, random_state=random_state, check_input=False) + for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) + + fold_coefs_, _, n_iter_ = zip(*fold_coefs_) + self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] + + self.coef_ = np.asarray(fold_coefs_) + self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) + + if self.fit_intercept: + self.intercept_ = self.coef_[:, -1] + self.coef_ = self.coef_[:, :-1] + + self.accountant.spend(self.epsilon, 0) + + return self + + +def _logistic_regression_path(X, y, epsilon, data_norm, pos_class=None, Cs=10, fit_intercept=True, max_iter=100, + tol=1e-4, verbose=0, coef=None, random_state=None, check_input=True, **unused_args): + """Compute a Logistic Regression model with differential privacy for a list of regularization parameters.  Takes + inspiration from ``_logistic_regression_path`` in scikit-learn, specialised to the LBFGS solver and one-vs-rest + multiclass fitting. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Input data.
+ + y : array-like, shape (n_samples,) or (n_samples, n_targets) + Input data, target values. + + epsilon : float + Privacy parameter for differential privacy. + + data_norm : float + Max norm of the data for which differential privacy is satisfied. + + pos_class : int, optional + The class with respect to which we perform a one-vs-all fit. If None, then it is assumed that the given problem + is binary. + + Cs : int | array-like, shape (n_cs,), default: 10 + List of values for the regularization parameter or integer specifying the number of regularization parameters + that should be used. In this case, the parameters will be chosen in a logarithmic scale between 1e-4 and 1e4. + + fit_intercept : bool, default: True + Whether to fit an intercept for the model. In this case the shape of the returned array is + (n_cs, n_features + 1). + + max_iter : int, default: 100 + Maximum number of iterations for the solver. + + tol : float, default: 1e-4 + Stopping criterion. For the newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, + ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient. + + verbose : int, default: 0 + For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. + + coef : array-like, shape (n_features,), optional + Initialization value for coefficients of logistic regression. Useless for liblinear solver. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + check_input : bool, default: True + If False, the input arrays X and y will not be checked. + + Returns + ------- + coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1) + List of coefficients for the Logistic Regression model. If fit_intercept is set to True then the second + dimension will be n_features + 1, where the last item represents the intercept. For + ``multiclass='multinomial'``, the shape is (n_classes, n_cs, n_features) or (n_classes, n_cs, n_features + 1). + + Cs : ndarray + Grid of Cs used for cross-validation. + + n_iter : array, shape (n_cs,) + Actual number of iteration for each Cs. + + """ + warn_unused_args(unused_args) + + random_state = check_random_state(random_state) + + if isinstance(Cs, numbers.Integral): + Cs = np.logspace(-4, 4, int(Cs)) + + # Data norm increases if intercept is included + if fit_intercept: + data_norm = np.sqrt(data_norm ** 2 + 1) + + # Pre-processing. + if check_input: + X = check_array(X, accept_sparse='csr', dtype=np.float64, accept_large_sparse=True) + y = check_array(y, ensure_2d=False, dtype=None) + check_consistent_length(X, y) + _, n_features = X.shape + + classes = np.unique(y) + + if pos_class is None: + if classes.size > 2: + raise ValueError('To fit OvR, use the pos_class argument') + # np.unique(y) gives labels in sorted order. + pos_class = classes[1] + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # For doing a ovr, we need to mask the labels first. 
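+    # Note on the encoding below: the newer scikit-learn loss machinery
+    # (LinearModelLoss with HalfBinomialLoss) expects binary labels in {0, 1},
+    # while the legacy _logistic_loss_and_grad expects {-1, +1};
+    # SKL_LOSS_MODULE selects between the two conventions.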
+ output_vec = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) + mask = (y == pos_class) + y_bin = np.ones(y.shape, dtype=X.dtype) + y_bin[~mask] = 0.0 if SKL_LOSS_MODULE else -1.0 + + if coef is not None: + # it must work both giving the bias term and not + if coef.size not in (n_features, output_vec.size): + raise ValueError(f"Initialization coef is of shape {coef.size}, expected shape {n_features} or " + f"{output_vec.size}") + output_vec[:coef.size] = coef + + target = y_bin + + if SKL_LOSS_MODULE: + func = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept).loss_gradient + else: + func = _logistic_loss_and_grad + + coefs = [] + n_iter = np.zeros(len(Cs), dtype=np.int32) + for i, C in enumerate(Cs): + vector_mech = Vector(epsilon=epsilon, dimension=n_features + int(fit_intercept), alpha=1. / C, + function_sensitivity=0.25, data_sensitivity=data_norm, random_state=random_state) + noisy_logistic_loss = vector_mech.randomise(func) + + args = (X, target, sample_weight, 1. / C) if SKL_LOSS_MODULE else (X, target, 1. / C, sample_weight) + + iprint = [-1, 50, 1, 100, 101][np.searchsorted(np.array([0, 1, 2, 3]), verbose)] + output_vec, _, info = optimize.fmin_l_bfgs_b(noisy_logistic_loss, output_vec, fprime=None, + args=args, iprint=iprint, pgtol=tol, maxiter=max_iter) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge. Increase the number of iterations.", ConvergenceWarning) + + coefs.append(output_vec.copy()) + + n_iter[i] = info['nit'] + + return np.array(coefs), np.array(Cs), n_iter diff --git a/privbayes-synthesizer/code/diffprivlib/models/naive_bayes.py b/privbayes-synthesizer/code/diffprivlib/models/naive_bayes.py new file mode 100644 index 0000000..290d4ac --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/naive_bayes.py @@ -0,0 +1,305 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
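A minimal usage sketch for the differentially private LogisticRegression defined above (the dataset and the epsilon/data_norm values are illustrative assumptions, not recommendations). With three classes, fit() above gives each one-vs-rest sub-problem a budget of epsilon / n_classes:

    import numpy as np
    from diffprivlib.models import LogisticRegression

    rng = np.random.default_rng(0)
    X = rng.normal(size=(150, 4))
    # Normalise rows so every sample has l2 norm at most 1.
    X /= np.maximum(1.0, np.linalg.norm(X, axis=1, keepdims=True))
    y = np.repeat([0, 1, 2], 50)

    # data_norm=1 matches the normalisation above, so no PrivacyLeakWarning
    # is raised; each of the three one-vs-rest fits receives epsilon = 1.
    clf = LogisticRegression(epsilon=3.0, data_norm=1.0).fit(X, y)
    print(clf.coef_.shape)  # (3, 4): one noisy coefficient vector per class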
+""" +Gaussian Naive Bayes classifier satisfying differential privacy +""" +import warnings + +import numpy as np +import sklearn.naive_bayes as sk_nb +from sklearn.utils.multiclass import _check_partial_fit_first_call + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricTruncated, LaplaceTruncated +from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args, check_random_state +from diffprivlib.validation import DiffprivlibMixin + + +class GaussianNB(sk_nb.GaussianNB, DiffprivlibMixin): + r"""Gaussian Naive Bayes (GaussianNB) with differential privacy + + Inherits the :class:`sklearn.naive_bayes.GaussianNB` class from Scikit Learn and adds noise to satisfy differential + privacy to the learned means and variances. Adapted from the work presented in [VSB13]_. + + Parameters + ---------- + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon` for the model. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + priors : array-like, shape (n_classes,) + Prior probabilities of the classes. If specified the priors are not adjusted according to the data. + + var_smoothing : float, default: 1e-9 + Portion of the largest variance of all features that is added to variances for calculation stability. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + class_prior_ : array, shape (n_classes,) + probability of each class. + + class_count_ : array, shape (n_classes,) + number of training samples observed in each class. + + theta_ : array, shape (n_classes, n_features) + mean of each feature per class + + var_ : array, shape (n_classes, n_features) + variance of each feature per class + + epsilon_ : float + absolute additive value to variances (unrelated to ``epsilon`` parameter for differential privacy) + + References + ---------- + .. [VSB13] Vaidya, Jaideep, Basit Shafiq, Anirban Basu, and Yuan Hong. "Differentially private naive bayes + classification." In 2013 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent + Agent Technologies (IAT), vol. 1, pp. 571-576. IEEE, 2013. + + """ + + def __init__(self, *, epsilon=1.0, bounds=None, priors=None, var_smoothing=1e-9, random_state=None, + accountant=None): + super().__init__(priors=priors, var_smoothing=var_smoothing) + + self.epsilon = epsilon + self.bounds = bounds + self.random_state = random_state + self.accountant = BudgetAccountant.load_default(accountant) + + def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None): + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + warn_unused_args("sample_weight") + + random_state = check_random_state(self.random_state) + + X, y = self._validate_data(X, y) + + if self.bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. 
To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + + self.bounds = self._check_bounds(self.bounds, shape=X.shape[1]) + X = self._clip_to_bounds(X, self.bounds) + + self.epsilon_ = self.var_smoothing + + if _refit: + self.classes_ = None + + if _check_partial_fit_first_call(self, classes): + n_features = X.shape[1] + n_classes = len(self.classes_) + self.theta_ = np.zeros((n_classes, n_features)) + self.var_ = np.zeros((n_classes, n_features)) + + self.class_count_ = np.zeros(n_classes, dtype=np.float64) + + if self.priors is not None: + priors = np.asarray(self.priors) + + if len(priors) != n_classes: + raise ValueError("Number of priors must match number of classes.") + if not np.isclose(priors.sum(), 1.0): + raise ValueError("The sum of the priors should be 1.") + if (priors < 0).any(): + raise ValueError("Priors must be non-negative.") + self.class_prior_ = priors + else: + # Initialize the priors to zeros for each class + self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64) + else: + if X.shape[1] != self.theta_.shape[1]: + raise ValueError(f"Number of features {X.shape[1]} does not match previous " + f"data {self.theta_.shape[1]}.") + # Put epsilon back in each time + self.var_[:, :] -= self.epsilon_ + + classes = self.classes_ + + unique_y = np.unique(y) + unique_y_in_classes = np.in1d(unique_y, classes) + + if not np.all(unique_y_in_classes): + raise ValueError(f"The target label(s) {unique_y[~unique_y_in_classes]} in y do not exist in the initial " + f"classes {classes}") + + noisy_class_counts = self._noisy_class_counts(y, random_state=random_state) + + for _i, y_i in enumerate(unique_y): + i = classes.searchsorted(y_i) + X_i = X[y == y_i, :] + + n_i = noisy_class_counts[_i] + + new_theta, new_var = self._update_mean_variance(self.class_count_[i], self.theta_[i, :], self.var_[i, :], + X_i, random_state=random_state, n_noisy=n_i) + + self.theta_[i, :] = new_theta + self.var_[i, :] = new_var + self.class_count_[i] += n_i + + self.var_[:, :] += self.epsilon_ + + # Update if only no priors is provided + if self.priors is None: + # Empirical prior, with sample_weight taken into account + self.class_prior_ = self.class_count_ / self.class_count_.sum() + + self.accountant.spend(self.epsilon, 0) + + return self + + def _update_mean_variance(self, n_past, mu, var, X, random_state, sample_weight=None, n_noisy=None): + """Compute online update of Gaussian mean and variance. + + Given starting sample count, mean, and variance, a new set of points X return the updated mean and variance. + (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance). + + Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of + independent Gaussians. + + See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque: + + http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf + + Parameters + ---------- + n_past : int + Number of samples represented in old mean and variance. If sample weights were given, this should contain + the sum of sample weights represented in old mean and variance. + + mu : array-like, shape (number of Gaussians,) + Means for Gaussians in original set. + + var : array-like, shape (number of Gaussians,) + Variances for Gaussians in original set. + + random_state : RandomState + Controls the randomness of the model. 
To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + sample_weight : ignored + Ignored in diffprivlib. + + n_noisy : int, optional + Noisy count of the given class, satisfying differential privacy. + + Returns + ------- + total_mu : array-like, shape (number of Gaussians,) + Updated mean for each Gaussian over the combined set. + + total_var : array-like, shape (number of Gaussians,) + Updated variance for each Gaussian over the combined set. + """ + if n_noisy is None: + warnings.warn("Noisy class count has not been specified and will be read from the data. To use this " + "method correctly, make sure it is run by the parent GaussianNB class.", PrivacyLeakWarning) + n_noisy = X.shape[0] + + if not n_noisy: + return mu, var + + if sample_weight is not None: + warn_unused_args("sample_weight") + + # Split epsilon between each feature, using 1/3 of total budget for each of mean and variance + n_features = X.shape[1] + local_epsilon = self.epsilon / 3 / n_features + + new_mu = np.zeros((n_features,)) + new_var = np.zeros((n_features,)) + + for feature in range(n_features): + temp_x = X[:, feature] + lower, upper = self.bounds[0][feature], self.bounds[1][feature] + local_diameter = upper - lower + + mech_mu = LaplaceTruncated(epsilon=local_epsilon, delta=0, sensitivity=local_diameter, + lower=lower * n_noisy, upper=upper * n_noisy, random_state=random_state) + _mu = mech_mu.randomise(temp_x.sum()) / n_noisy + + local_sq_sens = max(_mu - lower, upper - _mu) ** 2 + mech_var = LaplaceBoundedDomain(epsilon=local_epsilon, delta=0, sensitivity=local_sq_sens, lower=0, + upper=local_sq_sens * n_noisy, random_state=random_state) + _var = mech_var.randomise(((temp_x - _mu) ** 2).sum()) / n_noisy + + new_mu[feature] = _mu + new_var[feature] = _var + + if n_past == 0: + return new_mu, new_var + + n_total = float(n_past + n_noisy) + + # Combine mean of old and new data, taking into consideration + # (weighted) number of observations + total_mu = (n_noisy * new_mu + n_past * mu) / n_total + + # Combine variance of old and new data, taking into consideration + # (weighted) number of observations. 
This is achieved by combining + # the sum-of-squared-differences (ssd) + old_ssd = n_past * var + new_ssd = n_noisy * new_var + total_ssd = old_ssd + new_ssd + (n_past / float(n_noisy * n_total)) * (n_noisy * mu - n_noisy * new_mu) ** 2 + total_var = total_ssd / n_total + + return total_mu, total_var + + def _noisy_class_counts(self, y, random_state): + unique_y = np.unique(y) + n_total = y.shape[0] + + # Use 1/3 of total epsilon budget for getting noisy class counts + mech = GeometricTruncated(epsilon=self.epsilon / 3, sensitivity=1, lower=1, upper=n_total, + random_state=random_state) + noisy_counts = np.array([mech.randomise((y == y_i).sum()) for y_i in unique_y]) + + argsort = np.argsort(noisy_counts) + i = 0 if noisy_counts.sum() > n_total else len(unique_y) - 1 + + while np.sum(noisy_counts) != n_total: + _i = argsort[i] + sgn = np.sign(n_total - noisy_counts.sum()) + noisy_counts[_i] = np.clip(noisy_counts[_i] + sgn, 1, n_total) + + i = (i - sgn) % len(unique_y) + + return noisy_counts + + @property + def sigma_(self): + """Variance of each feature per class.""" + # Todo: Consider removing when sklearn v1.0 is required + return self.var_ diff --git a/privbayes-synthesizer/code/diffprivlib/models/pca.py b/privbayes-synthesizer/code/diffprivlib/models/pca.py new file mode 100644 index 0000000..e4f5de6 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/pca.py @@ -0,0 +1,288 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# New BSD License +# +# Copyright (c) 2007–2019 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. +# b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# c. Neither the name of the Scikit-learn Developers nor the names of its contributors may be used to endorse or +# promote products derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +""" +Principal Component Analysis with differential privacy +""" +import warnings +from numbers import Integral + +import numpy as np +import sklearn.decomposition._pca as sk_pca +from sklearn.utils.extmath import stable_cumsum, svd_flip + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.models.utils import covariance_eig +from diffprivlib.tools import mean +from diffprivlib.utils import copy_docstring, PrivacyLeakWarning, check_random_state +from diffprivlib.validation import DiffprivlibMixin + + +# noinspection PyPep8Naming +class PCA(sk_pca.PCA, DiffprivlibMixin): + r"""Principal component analysis (PCA) with differential privacy. + + This class is a child of :obj:`sklearn.decomposition.PCA`, with amendments to allow for the implementation of + differential privacy as given in [IS16b]_. Some parameters of `Scikit Learn`'s model have therefore had to be + fixed, including: + + - The only permitted `svd_solver` is 'full'. Specifying the ``svd_solver`` option will result in a warning; + - The parameters ``tol`` and ``iterated_power`` are not applicable (as a consequence of fixing ``svd_solver = + 'full'``). + + Parameters + ---------- + n_components : int, float, None or str + Number of components to keep. + If n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + If ``n_components == 'mle'``, Minka's MLE is used to guess the dimension. + + If ``0 < n_components < 1``, select the number of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components. + + Hence, the None case results in:: + + n_components == min(n_samples, n_features) - 1 + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. If ``centered=False``, half of epsilon is used to calculate the + differentially private mean to center the data prior to the calculation of principal components. + + data_norm : float, optional + The max l2 norm of any row of the data. This defines the spread of data that will be protected by + differential privacy. + + If not specified, the max norm is taken from the data when ``.fit()`` is first called, but will result in a + :class:`.PrivacyLeakWarning`, as it reveals information about the data. To preserve differential privacy fully, + `data_norm` should be selected independently of the data, i.e. with domain knowledge. + + centered : bool, default: False + If False, the data will be centered before calculating the principal components. This will be calculated with + differential privacy, consuming privacy budget from epsilon. + + If True, the data is assumed to have been centered previously (e.g. 
using :class:`.StandardScaler`), and + therefore will not require the consumption of privacy budget to calculate the mean. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + copy : bool, default: True + If False, data passed to fit are overwritten and running fit(X).transform(X) will not yield the expected + results, use fit_transform(X) instead. + + whiten : bool, default: False + When True (False by default) the `components_` vectors are multiplied by the square root of n_samples and + then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal (the relative variance scales of the + components) but can sometime improve the predictive accuracy of the downstream estimators by making their + data respect some hard-wired assumptions. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + components_ : array, shape (n_components, n_features) + Principal axes in feature space, representing the directions of maximum variance in the data. The components + are sorted by ``explained_variance_``. + + explained_variance_ : array, shape (n_components,) + The amount of variance explained by each of the selected components. + + Equal to n_components largest eigenvalues of the covariance matrix of X. + + explained_variance_ratio_ : array, shape (n_components,) + Percentage of variance explained by each of the selected components. + + If ``n_components`` is not set then all components are stored and the sum of the ratios is equal to 1.0. + + singular_values_ : array, shape (n_components,) + The singular values corresponding to each of the selected components. The singular values are equal to the + 2-norms of the ``n_components`` variables in the lower-dimensional space. + + mean_ : array, shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + Equal to `X.mean(axis=0)`. + + n_components_ : int + The estimated number of components. When n_components is set to 'mle' or a number between 0 and 1 (with + svd_solver == 'full') this number is estimated from input data. Otherwise it equals the parameter + n_components, or the lesser value of n_features and n_samples if n_components is None. + + n_features_in_ : int + Number of features in the training data. + + n_samples_ : int + Number of samples in the training data. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model from Tipping and Bishop 1999. See + "Pattern Recognition and Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. It is required to compute the estimated data covariance and + score samples. + + Equal to the average of (min(n_features, n_samples) - n_components) smallest eigenvalues of the covariance + matrix of X. + + See Also + -------- + :obj:`sklearn.decomposition.PCA` : Scikit-learn implementation Principal Component Analysis. 
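+
+    Examples
+    --------
+    A minimal, illustrative fit; the ``data_norm`` and ``bounds`` values below
+    are assumptions for this sketch, chosen to cover the sampled data:
+
+    >>> import numpy as np
+    >>> from diffprivlib.models import PCA
+    >>> X = np.random.randn(100, 4)
+    >>> pca = PCA(n_components=2, epsilon=1.0, data_norm=5., bounds=(-4., 4.))
+    >>> X_dp = pca.fit_transform(X)
+    >>> X_dp.shape
+    (100, 2)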
+ + References + ---------- + .. [IS16b] Imtiaz, Hafiz, and Anand D. Sarwate. "Symmetric matrix perturbation for differentially-private principal + component analysis." In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), + pp. 2339-2343. IEEE, 2016. + """ + + _parameter_constraints = DiffprivlibMixin._copy_parameter_constraints( + sk_pca.PCA, "n_components", "copy", "whiten", "random_state") + + def __init__(self, n_components=None, *, epsilon=1.0, data_norm=None, centered=False, bounds=None, copy=True, + whiten=False, random_state=None, accountant=None, **unused_args): + super().__init__(n_components=n_components, copy=copy, whiten=whiten, svd_solver='full', tol=0.0, + iterated_power='auto', random_state=random_state) + self.centered = centered + self.epsilon = epsilon + self.data_norm = data_norm + self.bounds = bounds + self.accountant = BudgetAccountant.load_default(accountant) + + self._warn_unused_args(unused_args) + + # Todo: Remove when scikit-learn v1.2 is a min requirement + @property + def n_features_(self): + return self.n_features_in_ + + def _fit_full(self, X, n_components): + self.accountant.check(self.epsilon, 0) + + random_state = check_random_state(self.random_state) + + n_samples, n_features = X.shape + + if self.centered: + self.mean_ = np.zeros_like(np.mean(X, axis=0)) + else: + if self.bounds is None: + warnings.warn( + "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n" + "This will result in additional privacy leakage. To ensure differential privacy with no " + "additional privacy loss, specify `range` for each valued returned by np.mean().", + PrivacyLeakWarning) + + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + + self.bounds = self._check_bounds(self.bounds, n_features) + self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, random_state=random_state, + accountant=BudgetAccountant()) + + X -= self.mean_ + + if self.data_norm is None: + warnings.warn("Data norm has not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning) + self.data_norm = np.linalg.norm(X, axis=1).max() + + X = self._clip_to_norm(X, self.data_norm) + + sigma_vec, u_mtx = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2, + norm=self.data_norm, random_state=random_state, + dims=n_components if isinstance(n_components, Integral) else None) + u_mtx, _ = svd_flip(u_mtx, np.zeros_like(u_mtx).T) + sigma_vec = np.sqrt(sigma_vec) + + components_ = u_mtx.T + + # Get variance explained by singular values + explained_variance_ = np.sort((sigma_vec ** 2) / (n_samples - 1))[::-1] + total_var = explained_variance_.sum() + explained_variance_ratio_ = explained_variance_ / total_var + singular_values_ = sigma_vec.copy() # Store the singular values. 
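+
+        # Everything below is post-processing of the noisy eigendecomposition,
+        # and therefore consumes no additional privacy budget.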
+ + # Post-process the number of components required + if n_components == 'mle': + n_components = sk_pca._infer_dimension(explained_variance_, n_samples) # pylint: disable=protected-access + elif 0 < n_components < 1.0: + # number of components for which the cumulated explained + # variance percentage is superior to the desired threshold + ratio_cumsum = stable_cumsum(explained_variance_ratio_) + n_components = np.searchsorted(ratio_cumsum, n_components) + 1 + + # Compute noise covariance using Probabilistic PCA model + # The sigma2 maximum likelihood (cf. eq. 12.46) + if n_components < min(n_features, n_samples): + self.noise_variance_ = explained_variance_[n_components:].mean() + else: + self.noise_variance_ = 0. + + self.n_samples_ = n_samples + self.components_ = components_[:n_components] + self.n_components_ = n_components + self.explained_variance_ = explained_variance_[:n_components] + self.explained_variance_ratio_ = explained_variance_ratio_[:n_components] + self.singular_values_ = singular_values_[:n_components] + + self.accountant.spend(self.epsilon, 0) + + return u_mtx, sigma_vec[:n_components], u_mtx.T + + @copy_docstring(sk_pca.PCA.fit_transform) + def fit_transform(self, X, y=None): + del y + + self._fit(X) + + return self.transform(X) diff --git a/privbayes-synthesizer/code/diffprivlib/models/standard_scaler.py b/privbayes-synthesizer/code/diffprivlib/models/standard_scaler.py new file mode 100644 index 0000000..a018def --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/standard_scaler.py @@ -0,0 +1,267 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# New BSD License +# +# Copyright (c) 2007–2019 The scikit-learn developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. +# b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# c. Neither the name of the Scikit-learn Developers nor the names of its contributors may be used to endorse or +# promote products derived from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +""" +Standard Scaler with differential privacy +""" +import warnings + +import numpy as np +import sklearn.preprocessing as sk_pp +from sklearn.preprocessing._data import _handle_zeros_in_scale + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.utils import PrivacyLeakWarning, check_random_state +from diffprivlib.tools import nanvar, nanmean +from diffprivlib.validation import DiffprivlibMixin + + +def _incremental_mean_and_var(X, epsilon, bounds, last_mean, last_variance, last_sample_count, random_state=None): + # Initialising new accountant, as budget is tracked in main class. Subject to review in line with GH issue #21 + temp_acc = BudgetAccountant() + + # old = stats until now + # new = the current increment + # updated = the aggregated stats + last_sum = last_mean * last_sample_count + + new_mean = nanmean(X, epsilon=epsilon, axis=0, bounds=bounds, random_state=random_state, accountant=temp_acc) + new_sample_count = np.sum(~np.isnan(X), axis=0) + new_sum = new_mean * new_sample_count + updated_sample_count = last_sample_count + new_sample_count + + updated_mean = (last_sum + new_sum) / updated_sample_count + + if last_variance is None: + updated_variance = None + else: + new_unnormalized_variance = nanvar(X, epsilon=epsilon, axis=0, bounds=bounds, random_state=random_state, + accountant=temp_acc) * new_sample_count + last_unnormalized_variance = last_variance * last_sample_count + + with np.errstate(divide='ignore', invalid='ignore'): + last_over_new_count = last_sample_count / new_sample_count + updated_unnormalized_variance = ( + last_unnormalized_variance + new_unnormalized_variance + + last_over_new_count / updated_sample_count * + (last_sum / last_over_new_count - new_sum) ** 2) + + zeros = last_sample_count == 0 + updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros] + updated_variance = updated_unnormalized_variance / updated_sample_count + + return updated_mean, updated_variance, updated_sample_count + + +# noinspection PyPep8Naming,PyAttributeOutsideInit +class StandardScaler(sk_pp.StandardScaler, DiffprivlibMixin): + """Standardize features by removing the mean and scaling to unit variance, calculated with differential privacy + guarantees. Differential privacy is guaranteed on the learned scaler with respect to the training sample; the + transformed output will certainly not satisfy differential privacy. + + The standard score of a sample `x` is calculated as: + + z = (x - u) / s + + where `u` is the (differentially private) mean of the training samples or zero if `with_mean=False`, and `s` is the + (differentially private) standard deviation of the training samples or one if `with_std=False`. 
+ + Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in + the training set. Mean and standard deviation are then stored to be used on later data using the `transform` + method. + + For further information, users are referred to :class:`sklearn.preprocessing.StandardScaler`. + + Parameters + ---------- + epsilon : float, default: 1.0 + The privacy budget to be allocated to learning the mean and variance of the training sample. If + `with_std=True`, the privacy budget is split evenly between mean and variance (the mean must be calculated even + when `with_mean=False`, as it is used in the calculation of the variance. + + bounds : tuple, optional + Bounds of the data, provided as a tuple of the form (min, max). `min` and `max` can either be scalars, covering + the min/max of the entire data, or vectors with one entry per feature. If not provided, the bounds are computed + on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`. + + copy : boolean, default: True + If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; + e.g. if the data is not a NumPy array, a copy may still be returned. + + with_mean : boolean, True by default + If True, center the data before scaling. + + with_std : boolean, True by default + If True, scale the data to unit variance (or equivalently, unit standard deviation). + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Attributes + ---------- + scale_ : ndarray or None, shape (n_features,) + Per feature relative scaling of the data. This is calculated using `np.sqrt(var_)`. Equal to ``None`` when + ``with_std=False``. + + mean_ : ndarray or None, shape (n_features,) + The mean value for each feature in the training set. Equal to ``None`` when ``with_mean=False``. + + var_ : ndarray or None, shape (n_features,) + The variance for each feature in the training set. Used to compute `scale_`. Equal to ``None`` when + ``with_std=False``. + + n_samples_seen_ : int or array, shape (n_features,) + The number of samples processed by the estimator for each feature. If there are not missing samples, the + ``n_samples_seen`` will be an integer, otherwise it will be an array. + Will be reset on new calls to fit, but increments across ``partial_fit`` calls. + + See also + -------- + :class:`sklearn.preprocessing.StandardScaler` + Vanilla scikit-learn version, without differential privacy. + + :class:`.PCA` + Further removes the linear correlation across features with 'whiten=True'. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in transform. + + """ # noqa + def __init__(self, *, epsilon=1.0, bounds=None, copy=True, with_mean=True, with_std=True, random_state=None, + accountant=None): + super().__init__(copy=copy, with_mean=with_mean, with_std=with_std) + self.epsilon = epsilon + self.bounds = bounds + self.random_state = random_state + self.accountant = BudgetAccountant.load_default(accountant) + + def partial_fit(self, X, y=None, sample_weight=None): + """Online computation of mean and std with differential privacy on X for later scaling. All of X is processed + as a single batch. 
This is intended for cases when `fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and + Randall J. LeVeque. "Algorithms for computing the sample variance: Analysis and recommendations." The American + Statistician 37.3 (1983): 242-247: + + Parameters + ---------- + X : {array-like}, shape [n_samples, n_features] + The data used to compute the mean and standard deviation used for later scaling along the features axis. + + y + Ignored + + sample_weight + Ignored by diffprivlib. Present for consistency with sklearn API. + + """ + self._validate_params() + self.accountant.check(self.epsilon, 0) + + if sample_weight is not None: + self._warn_unused_args("sample_weight") + + random_state = check_random_state(self.random_state) + + epsilon_0 = self.epsilon / 2 if self.with_std else self.epsilon + + X = self._validate_data(X, accept_sparse=False, copy=self.copy, estimator=self, dtype=float, + force_all_finite='allow-nan') + + if self.bounds is None: + warnings.warn("Bounds parameter hasn't been specified, so falling back to determining bounds from the " + "data.\n This will result in additional privacy leakage. To ensure differential privacy " + "with no additional privacy loss, specify `bounds` for each valued returned by np.mean().", + PrivacyLeakWarning) + self.bounds = (np.min(X, axis=0), np.max(X, axis=0)) + + self.bounds = self._check_bounds(self.bounds, X.shape[1]) + X = self._clip_to_bounds(X, self.bounds) + + # Even in the case of `with_mean=False`, we update the mean anyway. This is needed for the incremental + # computation of the var See incr_mean_variance_axis and _incremental_mean_variance_axis + + # if n_samples_seen_ is an integer (i.e. no missing values), we need to transform it to a NumPy array of + # shape (n_features,) required by incr_mean_variance_axis and _incremental_variance_axis + if hasattr(self, 'n_samples_seen_') and isinstance(self.n_samples_seen_, (int, np.integer)): + self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]).astype(np.int64) + + if not hasattr(self, 'n_samples_seen_'): + self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64) + + # First pass + if not hasattr(self, 'scale_'): + self.mean_ = .0 + if self.with_std: + self.var_ = .0 + else: + self.var_ = None + + if not self.with_mean and not self.with_std: + self.mean_ = None + self.var_ = None + self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0) + else: + self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var( + X, epsilon_0, self.bounds, self.mean_, self.var_, self.n_samples_seen_, random_state + ) + + # for backward-compatibility, reduce n_samples_seen_ to an integer + # if the number of samples is the same for each feature (i.e. 
no + # missing values) + if np.ptp(self.n_samples_seen_) == 0: + self.n_samples_seen_ = self.n_samples_seen_[0] + + if self.with_std: + self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_)) + else: + self.scale_ = None + + self.accountant.spend(self.epsilon, 0) + + return self diff --git a/privbayes-synthesizer/code/diffprivlib/models/utils.py b/privbayes-synthesizer/code/diffprivlib/models/utils.py new file mode 100644 index 0000000..6595a51 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/models/utils.py @@ -0,0 +1,124 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2020 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Utilities for use in machine learning models +""" +import warnings +from numbers import Integral + +import numpy as np +from scipy.linalg import null_space + +from diffprivlib.mechanisms import LaplaceBoundedDomain, Bingham +from diffprivlib.utils import PrivacyLeakWarning, check_random_state + + +def covariance_eig(array, epsilon=1.0, norm=None, dims=None, eigvals_only=False, random_state=None): + r""" + Return the eigenvalues and eigenvectors of the covariance matrix of `array`, satisfying differential privacy. + + Paper link: http://papers.nips.cc/paper/9567-differentially-private-covariance-estimation.pdf + + Parameters + ---------- + array : array-like, shape (n_samples, n_features) + Matrix for which the covariance matrix is sought. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + norm : float, optional + The max l2 norm of any row of the input array. This defines the spread of data that will be protected by + differential privacy. + + If not specified, the max norm is taken from the data, but will result in a :class:`.PrivacyLeakWarning`, as it + reveals information about the data. To preserve differential privacy fully, `norm` should be selected + independently of the data, i.e. with domain knowledge. + + dims : int, optional + Number of eigenvectors to return. If `None`, return all eigenvectors. + + eigvals_only : bool, default: False + Only return the eigenvalue estimates. If True, all the privacy budget is spent on estimating the eigenvalues. + + random_state : int or RandomState, optional + Controls the randomness of the model. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + Returns + ------- + w : (n_features) array + The eigenvalues, each repeated according to its multiplicity. 
+ + v : (n_features, dims) array + The normalized (unit "length") eigenvectors, such that the column ``v[:,i]`` is the eigenvector corresponding to + the eigenvalue ``w[i]``. + + """ + + random_state = check_random_state(random_state) + + n_features = array.shape[1] + dims = n_features if dims is None else min(dims, n_features) + if not isinstance(dims, Integral): + raise TypeError(f"Number of requested dimensions must be integer-valued, got {type(dims)}") + if dims < 0: + raise ValueError(f"Number of requested dimensions must be non-negative, got {dims}") + + max_norm = np.linalg.norm(array, axis=1).max() + if norm is None: + warnings.warn("Data norm has not been specified and will be calculated on the data provided. This will result " + "in additional privacy leakage. To ensure differential privacy and no additional privacy " + "leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning) + norm = max_norm + elif max_norm > norm and not np.isclose(max_norm, norm): + raise ValueError(f"Rows of input array must have l2 norm of at most {norm}, got {max_norm}") + + cov = array.T.dot(array) / (norm ** 2) + eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1] + epsilon_0 = epsilon if eigvals_only else epsilon / (dims + (dims != n_features)) + + mech_eigvals = LaplaceBoundedDomain(epsilon=epsilon_0, lower=0, upper=float("inf"), sensitivity=2, + random_state=random_state) + noisy_eigvals = np.array([mech_eigvals.randomise(eigval) for eigval in eigvals]) * (norm ** 2) + + if eigvals_only: + return noisy_eigvals + + # When estimating all eigenvectors, we don't need to spend budget for the dth vector + epsilon_i = epsilon / (dims + (dims != n_features)) + cov_i = cov + proj_i = np.eye(n_features) + + theta = np.zeros((0, n_features)) + mech_cov = Bingham(epsilon=epsilon_i, random_state=random_state) + + for _ in range(dims): + if cov_i.size > 1: + u_i = mech_cov.randomise(cov_i) + else: + u_i = np.ones((1,)) + + theta_i = proj_i.T.dot(u_i) + theta = np.vstack((theta, theta_i)) + + if cov_i.size > 1: + proj_i = null_space(theta).T + cov_i = proj_i.dot(cov).dot(proj_i.T) + + return noisy_eigvals, theta.T diff --git a/privbayes-synthesizer/code/diffprivlib/tools/__init__.py b/privbayes-synthesizer/code/diffprivlib/tools/__init__.py new file mode 100644 index 0000000..414ab68 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/tools/__init__.py @@ -0,0 +1,23 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
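A minimal sketch of calling covariance_eig from diffprivlib.models.utils above (the data and the norm value are illustrative assumptions):

    import numpy as np
    from diffprivlib.models.utils import covariance_eig

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 3))
    # Normalise rows so the l2 norm of every sample is at most 1.
    X /= np.maximum(1.0, np.linalg.norm(X, axis=1, keepdims=True))

    # norm=1 matches the normalisation above, avoiding a PrivacyLeakWarning.
    eigvals, eigvecs = covariance_eig(X, epsilon=1.0, norm=1.0, random_state=42)
    print(eigvals.shape, eigvecs.shape)  # (3,) and (3, 3)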
+""" +Tools for data analysis with differential privacy. +""" +from diffprivlib.tools.histograms import histogram, histogramdd, histogram2d +from diffprivlib.tools.quantiles import quantile, median, percentile +from diffprivlib.tools.utils import count_nonzero, mean, std, sum, var, nanmean, nanstd, nansum, nanvar diff --git a/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/__init__.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..9f2cc19 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/__init__.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/histograms.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/histograms.cpython-311.pyc new file mode 100644 index 0000000..430fd41 Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/histograms.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/quantiles.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/quantiles.cpython-311.pyc new file mode 100644 index 0000000..3d1013f Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/quantiles.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/utils.cpython-311.pyc b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000..423e4aa Binary files /dev/null and b/privbayes-synthesizer/code/diffprivlib/tools/__pycache__/utils.cpython-311.pyc differ diff --git a/privbayes-synthesizer/code/diffprivlib/tools/histograms.py b/privbayes-synthesizer/code/diffprivlib/tools/histograms.py new file mode 100644 index 0000000..faf5d41 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/tools/histograms.py @@ -0,0 +1,365 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# Copyright (c) 2005-2019, NumPy Developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# +# * Neither the name of the NumPy Developers nor the names of any contributors may be used to endorse or promote +# products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +Differentially private histogram-related functions +Builds upon the histogram functionality of Numpy +""" +import warnings +from sys import maxsize + +import numpy as np + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import GeometricTruncated +from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args, check_random_state + + +# noinspection PyShadowingBuiltins +def histogram(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, random_state=None, accountant=None, + **unused_args): + r""" + Compute the differentially private histogram of a set of data. + + The histogram is computed using :obj:`numpy.histogram`, and noise added using :class:`.GeometricTruncated` to + satisfy differential privacy. If the `range` parameter is not specified correctly, a :class:`.PrivacyLeakWarning` + is thrown. Users are referred to :obj:`numpy.histogram` for more usage notes. + + Parameters + ---------- + sample : array_like + Input data. The histogram is computed over the flattened array. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon` to be applied. + + bins : int or sequence of scalars or str, default: 10 + If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` + is a sequence, it defines a monotonically increasing array of bin edges, including the rightmost edge, allowing + for non-uniform bin widths. + + If `bins` is a string, it defines the method used to calculate the optimal bin width, as defined by + `histogram_bin_edges`. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range is simply ``(a.min(), a.max())``. Values outside + the range are ignored. The first element of the range must be less than or equal to the second. `range` affects + the automatic bin computation as well. While bin width is computed to be optimal based on the actual data + within `range`, the bin count will fill the entire range including portions containing no data. + + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight + towards the bin count (instead of 1). If `density` is True, the weights are normalized, so that the integral + of the density over the range remains 1. 
+ + density : bool, optional + If ``False``, the result will contain the number of samples in each bin. If ``True``, the result is the value + of the probability *density* function at the bin, normalized such that the *integral* over the range is 1. + Note that the sum of the histogram values will not be equal to 1 unless bins of unity width are chosen; it is + not a probability *mass* function. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + hist : array + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : array of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, histogram2d + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The last bin, however, + is ``[3, 4]``, which *includes* 4. + + """ + warn_unused_args(unused_args) + + random_state = check_random_state(random_state) + + accountant = BudgetAccountant.load_default(accountant) + accountant.check(epsilon, 0) + + if range is None: + warnings.warn("Range parameter has not been specified. Falling back to taking range from the data.\n" + "To ensure differential privacy, and no additional privacy leakage, the range must be " + "specified independently of the data (i.e., using domain knowledge).", PrivacyLeakWarning) + + hist, bin_edges = np.histogram(sample, bins=bins, range=range, weights=weights, density=None) + + dp_mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize, random_state=random_state) + + dp_hist = np.zeros_like(hist) + + for i in np.arange(dp_hist.shape[0]): + dp_hist[i] = dp_mech.randomise(int(hist[i])) + + # dp_hist = dp_hist.astype(float, casting='safe') + + accountant.spend(epsilon, 0) + + if density: + bin_sizes = np.array(np.diff(bin_edges), float) + return dp_hist / bin_sizes / (dp_hist.sum() if dp_hist.sum() else 1), bin_edges + + return dp_hist, bin_edges + + +# noinspection PyShadowingBuiltins +def histogramdd(sample, epsilon=1.0, bins=10, range=None, weights=None, density=None, random_state=None, + accountant=None, **unused_args): + r""" + Compute the differentially private multidimensional histogram of some data. + + The histogram is computed using :obj:`numpy.histogramdd`, and noise added using :class:`.GeometricTruncated` to + satisfy differential privacy. If the `range` parameter is not specified correctly, a :class:`.PrivacyLeakWarning` + is thrown. Users are referred to :obj:`numpy.histogramdd` for more usage notes. + + Parameters + ---------- + sample : (N, D) array, or (D, N) array_like + The data to be histogrammed. + + Note the unusual interpretation of sample when an array_like: + + * When an array, each row is a coordinate in a D-dimensional space - such as + ``histogramgramdd(np.array([p1, p2, p3]))``. + * When an array_like, each element is the list of values for single coordinate - such as + ``histogramgramdd((X, Y, Z))``. + + The first form should be preferred. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon` to be applied. 
+ + bins : sequence or int, default: 10 + The bin specification: + + * A sequence of arrays describing the monotonically increasing bin edges along each dimension. + * The number of bins for each dimension (nx, ny, ... =bins) + * The number of bins for all dimensions (nx=ny=...=bins). + + range : sequence, optional + A sequence of length D, each an optional (lower, upper) tuple giving the outer bin edges to be used if the edges + are not given explicitly in `bins`. + An entry of None in the sequence results in the minimum and maximum values being used for the corresponding + dimension. + The default, None, is equivalent to passing a tuple of D None values. + + density : bool, optional + If False, the default, returns the number of samples in each bin. If True, returns the probability *density* + function at the bin, ``bin_count / sample_count / bin_volume``. + + weights : (N,) array_like, optional + An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. Weights are normalized to 1 if normed is + True. If normed is False, the values of the returned histogram are equal to the sum of the weights belonging to + the samples falling into each bin. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + H : ndarray + The multidimensional histogram of sample x. See normed and weights for the different possible semantics. + edges : list + A list of D arrays describing the bin edges for each dimension. + + See Also + -------- + histogram: 1-D differentially private histogram + histogram2d: 2-D differentially private histogram + + """ + warn_unused_args(unused_args) + + random_state = check_random_state(random_state) + + accountant = BudgetAccountant.load_default(accountant) + accountant.check(epsilon, 0) + + # Range only required if bin edges not specified + if np.array(bins, dtype=object).ndim == 0 or not np.all([np.ndim(_bin) for _bin in bins]): + if range is None or (isinstance(range, list) and None in range): + warnings.warn("Range parameter has not been specified (or has missing elements). 
Falling back to taking " + "range from the data.\n " + "To ensure differential privacy, and no additional privacy leakage, the range must be " + "specified for each dimension independently of the data (i.e., using domain knowledge).", + PrivacyLeakWarning) + + hist, bin_edges = np.histogramdd(sample, bins=bins, range=range, weights=weights, density=None) + + dp_mech = GeometricTruncated(epsilon=epsilon, sensitivity=1, lower=0, upper=maxsize, random_state=random_state) + + dp_hist = np.zeros_like(hist) + iterator = np.nditer(hist, flags=['multi_index']) + + while not iterator.finished: + dp_hist[iterator.multi_index] = dp_mech.randomise(int(iterator[0])) + iterator.iternext() + + dp_hist = dp_hist.astype(float, casting='safe') + + if density: + # calculate the probability density function + dims = len(dp_hist.shape) + dp_hist_sum = dp_hist.sum() + for i in np.arange(dims): + shape = np.ones(dims, int) + shape[i] = dp_hist.shape[i] + # noinspection PyUnresolvedReferences + dp_hist = dp_hist / np.diff(bin_edges[i]).reshape(shape) + + if dp_hist_sum > 0: + dp_hist /= dp_hist_sum + + accountant.spend(epsilon, 0) + + return dp_hist, bin_edges + + +# noinspection PyShadowingBuiltins +def histogram2d(array_x, array_y, epsilon=1.0, bins=10, range=None, weights=None, density=None, random_state=None, + accountant=None, **unused_args): + r""" + Compute the differentially private bi-dimensional histogram of two data samples. + + Parameters + ---------- + array_x : array_like, shape (N,) + An array containing the x coordinates of the points to be histogrammed. + + array_y : array_like, shape (N,) + An array containing the y coordinates of the points to be histogrammed. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon` to be applied. + + bins : int or array_like or [int, int] or [array, array], default: 10 + The bin specification: + + * If int, the number of bins for the two dimensions (nx=ny=bins). + * If array_like, the bin edges for the two dimensions (x_edges=y_edges=bins). + * If [int, int], the number of bins in each dimension (nx, ny = bins). + * If [array, array], the bin edges in each dimension (x_edges, y_edges = bins). + * A combination [int, array] or [array, int], where int is the number of bins and array is the bin edges. + + range : array_like, shape(2,2), optional + The leftmost and rightmost edges of the bins along each dimension (if not specified explicitly in the `bins` + parameters): ``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range will be considered outliers and + not tallied in the histogram. + + density : bool, optional + If False, the default, returns the number of samples in each bin. If True, returns the probability *density* + function at the bin, ``bin_count / sample_count / bin_area``. + + weights : array_like, shape(N,), optional + An array of values ``w_i`` weighing each sample ``(x_i, y_i)``. Weights are normalized to 1 if `normed` is + True. If `normed` is False, the values of the returned histogram are equal to the sum of the weights belonging + to the samples falling into each bin. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + H : ndarray, shape(nx, ny) + The bi-dimensional histogram of samples `x` and `y`. 
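Similarly, a short sketch of `histogramdd` above, with one data-independent `(lower, upper)` pair per dimension (same assumed `diffprivlib.tools` import path):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
sample = rng.random((1000, 2))  # preferred (N, D) form: rows are coordinates

# One (lower, upper) pair per dimension, chosen independently of the data
dp_hist, edges = tools.histogramdd(sample, epsilon=1.0, bins=(8, 8),
                                   range=[(0, 1), (0, 1)])
print(dp_hist.shape)  # (8, 8); noisy counts, cast to float by the function
```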
Values in `x` are histogrammed along the first dimension + and values in `y` are histogrammed along the second dimension. + + xedges : ndarray, shape(nx+1,) + The bin edges along the first dimension. + + yedges : ndarray, shape(ny+1,) + The bin edges along the second dimension. + + See Also + -------- + histogram : 1D differentially private histogram + histogramdd : Differentially private Multidimensional histogram + + Notes + ----- + When `normed` is True, then the returned histogram is the sample density, defined such that the sum over bins of the + product ``bin_value * bin_area`` is 1. + + Please note that the histogram does not follow the Cartesian convention where `x` values are on the abscissa and `y` + values on the ordinate axis. Rather, `x` is histogrammed along the first dimension of the array (vertical), and `y` + along the second dimension of the array (horizontal). This ensures compatibility with `histogramdd`. + + """ + warn_unused_args(unused_args) + + try: + num_bins = len(bins) + except TypeError: + num_bins = 1 + + if num_bins not in (1, 2): + xedges = yedges = np.asarray(bins) + bins = [xedges, yedges] + + hist, edges = histogramdd([array_x, array_y], epsilon=epsilon, bins=bins, range=range, weights=weights, + density=density, random_state=random_state, accountant=accountant) + return hist, edges[0], edges[1] diff --git a/privbayes-synthesizer/code/diffprivlib/tools/quantiles.py b/privbayes-synthesizer/code/diffprivlib/tools/quantiles.py new file mode 100644 index 0000000..1c95dd9 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/tools/quantiles.py @@ -0,0 +1,273 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2020 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Quantile functions with differential privacy +""" +import warnings + +import numpy as np + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import Exponential +from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning, check_random_state +from diffprivlib.validation import clip_to_bounds, check_bounds +from diffprivlib.tools.utils import _wrap_axis + + +def quantile(array, quant, epsilon=1.0, bounds=None, axis=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the differentially private quantile of the array. + + Returns the specified quantile with differential privacy. The quantile is calculated over the flattened array. 
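And a sketch of the `histogram2d` wrapper just defined, which delegates to `histogramdd` internally (import path again assumed):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
x, y = rng.normal(size=500), rng.normal(size=500)

# Fixed, data-independent range for both axes
dp_hist, xedges, yedges = tools.histogram2d(x, y, epsilon=1.0, bins=20,
                                            range=[[-4, 4], [-4, 4]])
```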
+
+ Differential privacy is achieved with the :class:`.Exponential` mechanism, using the method first proposed by
+ Smith, 2011.
+
+ Paper link: https://dl.acm.org/doi/pdf/10.1145/1993636.1993743
+
+ Parameters
+ ----------
+ array : array_like
+ Array containing numbers whose quantile is sought. If `array` is not an array, a conversion is attempted.
+
+ quant : float or array-like
+ Quantile or array of quantiles. Each quantile must be in the unit interval [0, 1]. If quant is array-like,
+ quantiles are returned over the flattened array.
+
+ epsilon : float, default: 1.0
+ Privacy parameter :math:`\epsilon`. Differential privacy is achieved over the entire output, with epsilon split
+ evenly across the output values.
+
+ bounds : tuple, optional
+ Bounds of the values of the array, of the form (min, max).
+
+ axis : None or int or tuple of ints, optional
+ Axis or axes along which the quantile is computed. The default, axis=None, computes the quantile over the
+ flattened input array. If axis is negative it counts from the last to the first axis.
+
+ If axis is a tuple of ints, the quantile is computed over all of the axes specified in the tuple instead of a
+ single axis or all the axes as before.
+
+ keepdims : bool, default: False
+ If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With
+ this option, the result will broadcast correctly against the input array.
+
+ If the default value is passed, then `keepdims` will not be passed through to the `mean` method of sub-classes
+ of `ndarray`, however any non-default value will be. If the sub-class' method does not implement `keepdims` any
+ exceptions will be raised.
+
+ random_state : int or RandomState, optional
+ Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation,
+ ``random_state`` has to be fixed to an integer.
+
+ accountant : BudgetAccountant, optional
+ Accountant to keep track of privacy budget.
+
+ Returns
+ -------
+ m : ndarray
+ Returns a new array containing the quantile values.
+
+ See Also
+ --------
+ numpy.quantile : Equivalent non-private method.
+
+ percentile, median
+
+ """
+ warn_unused_args(unused_args)
+
+ random_state = check_random_state(random_state)
+
+ if bounds is None:
+ warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will "
+ "result in additional privacy leakage. To ensure differential privacy and no additional "
+ "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning)
+ bounds = (np.min(array), np.max(array))
+
+ quant = np.ravel(quant)
+
+ if np.any(quant < 0) or np.any(quant > 1):
+ raise ValueError("Quantiles must be in the unit interval [0, 1].")
+
+ if len(quant) > 1:
+ return np.array([quantile(array, q_i, epsilon=epsilon / len(quant), bounds=bounds, axis=axis, keepdims=keepdims,
+ accountant=accountant, random_state=random_state) for q_i in quant])
+
+ # Dealing with a single quant from now on
+ quant = quant.item()
+
+ if axis is not None or keepdims:
+ return _wrap_axis(quantile, array, quant=quant, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims,
+ random_state=random_state, accountant=accountant)
+
+ # Dealing with a scalar output from now on
+ bounds = check_bounds(bounds, shape=0, min_separation=1e-5)
+
+ accountant = BudgetAccountant.load_default(accountant)
+ accountant.check(epsilon, 0)
+
+ # Let's ravel array to be single-dimensional
+ array = clip_to_bounds(np.ravel(array), bounds)
+
+ k = array.size
+ array = np.append(array, list(bounds))
+ array.sort()
+
+ interval_sizes = np.diff(array)
+
+ # Todo: Need to find a way to do this in a differentially private way, see GH 80
+ if np.isnan(interval_sizes).any():
+ return np.nan
+
+ mech = Exponential(epsilon=epsilon, sensitivity=1, utility=list(-np.abs(np.arange(0, k + 1) - quant * k)),
+ measure=list(interval_sizes), random_state=random_state)
+ idx = mech.randomise()
+ output = random_state.random() * (array[idx+1] - array[idx]) + array[idx]
+
+ accountant.spend(epsilon, 0)
+
+ return output
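With `quantile` complete, a usage sketch follows; `percentile` and `median`, defined next, are thin wrappers around it. The `diffprivlib.tools` re-export is assumed as before, and all outputs are randomised:

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
salaries = rng.gamma(2.0, 20000.0, size=5000)

# Three quantiles in one call: epsilon is split as 1.0 / 3 per output value
quartiles = tools.quantile(salaries, (0.25, 0.5, 0.75), epsilon=1.0,
                           bounds=(0, 250000))

# percentile and median (defined below) simply delegate to quantile
p90 = tools.percentile(salaries, 90, epsilon=0.5, bounds=(0, 250000))
med = tools.median(salaries, epsilon=0.5, bounds=(0, 250000))
```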
+
+
+def percentile(array, percent, epsilon=1.0, bounds=None, axis=None, keepdims=False, random_state=None, accountant=None,
+ **unused_args):
+ r"""
+ Compute the differentially private percentile of the array.
+
+ This method calls :obj:`.quantile`, where quantile = percentile / 100.
+
+ Parameters
+ ----------
+ array : array_like
+ Array containing numbers whose percentile is sought. If `array` is not an array, a conversion is attempted.
+
+ percent : float or array-like
+ Percentile or list of percentiles sought. Each percentile must be in [0, 100]. If percent is array-like,
+ percentiles are returned over the flattened array.
+
+ epsilon : float, default: 1.0
+ Privacy parameter :math:`\epsilon`. Differential privacy is achieved over the entire output, with epsilon split
+ evenly across the output values.
+
+ bounds : tuple, optional
+ Bounds of the values of the array, of the form (min, max).
+
+ axis : None or int or tuple of ints, optional
+ Axis or axes along which the percentile is computed. The default, axis=None, computes the percentile over the
+ flattened input array. If axis is negative it counts from the last to the first axis.
+
+ If axis is a tuple of ints, the percentile is computed over all of the axes specified in the tuple instead of a
+ single axis or all the axes as before.
+
+ keepdims : bool, default: False
+ If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With
+ this option, the result will broadcast correctly against the input array.
+
+ If the default value is passed, then `keepdims` will not be passed through to the `mean` method of sub-classes
+ of `ndarray`, however any non-default value will be. If the sub-class' method does not implement `keepdims` any
+ exceptions will be raised.
+
+ random_state : int or RandomState, optional
+ Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation,
+ ``random_state`` has to be fixed to an integer.
+
+ accountant : BudgetAccountant, optional
+ Accountant to keep track of privacy budget.
+
+ Returns
+ -------
+ m : ndarray
+ Returns a new array containing the percentile values.
+
+ See Also
+ --------
+ numpy.percentile : Equivalent non-private method.
+
+ quantile, median
+
+ """
+ warn_unused_args(unused_args)
+
+ quant = np.asarray(percent) / 100
+
+ if np.any(quant < 0) or np.any(quant > 1):
+ raise ValueError("Percentiles must be between 0 and 100 inclusive")
+
+ return quantile(array, quant, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims,
+ random_state=random_state, accountant=accountant)
+
+
+def median(array, epsilon=1.0, bounds=None, axis=None, keepdims=False, random_state=None, accountant=None,
+ **unused_args):
+ r"""
+ Compute the differentially private median of the array.
+
+ Returns the median with differential privacy. The median is calculated over each axis, or the flattened array
+ if an axis is not provided. This method calls the :obj:`.quantile` method, for the 0.5 quantile.
+
+ Parameters
+ ----------
+ array : array_like
+ Array containing numbers whose median is sought. If `array` is not an array, a conversion is attempted.
+
+ epsilon : float, default: 1.0
+ Privacy parameter :math:`\epsilon`. Differential privacy is achieved over the entire output, with epsilon split
+ evenly across the output values.
+
+ bounds : tuple, optional
+ Bounds of the values of the array, of the form (min, max).
+
+ axis : None or int or tuple of ints, optional
+ Axis or axes along which the median is computed. The default, axis=None, computes the median over the flattened
+ input array. If axis is negative it counts from the last to the first axis.
+
+ If axis is a tuple of ints, the median is computed over all of the axes specified in the tuple instead of a
+ single axis or all the axes as before.
+
+ keepdims : bool, default: False
+ If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With
+ this option, the result will broadcast correctly against the input array.
+
+ If the default value is passed, then `keepdims` will not be passed through to the `mean` method of sub-classes
+ of `ndarray`, however any non-default value will be. If the sub-class' method does not implement `keepdims` any
+ exceptions will be raised.
+
+ random_state : int or RandomState, optional
+ Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation,
+ ``random_state`` has to be fixed to an integer.
+
+ accountant : BudgetAccountant, optional
+ Accountant to keep track of privacy budget.
+
+ Returns
+ -------
+ m : ndarray
+ Returns a new array containing the median values.
+
+ See Also
+ --------
+ numpy.median : Equivalent non-private method.
+ + quantile, percentile + + """ + warn_unused_args(unused_args) + + return quantile(array, 0.5, epsilon=epsilon, bounds=bounds, axis=axis, keepdims=keepdims, random_state=random_state, + accountant=accountant) diff --git a/privbayes-synthesizer/code/diffprivlib/tools/utils.py b/privbayes-synthesizer/code/diffprivlib/tools/utils.py new file mode 100644 index 0000000..99188e0 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/tools/utils.py @@ -0,0 +1,747 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# +# Copyright (c) 2005-2019, NumPy Developers. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the +# following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following +# disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the +# following disclaimer in the documentation and/or other materials provided with the distribution. +# +# * Neither the name of the NumPy Developers nor the names of any contributors may be used to endorse or promote +# products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, +# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +General utilities and tools for performing differentially private operations on data. 
+""" +import warnings +from numbers import Integral +import numpy as np +from numpy.core import multiarray as mu +from numpy.core import umath as um + +from diffprivlib.accountant import BudgetAccountant +from diffprivlib.mechanisms import LaplaceBoundedDomain, GeometricTruncated, LaplaceTruncated +from diffprivlib.utils import PrivacyLeakWarning, warn_unused_args, check_random_state +from diffprivlib.validation import check_bounds, clip_to_bounds + +_sum_ = sum + + +def _wrap_axis(func, array, *, axis, keepdims, epsilon, bounds, **kwargs): + """Wrapper for functions with axis and keepdims parameters to ensure the function only needs to be evaluated on + scalar outputs. + + """ + dummy = np.zeros_like(array).sum(axis=axis, keepdims=keepdims) + array = np.asarray(array) + ndim = array.ndim + bounds = check_bounds(bounds, np.size(dummy) if np.ndim(dummy) == 1 else 0) + + if isinstance(axis, int): + axis = (axis,) + elif axis is None: + axis = tuple(range(ndim)) + + # Ensure all axes are non-negative + axis = tuple(ndim + ax if ax < 0 else ax for ax in axis) + + if isinstance(dummy, np.ndarray): + iterator = np.nditer(dummy, flags=['multi_index']) + + while not iterator.finished: + idx = list(iterator.multi_index) # Multi index on 'dummy' + _bounds = (bounds[0][idx], bounds[1][idx]) if np.ndim(dummy) == 1 else bounds + + # Construct slicing tuple on 'array' + if len(idx) + len(axis) > ndim: + full_slice = tuple(slice(None) if ax in axis else idx[ax] for ax in range(ndim)) + else: + idx.reverse() + full_slice = tuple(slice(None) if ax in axis else idx.pop() for ax in range(ndim)) + + dummy[iterator.multi_index] = func(array[full_slice], epsilon=epsilon / dummy.size, bounds=_bounds, + **kwargs) + iterator.iternext() + + return dummy + + return func(array, bounds=bounds, epsilon=epsilon, **kwargs) + + +def count_nonzero(array, epsilon=1.0, axis=None, keepdims=False, random_state=None, accountant=None): + r"""Counts the number of non-zero values in the array ``array`` with differential privacy. + + It is typical to use this function on the result of binary operations, such as ``count_nonzero(array >= 0)``. If + you wish to count the number of elements of an array, use ``count_nonzero(np.ones_like(array))``. + + The word "non-zero" is in reference to the Python 2.x built-in method ``__nonzero__()`` (renamed ``__bool__()`` in + Python 3.x) of Python objects that tests an object's "truthfulness". For example, any number is considered truthful + if it is nonzero, whereas any string is considered truthful if it is not the empty string. Thus, this function + (recursively) counts how many elements in ``array`` (and in sub-arrays thereof) have their ``__nonzero__()`` or + ``__bool__()`` method evaluated to ``True``. + + Parameters + ---------- + array : array_like + The array for which to count non-zeros. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + axis : int or tuple, optional + Axis or tuple of axes along which to count non-zeros. Default is None, meaning that non-zeros will be counted + along a flattened version of ``array``. + + keepdims : bool, default: False + If this is set to True, the axes that are counted are left in the result as dimensions with size one. With this + option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. 
+ + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + count : int or array of int + Differentially private number of non-zero values in the array along a given axis. Otherwise, the total number + of non-zero values in the array is returned. + + """ + array = np.asanyarray(array) + + if np.issubdtype(array.dtype, np.character): + array_bool = array != array.dtype.type() + else: + array_bool = array.astype(np.bool_, copy=False) + + return sum(array_bool, axis=axis, dtype=np.intp, bounds=(0, 1), epsilon=epsilon, keepdims=keepdims, + random_state=random_state, accountant=accountant) + + +def mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the differentially private arithmetic mean along the specified axis. + + Returns the average of the array elements with differential privacy. The average is taken over the flattened array + by default, otherwise over the specified axis. Noise is added using :class:`.Laplace` to satisfy differential + privacy, where sensitivity is calculated using `bounds`. Users are advised to consult the documentation of + :obj:`numpy.mean` for further details, as the behaviour of `mean` closely follows its Numpy variant. + + Parameters + ---------- + array : array_like + Array containing numbers whose mean is desired. If `array` is not an array, a conversion is attempted. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). + + axis : int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to compute the mean of the flattened array. + + If this is a tuple of ints, a mean is performed over multiple axes, instead of a single axis or all the axes as + before. + + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default is `float64`; for floating point inputs, it + is the same as the input dtype. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + m : ndarray, see dtype parameter above + Returns a new array containing the mean values. + + See Also + -------- + std, var, nanmean + + """ + warn_unused_args(unused_args) + + return _mean(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=False) + + +def nanmean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the differentially private arithmetic mean along the specified axis, ignoring NaNs. + + Returns the average of the array elements with differential privacy. The average is taken over the flattened array + by default, otherwise over the specified axis. Noise is added using :class:`.Laplace` to satisfy differential + privacy, where sensitivity is calculated using `bounds`. 
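A short sketch of `count_nonzero` above, which privately counts indicator values via the bounded `sum` defined later in this module (import path assumed as before):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
scores = rng.normal(size=1000)

# Typical pattern: privately count how many records satisfy a predicate.
# Internally this is a bounded sum of 0/1 indicators with sensitivity 1.
n_positive = tools.count_nonzero(scores > 0, epsilon=1.0)
```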
Users are advised to consult the documentation of + :obj:`numpy.mean` for further details, as the behaviour of `mean` closely follows its Numpy variant. + + For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + array : array_like + Array containing numbers whose mean is desired. If `array` is not an array, a conversion is attempted. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). + + axis : int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to compute the mean of the flattened array. + + If this is a tuple of ints, a mean is performed over multiple axes, instead of a single axis or all the axes as + before. + + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default is `float64`; for floating point inputs, it + is the same as the input dtype. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + m : ndarray, see dtype parameter above + Returns a new array containing the mean values. + + See Also + -------- + std, var, mean + + """ + warn_unused_args(unused_args) + + return _mean(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=True) + + +def _mean(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, + accountant=None, nan=False): + random_state = check_random_state(random_state) + + if bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + bounds = (np.min(array), np.max(array)) + + if axis is not None or keepdims: + return _wrap_axis(_mean, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=nan) + + lower, upper = check_bounds(bounds, shape=0, dtype=dtype) + + accountant = BudgetAccountant.load_default(accountant) + accountant.check(epsilon, 0) + + array = clip_to_bounds(np.ravel(array), bounds) + + _func = np.nanmean if nan else np.mean + actual_mean = _func(array, axis=axis, dtype=dtype, keepdims=keepdims) + + mech = LaplaceTruncated(epsilon=epsilon, delta=0, sensitivity=(upper - lower) / array.size, lower=lower, + upper=upper, random_state=random_state) + output = mech.randomise(actual_mean) + + accountant.spend(epsilon, 0) + + return output + + +def var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the differentially private variance along the specified axis. + + Returns the variance of the array elements, a measure of the spread of a distribution, with differential privacy. 
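A sketch of `mean` above, also illustrating how `_wrap_axis` divides epsilon across the outputs of an axis reduction (import path assumed as before):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
data = rng.random((100, 4))

# Scalar output: one LaplaceTruncated draw with sensitivity (upper - lower) / n
overall = tools.mean(data, epsilon=1.0, bounds=(0, 1))

# Axis reduction: _wrap_axis evaluates each of the 4 column means with
# epsilon / 4, so the accountant's recorded spends still sum to 1.0
per_column = tools.mean(data, epsilon=1.0, bounds=(0, 1), axis=0)
```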
+
+ The variance is computed for the flattened array by default, otherwise over the specified axis. Noise is added
+ using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is calculated using
+ `bounds`. Users are advised to consult the documentation of :obj:`numpy.var` for further details, as the behaviour
+ of `var` closely follows its Numpy variant.
+
+ Parameters
+ ----------
+ array : array_like
+ Array containing numbers whose variance is desired. If `array` is not an array, a conversion is attempted.
+
+ epsilon : float, default: 1.0
+ Privacy parameter :math:`\epsilon`.
+
+ bounds : tuple, optional
+ Bounds of the values of the array, of the form (min, max).
+
+ axis : int or tuple of ints, optional
+ Axis or axes along which the variance is computed. The default is to compute the variance of the flattened
+ array.
+
+ If this is a tuple of ints, a variance is performed over multiple axes, instead of a single axis or all the axes
+ as before.
+
+ dtype : data-type, optional
+ Type to use in computing the variance. For arrays of integer type the default is `float32`; for arrays of float
+ types it is the same as the array type.
+
+ keepdims : bool, default: False
+ If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With
+ this option, the result will broadcast correctly against the input array.
+
+ random_state : int or RandomState, optional
+ Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation,
+ ``random_state`` has to be fixed to an integer.
+
+ accountant : BudgetAccountant, optional
+ Accountant to keep track of privacy budget.
+
+ Returns
+ -------
+ variance : ndarray, see dtype parameter above
+ Returns a new array containing the variance.
+
+ See Also
+ --------
+ std , mean, nanvar
+
+ """
+ warn_unused_args(unused_args)
+
+ return _var(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims,
+ random_state=random_state, accountant=accountant, nan=False)
+
+
+def nanvar(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None,
+ **unused_args):
+ r"""
+ Compute the differentially private variance along the specified axis, ignoring NaNs.
+
+ Returns the variance of the array elements, a measure of the spread of a distribution, with differential privacy.
+ The variance is computed for the flattened array by default, otherwise over the specified axis. Noise is added
+ using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is calculated using
+ `bounds`. Users are advised to consult the documentation of :obj:`numpy.var` for further details, as the behaviour
+ of `var` closely follows its Numpy variant.
+
+ For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised.
+
+ Parameters
+ ----------
+ array : array_like
+ Array containing numbers whose variance is desired. If `array` is not an array, a conversion is attempted.
+
+ epsilon : float, default: 1.0
+ Privacy parameter :math:`\epsilon`.
+
+ bounds : tuple, optional
+ Bounds of the values of the array, of the form (min, max).
+
+ axis : int or tuple of ints, optional
+ Axis or axes along which the variance is computed. The default is to compute the variance of the flattened
+ array.
+
+ If this is a tuple of ints, a variance is performed over multiple axes, instead of a single axis or all the axes
+ as before.
+ + dtype : data-type, optional + Type to use in computing the variance. For arrays of integer type the default is `float32`; for arrays of float + types it is the same as the array type. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + variance : ndarray, see dtype parameter above + If ``out=None``, returns a new array containing the variance; otherwise, a reference to the output array is + returned. + + See Also + -------- + std , mean, var + + """ + warn_unused_args(unused_args) + + return _var(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=True) + + +def _var(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + nan=False): + random_state = check_random_state(random_state) + + if bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + bounds = (np.min(array), np.max(array)) + + if axis is not None or keepdims: + return _wrap_axis(_var, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=nan) + + lower, upper = check_bounds(bounds, shape=0, dtype=dtype) + + accountant = BudgetAccountant.load_default(accountant) + accountant.check(epsilon, 0) + + # Let's ravel array to be single-dimensional + array = clip_to_bounds(np.ravel(array), bounds) + + _func = np.nanvar if nan else np.var + actual_var = _func(array, axis=axis, dtype=dtype, keepdims=keepdims) + + dp_mech = LaplaceBoundedDomain(epsilon=epsilon, delta=0, + sensitivity=((upper - lower) / array.size) ** 2 * (array.size - 1), lower=0, + upper=((upper - lower) ** 2) / 4, random_state=random_state) + output = dp_mech.randomise(actual_var) + + accountant.spend(epsilon, 0) + + return output + + +def std(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the standard deviation along the specified axis. + + Returns the standard deviation of the array elements, a measure of the spread of a distribution, with differential + privacy. The standard deviation is computed for the flattened array by default, otherwise over the specified axis. + Noise is added using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is + calculated using `bounds`. Users are advised to consult the documentation of :obj:`numpy.std` for further details, + as the behaviour of `std` closely follows its Numpy variant. + + Parameters + ---------- + array : array_like + Calculate the standard deviation of these values. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). 
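A sketch of `var` above; the comment restates the sensitivity used by `_var` (import path assumed as before):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
data = rng.random(10000)

# Noise comes from LaplaceBoundedDomain with sensitivity
# ((upper - lower) / n) ** 2 * (n - 1), which shrinks rapidly as n grows
dp_var = tools.var(data, epsilon=1.0, bounds=(0, 1))
```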
+ + axis : int or tuple of ints, optional + Axis or axes along which the standard deviation is computed. The default is to compute the standard deviation + of the flattened array. + + If this is a tuple of ints, a standard deviation is performed over multiple axes, instead of a single axis or + all the axes as before. + + dtype : dtype, optional + Type to use in computing the standard deviation. For arrays of integer type the default is float64, for arrays + of float types it is the same as the array type. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + standard_deviation : ndarray, see dtype parameter above. + Return a new array containing the standard deviation. + + See Also + -------- + var, mean, nanstd + + """ + warn_unused_args(unused_args) + + return _std(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=False) + + +def nanstd(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r""" + Compute the standard deviation along the specified axis, ignoring NaNs. + + Returns the standard deviation of the array elements, a measure of the spread of a distribution, with differential + privacy. The standard deviation is computed for the flattened array by default, otherwise over the specified axis. + Noise is added using :class:`.LaplaceBoundedDomain` to satisfy differential privacy, where sensitivity is + calculated using `bounds`. Users are advised to consult the documentation of :obj:`numpy.std` for further details, + as the behaviour of `std` closely follows its Numpy variant. + + For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised. + + Parameters + ---------- + array : array_like + Calculate the standard deviation of these values. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). + + axis : int or tuple of ints, optional + Axis or axes along which the standard deviation is computed. The default is to compute the standard deviation + of the flattened array. + + If this is a tuple of ints, a standard deviation is performed over multiple axes, instead of a single axis or + all the axes as before. + + dtype : dtype, optional + Type to use in computing the standard deviation. For arrays of integer type the default is float64, for arrays + of float types it is the same as the array type. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. 
+ + Returns + ------- + standard_deviation : ndarray, see dtype parameter above. + Return a new array containing the standard deviation. + + See Also + -------- + var, mean, std + + """ + warn_unused_args(unused_args) + + return _std(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=True) + + +def _std(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + nan=False): + ret = _var(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=nan) + + if isinstance(ret, mu.ndarray): + ret = um.sqrt(ret) + elif hasattr(ret, 'dtype'): + ret = ret.dtype.type(um.sqrt(ret)) + else: + ret = um.sqrt(ret) + + return ret + + +def sum(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r"""Sum of array elements over a given axis with differential privacy. + + Parameters + ---------- + array : array_like + Elements to sum. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). + + axis : None or int or tuple of ints, optional + Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements of the input + array. If axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, a sum is performed on all of the axes specified in the tuple instead of a single + axis or all the axes as before. + + dtype : dtype, optional + The type of the returned array and of the accumulator in which the elements are summed. The dtype of `array` is + used by default unless `array` has an integer dtype of less precision than the default platform integer. In + that case, if `array` is signed then the platform integer is used while if `array` is unsigned then an unsigned + integer of the same precision as the platform integer is used. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + sum_along_axis : ndarray + An array with the same shape as `array`, with the specified axis removed. If `array` is a 0-d array, or if + `axis` is None, a scalar is returned. + + See Also + -------- + ndarray.sum : Equivalent non-private method. + + mean, nansum + + """ + warn_unused_args(unused_args) + + return _sum(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=False) + + +def nansum(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + **unused_args): + r"""Sum of array elements over a given axis with differential privacy, ignoring NaNs. + + Parameters + ---------- + array : array_like + Elements to sum. + + epsilon : float, default: 1.0 + Privacy parameter :math:`\epsilon`. + + bounds : tuple, optional + Bounds of the values of the array, of the form (min, max). 
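A sketch of `std` above; since `_std` merely post-processes a private variance, the square root consumes no additional budget (import path assumed as before):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
data = rng.random(10000)

# _std takes the square root of a private variance (post-processing),
# so the whole call costs exactly `epsilon` and nothing more
dp_std = tools.std(data, epsilon=1.0, bounds=(0, 1))
```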
+ + axis : None or int or tuple of ints, optional + Axis or axes along which a sum is performed. The default, axis=None, will sum all of the elements of the input + array. If axis is negative it counts from the last to the first axis. + + If axis is a tuple of ints, a sum is performed on all of the axes specified in the tuple instead of a single + axis or all the axes as before. + + dtype : dtype, optional + The type of the returned array and of the accumulator in which the elements are summed. The dtype of `array` is + used by default unless `array` has an integer dtype of less precision than the default platform integer. In + that case, if `array` is signed then the platform integer is used while if `array` is unsigned then an unsigned + integer of the same precision as the platform integer is used. + + keepdims : bool, default: False + If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With + this option, the result will broadcast correctly against the input array. + + random_state : int or RandomState, optional + Controls the randomness of the algorithm. To obtain a deterministic behaviour during randomisation, + ``random_state`` has to be fixed to an integer. + + accountant : BudgetAccountant, optional + Accountant to keep track of privacy budget. + + Returns + ------- + sum_along_axis : ndarray + An array with the same shape as `array`, with the specified axis removed. If `array` is a 0-d array, or if + `axis` is None, a scalar is returned. If an output array is specified, a reference to `out` is returned. + + See Also + -------- + ndarray.sum : Equivalent non-private method. + + mean, sum + + """ + warn_unused_args(unused_args) + + return _sum(array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=True) + + +def _sum(array, epsilon=1.0, bounds=None, axis=None, dtype=None, keepdims=False, random_state=None, accountant=None, + nan=False): + random_state = check_random_state(random_state) + + if bounds is None: + warnings.warn("Bounds have not been specified and will be calculated on the data provided. This will " + "result in additional privacy leakage. 
To ensure differential privacy and no additional " + "privacy leakage, specify bounds for each dimension.", PrivacyLeakWarning) + bounds = (np.min(array), np.max(array)) + + if axis is not None or keepdims: + return _wrap_axis(_sum, array, epsilon=epsilon, bounds=bounds, axis=axis, dtype=dtype, keepdims=keepdims, + random_state=random_state, accountant=accountant, nan=nan) + + lower, upper = check_bounds(bounds, shape=0, dtype=dtype) + + accountant = BudgetAccountant.load_default(accountant) + accountant.check(epsilon, 0) + + # Let's ravel array to be single-dimensional + array = clip_to_bounds(np.ravel(array), bounds) + + _func = np.nansum if nan else np.sum + actual_sum = _func(array, axis=axis, dtype=dtype, keepdims=keepdims) + + mech = GeometricTruncated if dtype is not None and issubclass(dtype, Integral) else LaplaceTruncated + mech = mech(epsilon=epsilon, sensitivity=upper - lower, lower=lower * array.size, upper=upper * array.size, + random_state=random_state) + output = mech.randomise(actual_sum) + + accountant.spend(epsilon, 0) + + return output diff --git a/privbayes-synthesizer/code/diffprivlib/utils.py b/privbayes-synthesizer/code/diffprivlib/utils.py new file mode 100644 index 0000000..467bc89 --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/utils.py @@ -0,0 +1,198 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2019 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Basic functions and other utilities for the differential privacy library +""" +import secrets +import warnings + +import numpy as np +from sklearn.utils import check_random_state as skl_check_random_state + + +def copy_docstring(source): + """Decorator function to copy a docstring from a `source` function to a `target` function. + + The docstring is only copied if a docstring is present in `source`, and if none is present in `target`. Takes + inspiration from similar in `matplotlib`. + + Parameters + ---------- + source : method + Source function from which to copy the docstring. If ``source.__doc__`` is empty, do nothing. + + Returns + ------- + target : method + Target function with new docstring. + + """ + def copy_func(target): + if source.__doc__ and not target.__doc__: + target.__doc__ = source.__doc__ + return target + return copy_func + + +def warn_unused_args(args): + """Warn the user about supplying unused `args` to a diffprivlib model. + + Arguments can be supplied as a string, a list of strings, or a dictionary as supplied to kwargs. 
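A sketch of `sum` above, showing how the `dtype` argument selects the mechanism inside `_sum` (import path assumed as before):

```python
import numpy as np
from diffprivlib import tools  # assumed re-export via tools/__init__.py

rng = np.random.default_rng(0)
counts = rng.integers(0, 10, size=500)

# An integral dtype selects GeometricTruncated, keeping the noisy sum an
# integer; otherwise LaplaceTruncated is used and a float is returned
dp_total = tools.sum(counts, epsilon=1.0, bounds=(0, 10), dtype=int)
```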
+ + Parameters + ---------- + args : str or list or dict + Arguments for which warnings should be thrown. + + Returns + ------- + None + + """ + if isinstance(args, str): + args = [args] + + for arg in args: + warnings.warn(f"Parameter '{arg}' is not functional in diffprivlib. Remove this parameter to suppress this " + "warning.", DiffprivlibCompatibilityWarning) + + +def check_random_state(seed, secure=False): + """Turn seed into a np.random.RandomState or secrets.SystemRandom instance. + + If secure=True, and seed is None (or was generated from a previous None seed), then secrets is used. Otherwise a + np.random.RandomState is used. + + Parameters + ---------- + seed : None, int or instance of RandomState + If seed is None and secure is False, return the RandomState singleton used by np.random. + If seed is None and secure is True, return a SystemRandom instance from secrets. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState or SystemRandom instance, return it. + Otherwise raise ValueError. + + secure : bool, default: False + Specifies if a secure random number generator from secrets can be used. + """ + if secure: + if isinstance(seed, secrets.SystemRandom): + return seed + + if seed is None or seed is np.random.mtrand._rand: # pylint: disable=protected-access + return secrets.SystemRandom() + elif isinstance(seed, secrets.SystemRandom): + raise ValueError("secrets.SystemRandom instance cannot be passed when secure is False.") + + return skl_check_random_state(seed) + + +class Budget(tuple): + """Custom tuple subclass for privacy budgets of the form (epsilon, delta). + + The ``Budget`` class allows for correct comparison/ordering of privacy budget, ensuring that both epsilon and delta + satisfy the comparison (tuples are compared lexicographically). Additionally, tuples are represented with added + verbosity, labelling epsilon and delta appropriately. + + Examples + -------- + + >>> from diffprivlib.utils import Budget + >>> Budget(1, 0.5) + (epsilon=1, delta=0.5) + >>> Budget(2, 0) >= Budget(1, 0.5) + False + >>> (2, 0) >= (1, 0.5) # Tuples are compared with lexicographic ordering + True + + """ + def __new__(cls, epsilon, delta): + if epsilon < 0: + raise ValueError("Epsilon must be non-negative") + + if not 0 <= delta <= 1: + raise ValueError("Delta must be in [0, 1]") + + return tuple.__new__(cls, (epsilon, delta)) + + def __gt__(self, other): + if self.__ge__(other) and not self.__eq__(other): + return True + return False + + def __ge__(self, other): + if self[0] >= other[0] and self[1] >= other[1]: + return True + return False + + def __lt__(self, other): + if self.__le__(other) and not self.__eq__(other): + return True + return False + + def __le__(self, other): + if self[0] <= other[0] and self[1] <= other[1]: + return True + return False + + def __repr__(self): + return f"(epsilon={self[0]}, delta={self[1]})" + + +class BudgetError(ValueError): + """Custom exception to capture the privacy budget being exceeded, typically controlled by a + :class:`.BudgetAccountant`. + + For example, this exception may be raised when the user: + + - Attempts to execute a query which would exceed the privacy budget of the accountant. + - Attempts to change the slack of the accountant in such a way that the existing budget spends would exceed the + accountant's budget. + + """ + + +class PrivacyLeakWarning(RuntimeWarning): + """Custom warning to capture privacy leaks resulting from incorrect parameter setting. 
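A short sketch of `check_random_state` above; the `diffprivlib.utils` import path is confirmed by this diff, and outputs from the seeded generator are reproducible:

```python
from diffprivlib.utils import check_random_state

rs = check_random_state(42)     # reproducible numpy RandomState
sample = rs.random(3)

# With secure=True and no seed, a cryptographic secrets.SystemRandom is
# returned instead; mechanisms that need a CSPRNG request this themselves
csprng = check_random_state(None, secure=True)
```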
+ + For example, this warning may occur when the user: + + - fails to specify the bounds or range of data to a model where required (e.g., `bounds=None` to + :class:`.GaussianNB`). + - inputs data to a model that falls outside the bounds or range originally specified. + + """ + + +class DiffprivlibCompatibilityWarning(RuntimeWarning): + """Custom warning to capture inherited class arguments that are not compatible with diffprivlib. + + The purpose of the warning is to alert the user of the incompatibility, but to continue execution having fixed the + incompatibility at runtime. + + For example, this warning may occur when the user: + + - passes a parameter value that is not compatible with diffprivlib (e.g., `solver='liblinear'` to + :class:`.LogisticRegression`) + - specifies a non-default value for a parameter that is ignored by diffprivlib (e.g., `intercept_scaling=0.5` + to :class:`.LogisticRegression`. + + """ + + +warnings.simplefilter('always', PrivacyLeakWarning) diff --git a/privbayes-synthesizer/code/diffprivlib/validation.py b/privbayes-synthesizer/code/diffprivlib/validation.py new file mode 100644 index 0000000..a12301a --- /dev/null +++ b/privbayes-synthesizer/code/diffprivlib/validation.py @@ -0,0 +1,221 @@ +# MIT License +# +# Copyright (C) IBM Corporation 2020 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Validation functions for the differential privacy library +""" +from numbers import Real, Integral + +import numpy as np + +from diffprivlib.utils import warn_unused_args + + +def check_epsilon_delta(epsilon, delta, allow_zero=False): + """Checks that epsilon and delta are valid values for differential privacy. Throws an error if checks fail, + otherwise returns nothing. + + As well as the requirements of epsilon and delta separately, both cannot be simultaneously zero, unless + ``allow_zero`` is set to ``True``. + + Parameters + ---------- + epsilon : float + Epsilon parameter for differential privacy. Must be non-negative. + + delta : float + Delta parameter for differential privacy. Must be on the unit interval, [0, 1]. + + allow_zero : bool, default: False + Allow epsilon and delta both be zero. 
diff --git a/privbayes-synthesizer/code/diffprivlib/validation.py b/privbayes-synthesizer/code/diffprivlib/validation.py
new file mode 100644
index 0000000..a12301a
--- /dev/null
+++ b/privbayes-synthesizer/code/diffprivlib/validation.py
@@ -0,0 +1,221 @@
+# MIT License
+#
+# Copyright (C) IBM Corporation 2020
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Validation functions for the differential privacy library
+"""
+from numbers import Real, Integral
+
+import numpy as np
+
+from diffprivlib.utils import warn_unused_args
+
+
+def check_epsilon_delta(epsilon, delta, allow_zero=False):
+    """Checks that epsilon and delta are valid values for differential privacy. Throws an error if checks fail,
+    otherwise returns nothing.
+
+    As well as the requirements of epsilon and delta separately, both cannot be simultaneously zero, unless
+    ``allow_zero`` is set to ``True``.
+
+    Parameters
+    ----------
+    epsilon : float
+        Epsilon parameter for differential privacy. Must be non-negative.
+
+    delta : float
+        Delta parameter for differential privacy. Must be on the unit interval, [0, 1].
+
+    allow_zero : bool, default: False
+        Allow epsilon and delta to both be zero.
+
+    """
+    if not isinstance(epsilon, Real) or not isinstance(delta, Real):
+        raise TypeError("Epsilon and delta must be numeric")
+
+    if epsilon < 0:
+        raise ValueError("Epsilon must be non-negative")
+
+    if not 0 <= delta <= 1:
+        raise ValueError("Delta must be in [0, 1]")
+
+    if not allow_zero and epsilon + delta == 0:
+        raise ValueError("Epsilon and delta cannot both be zero")
+
+
+def check_bounds(bounds, shape=0, min_separation=0.0, dtype=float):
+    """Input validation for the ``bounds`` parameter.
+
+    Checks that ``bounds`` is a tuple of the form (lower, upper), where lower <= upper and both are numeric. Also
+    checks that ``bounds`` contains the appropriate number of dimensions, and that there is a ``min_separation``
+    between the bounds.
+
+    Parameters
+    ----------
+    bounds : tuple
+        Tuple of bounds of the form (min, max). `min` and `max` can either be scalars or 1-dimensional arrays.
+
+    shape : int, default: 0
+        Number of dimensions to be expected in ``bounds``.
+
+    min_separation : float, default: 0.0
+        The minimum separation between `lower` and `upper` of each dimension. This separation is enforced if not
+        already satisfied.
+
+    dtype : data-type, default: float
+        Data type of the returned bounds.
+
+    Returns
+    -------
+    bounds : tuple
+
+    """
+    if not isinstance(bounds, tuple):
+        raise TypeError(f"Bounds must be specified as a tuple of (min, max), got {type(bounds)}.")
+    if not isinstance(shape, Integral):
+        raise TypeError(f"shape parameter must be integer-valued, got {type(shape)}.")
+
+    lower, upper = bounds
+
+    if np.asarray(lower).size == 1 or np.asarray(upper).size == 1:
+        lower = np.ravel(lower).astype(dtype)
+        upper = np.ravel(upper).astype(dtype)
+    else:
+        lower = np.asarray(lower, dtype=dtype)
+        upper = np.asarray(upper, dtype=dtype)
+
+    if lower.shape != upper.shape:
+        raise ValueError("lower and upper bounds must be the same shape array")
+    if lower.ndim > 1:
+        raise ValueError("lower and upper bounds must be scalar or a 1-dimensional array")
+    if lower.size not in (1, shape):
+        raise ValueError(f"lower and upper bounds must have {shape or 1} element(s), got {lower.size}.")
+
+    n_bounds = lower.shape[0]
+
+    for i in range(n_bounds):
+        _lower = lower[i]
+        _upper = upper[i]
+
+        if not isinstance(_lower, Real) or not isinstance(_upper, Real):
+            raise TypeError(f"Each bound must be numeric, got {_lower} ({type(_lower)}) and {_upper} "
+                            f"({type(_upper)}).")
+
+        if _lower > _upper:
+            raise ValueError(f"For each bound, the lower bound cannot be greater than the upper bound, got {lower}, "
+                             f"{upper}.")
+
+        # Widen bounds symmetrically about the midpoint if they are too close together
+        if _upper - _lower < min_separation:
+            mid = (_upper + _lower) / 2
+            lower[i] = mid - min_separation / 2
+            upper[i] = mid + min_separation / 2
+
+    if shape == 0:
+        return lower.item(), upper.item()
+
+    if n_bounds == 1:
+        lower = np.ones(shape, dtype=dtype) * lower.item()
+        upper = np.ones(shape, dtype=dtype) * upper.item()
+
+    return lower, upper
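A few illustrative calls showing the expected behaviour of the two validators above (a doctest-style sketch):

>>> from diffprivlib.validation import check_epsilon_delta, check_bounds
>>> check_epsilon_delta(1.0, 0.0)  # valid: returns nothing
>>> check_epsilon_delta(0, 0)  # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
ValueError: Epsilon and delta cannot both be zero
>>> check_bounds((0, 1))  # scalar bounds are returned as floats
(0.0, 1.0)
>>> check_bounds((0, 0), min_separation=1)  # separation enforced about the midpoint
(-0.5, 0.5)
>>> check_bounds((0, 1), shape=3)  # scalar bounds broadcast to 3 dimensions
(array([0., 0., 0.]), array([1., 1., 1.]))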
+
+
+def clip_to_norm(array, clip):
+    """Clips the examples of a 2-dimensional array to a given maximum norm.
+
+    Parameters
+    ----------
+    array : np.ndarray
+        Array to be clipped. After clipping, all examples have a 2-norm of at most `clip`.
+
+    clip : float
+        Norm at which to clip each example.
+
+    Returns
+    -------
+    array : np.ndarray
+        The clipped array.
+
+    """
+    if not isinstance(array, np.ndarray):
+        raise TypeError(f"Input array must be a numpy array, got {type(array)}.")
+    if array.ndim != 2:
+        raise ValueError(f"Input array must be 2-dimensional, got {array.ndim} dimensions.")
+    if not isinstance(clip, Real):
+        raise TypeError(f"Clip value must be numeric, got {type(clip)}.")
+    if clip <= 0:
+        raise ValueError(f"Clip value must be strictly positive, got {clip}.")
+
+    # Rows with 2-norm greater than `clip` are rescaled to norm `clip`; other rows are left untouched
+    norms = np.linalg.norm(array, axis=1) / clip
+    norms[norms < 1] = 1
+
+    return array / norms[:, np.newaxis]
+
+
+def clip_to_bounds(array, bounds):
+    """Clips the examples of a 2-dimensional array to given bounds.
+
+    Parameters
+    ----------
+    array : np.ndarray
+        Array to be clipped. After clipping, all values lie within the given bounds.
+
+    bounds : tuple
+        Tuple of bounds of the form (min, max) which the array is to be clipped to. `min` and `max` must be scalar,
+        unless array is 2-dimensional.
+
+    Returns
+    -------
+    array : np.ndarray
+        The clipped array.
+
+    """
+    if not isinstance(array, np.ndarray):
+        raise TypeError(f"Input array must be a numpy array, got {type(array)}.")
+
+    lower, upper = check_bounds(bounds, np.size(bounds[0]), min_separation=0)
+    clipped_array = array.copy()
+
+    if np.allclose(lower, np.min(lower)) and np.allclose(upper, np.max(upper)):
+        # Scalar-like bounds: clip the whole array in one pass
+        clipped_array = np.clip(clipped_array, np.min(lower), np.max(upper))
+    else:
+        if array.ndim != 2:
+            raise ValueError(f"For non-scalar bounds, input array must be 2-dimensional. Got {array.ndim} "
+                             f"dimensions.")
+
+        # Per-feature bounds: clip each column against its own (lower, upper) pair
+        for feature in range(array.shape[1]):
+            clipped_array[:, feature] = np.clip(array[:, feature], lower[feature], upper[feature])
+
+    return clipped_array
+
+
+class DiffprivlibMixin:  # pylint: disable=too-few-public-methods
+    """Mixin for Diffprivlib models."""
+    _check_bounds = staticmethod(check_bounds)
+    _clip_to_norm = staticmethod(clip_to_norm)
+    _clip_to_bounds = staticmethod(clip_to_bounds)
+    _warn_unused_args = staticmethod(warn_unused_args)
+
+    # todo: remove when scikit-learn v1.2 is a min requirement
+    def _validate_params(self):
+        pass
+
+    @staticmethod
+    def _copy_parameter_constraints(cls, *args):
+        """Copies the parameter constraints for `*args` from `cls`."""
+        if not hasattr(cls, "_parameter_constraints"):
+            return {}
+
+        return {k: cls._parameter_constraints[k] for k in args if k in cls._parameter_constraints}
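Finally, a short doctest-style sketch of the two clipping helpers: `clip_to_norm` rescales only the rows whose 2-norm exceeds `clip`, while `clip_to_bounds` clips values elementwise.

>>> import numpy as np
>>> from diffprivlib.validation import clip_to_norm, clip_to_bounds
>>> X = np.array([[3.0, 4.0], [0.3, 0.4]])
>>> clip_to_norm(X, 1)  # first row has norm 5, so it is rescaled; second row is untouched
array([[0.6, 0.8],
       [0.3, 0.4]])
>>> clip_to_bounds(np.array([[-2.0, 0.5, 5.0]]), (0, 1))
array([[0. , 0.5, 1. ]])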