Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/issue324 #297

Merged
merged 14 commits into from
Jan 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
sklearn doesn't support Logistic Regression without regularization.
"""
import json

from pathlib import Path
from random import shuffle

Expand All @@ -19,13 +18,9 @@ def get_expected(self, alg_input):
x_names = alg_input[0]["value"]
y_name = alg_input[1]["value"]
variables = x_names + "," + y_name
data = self.get_data(variables)
datasets = alg_input[3]["value"]
data = self.get_data(variables, datasets=datasets)
data = data.dropna()
n_obs = len(data)

# If n_obs < n_cols reject
if n_obs == 0 or data.shape[0] < data.shape[1]:
return None

# Select two categories at random for y
categories = list(set(data[y_name]))
Expand All @@ -35,35 +30,43 @@ def get_expected(self, alg_input):
cat_0, cat_1 = categories[:2]

# Build filter
filter_ = {
"condition": "OR",
"rules": [
{
"id": y_name,
"field": y_name,
"type": "string",
"input": "text",
"operator": "equal",
"value": cat_0,
},
{
"id": y_name,
"field": y_name,
"type": "string",
"input": "text",
"operator": "equal",
"value": cat_1,
},
],
"valid": True,
}
alg_input[4]["value"] = json.dumps(filter_)
# filter_ = {
# "condition": "OR",
# "rules": [
# {
# "id": y_name,
# "field": y_name,
# "type": "string",
# "input": "text",
# "operator": "equal",
# "value": cat_0,
# },
# {
# "id": y_name,
# "field": y_name,
# "type": "string",
# "input": "text",
# "operator": "equal",
# "value": cat_1,
# },
# ],
# "valid": True,
# }
# alg_input[4]["value"] = json.dumps(filter_)
alg_input[4]["value"] = ""
alg_input[5]["value"] = cat_0
alg_input[6]["value"] = cat_1

# Filter data according to above filter
data = data[(data[y_name] == cat_0) | (data[y_name] == cat_1)]
y = data[y_name]
X = data[x_names.split(",")]

# If n_obs < n_cols reject
n_obs = len(data)
if n_obs == 0 or data.shape[0] < data.shape[1]:
return None

# Reject when any class appears no more times than the number of columns in X
if any([len(y[y == item]) <= X.shape[1] for item in set(y)]):
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function
from __future__ import unicode_literals

import re
from collections import namedtuple

import numpy as np
Expand All @@ -19,23 +20,30 @@
CONFIDENCE,
)
from utils.algorithm_utils import ExaremeError
from utils.algorithm_utils import PrivacyError
from utils.algorithm_utils import PRIVACY_MAGIC_NUMBER


class LogisticRegression(Algorithm):
def __init__(self, cli_args):
super(LogisticRegression, self).__init__(__file__, cli_args)

def local_init(self):
y, X = self.data.variables.iloc[:, 1], self.data.covariables
negl = self.parameters.negative_level
posl = self.parameters.positive_level
X, y = self.data.covariables, self.data.variables
X, y = keep_levels(X, y, positive_level=posl, negative_level=negl)

n_obs = len(y) # todo make these variables automatically available on global
n_obs = len(y)
n_cols = len(X.columns)
y_name = y.name
x_names = list(X.columns)

n_y_pos = len(y[y == 1])
n_y_neg = len(y[y == 0])

self.store(y=y)
self.store(X=X)
self.push_and_add(n_obs=n_obs)
self.push_and_add(n_y_pos=n_y_pos)
self.push_and_add(n_y_neg=n_y_neg)
Expand Down Expand Up @@ -63,7 +71,7 @@ def global_init(self):
self.push(coeff=coeff)

def local_step(self):
y, X = self.data.variables.iloc[:, 1], self.data.covariables
y, X = self.load("y"), self.load("X")
coeff = self.fetch("coeff")

grad, hess, ll = update_local_model_parameters(X, y, coeff)
Expand Down Expand Up @@ -95,10 +103,10 @@ def global_step(self):
self.push(coeff=coeff)

def local_final(self):
y = self.data.variables.iloc[:, 1]
y, X = self.load("y"), self.load("X")

thresholds = np.linspace(1.0, 0.0, num=2 ** 7 + 1) # odd otherwise no half_idx
yhats = np.array([self.predict(threshold=thr) for thr in thresholds])
yhats = np.array([self.predict(x=X, threshold=thr) for thr in thresholds])
fn, fp, tn, tp = compute_classification_results(y, yhats)
half_idx = np.where(thresholds == 0.5)[0][0]

Expand Down Expand Up @@ -232,6 +240,28 @@ def predict(self, x=None, coeff=None, threshold=0.5):
)


def keep_levels(X, y, positive_level, negative_level):
    """Restrict the data to rows belonging to the two requested target levels.

    The columns of ``y`` are searched for names of the form
    ``<something>[<level>]``; presumably these are indicator (dummy) columns
    of the categorical target, one per level -- TODO confirm against the
    code that builds ``self.data.variables``.

    Parameters
    ----------
    X : DataFrame-like holding the covariates; it is indexed with the same
        boolean row mask as ``y``.
    y : DataFrame-like with one column per target level.
    positive_level : str
        Level whose column becomes the returned target.
    negative_level : str
        Level that, together with ``positive_level``, defines which rows
        are kept.

    Returns
    -------
    tuple
        ``(X, y)`` where both are limited to rows in which either level's
        column equals 1.0, and ``y`` is reduced to the positive level's
        column only.

    Raises
    ------
    PrivacyError
        If ``y`` is empty, or if fewer than ``PRIVACY_MAGIC_NUMBER`` rows
        remain after filtering.
    ValueError
        If no column name matches one of the levels (raised by
        ``list.index``).
    """
    if len(y) == 0:
        raise PrivacyError("Query results in illegal number of datapoints.")

    def first_matching_column(pattern):
        # Position of the first column whose name matches `pattern`.
        matches = [re.search(pattern, name) is not None for name in y.columns]
        return matches.index(True)

    # Levels are escaped so that regex metacharacters in them match literally.
    pos_col = first_matching_column(
        r"[^\[]+\[{pl}\]".format(pl=re.escape(positive_level))
    )
    neg_col = first_matching_column(
        r"[^\[]+\[{nl}\]".format(nl=re.escape(negative_level))
    )

    in_negative = y.iloc[:, neg_col] == 1.0
    in_positive = y.iloc[:, pos_col] == 1.0
    row_mask = np.logical_or(in_negative, in_positive)

    X = X[row_mask]
    y = y[row_mask]
    # Only the positive-level column is kept as the target.
    y = y.iloc[:, pos_col]

    if y.shape[0] < PRIVACY_MAGIC_NUMBER:
        raise PrivacyError("Query results in illegal number of datapoints.")
    return X, y


def init_model(n_cols, n_obs):
ll = -2 * n_obs * np.log(2)
coeff = np.zeros(n_cols)
Expand Down Expand Up @@ -429,35 +459,16 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives
"-dataset",
"adni",
"-filter",
"""
{
"condition": "OR",
"rules": [
{
"id": "alzheimerbroadcategory",
"field": "alzheimerbroadcategory",
"type": "string",
"input": "text",
"operator": "equal",
"value": "AD"
},
{
"id": "alzheimerbroadcategory",
"field": "alzheimerbroadcategory",
"type": "string",
"input": "text",
"operator": "equal",
"value": "CN"
}
],
"valid": true
}
""",
"",
"-formula",
"",
"-positive_level",
"AD",
"-negative_level",
"CN",
]
runner = create_runner(
LogisticRegression, num_workers=1, algorithm_args=algorithm_args,
LogisticRegression, num_workers=10, algorithm_args=algorithm_args,
)
start = time.time()
runner.run()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
"label": "x",
"desc": "A list of variables from database. The variable should be Real, Integer. It cannot be empty",
"type": "column",
"columnValuesSQLType": "real, integer, text",
"columnValuesIsCategorical": "",
"columnValuesSQLType": "real, integer",
"columnValuesIsCategorical": "false",
"value": "leftaccumbensarea, leftacgganteriorcingulategyrus, leftainsanteriorinsula, rightaccumbensarea, rightacgganteriorcingulategyrus, rightainsanteriorinsula",
"valueNotBlank": true,
"valueMultiple": true,
Expand Down Expand Up @@ -68,6 +68,28 @@
"valueNotBlank": false,
"valueMultiple": false,
"valueType": "string"
},
{
"name": "positive_level",
"label": "other",
"desc": "Level of the target variable to assign to the positive outcome.",
"type": "other",
"value": "",
"defaultValue": "",
"valueNotBlank": true,
"valueMultiple": false,
"valueType": "string"
},
{
"name": "negative_level",
"label": "other",
"desc": "Level of the target variable to assign to the negative outcome.",
"type": "other",
"value": "",
"defaultValue": "",
"valueNotBlank": true,
"valueMultiple": false,
"valueType": "string"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ class AlgorithmTest(object):
__metaclass__ = abc.ABCMeta
"""
A base class for generating random test-cases for algorithm testing.
The test-cases are generated based on specifications gathered from
the algorithm's properties.json file, uniformly at random whenever
possible. The class must be subclassed for each algorithm and the
The test-cases are generated based on specifications gathered from
the algorithm's properties.json file, uniformly at random whenever
possible. The class must be subclassed for each algorithm and the
`get_expected` method must be implemented by the subclass using some
standard library for computing the expected results.
"""
Expand Down
Loading