Skip to content

Commit

Permalink
Merge pull request #297 from madgik/bug/issue324
Browse files Browse the repository at this point in the history
Bug/issue324
  • Loading branch information
ThanKarab authored Jan 5, 2021
2 parents f7fd099 + 70724f0 commit 357ab73
Show file tree
Hide file tree
Showing 13 changed files with 2,669 additions and 1,816 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
sklearn doesn't support Logistic Regression without regularization.
"""
import json

from pathlib import Path
from random import shuffle

Expand All @@ -19,13 +18,9 @@ def get_expected(self, alg_input):
x_names = alg_input[0]["value"]
y_name = alg_input[1]["value"]
variables = x_names + "," + y_name
data = self.get_data(variables)
datasets = alg_input[3]["value"]
data = self.get_data(variables, datasets=datasets)
data = data.dropna()
n_obs = len(data)

# If n_obs < n_cols reject
if n_obs == 0 or data.shape[0] < data.shape[1]:
return None

# Select two categories at random for y
categories = list(set(data[y_name]))
Expand All @@ -35,35 +30,43 @@ def get_expected(self, alg_input):
cat_0, cat_1 = categories[:2]

# Build filter
filter_ = {
"condition": "OR",
"rules": [
{
"id": y_name,
"field": y_name,
"type": "string",
"input": "text",
"operator": "equal",
"value": cat_0,
},
{
"id": y_name,
"field": y_name,
"type": "string",
"input": "text",
"operator": "equal",
"value": cat_1,
},
],
"valid": True,
}
alg_input[4]["value"] = json.dumps(filter_)
# filter_ = {
# "condition": "OR",
# "rules": [
# {
# "id": y_name,
# "field": y_name,
# "type": "string",
# "input": "text",
# "operator": "equal",
# "value": cat_0,
# },
# {
# "id": y_name,
# "field": y_name,
# "type": "string",
# "input": "text",
# "operator": "equal",
# "value": cat_1,
# },
# ],
# "valid": True,
# }
# alg_input[4]["value"] = json.dumps(filter_)
alg_input[4]["value"] = ""
alg_input[5]["value"] = cat_0
alg_input[6]["value"] = cat_1

# Filter data according to above filter
data = data[(data[y_name] == cat_0) | (data[y_name] == cat_1)]
y = data[y_name]
X = data[x_names.split(",")]

# If n_obs < n_cols reject
n_obs = len(data)
if n_obs == 0 or data.shape[0] < data.shape[1]:
return None

# Reject when any class appears at most as many times as there are columns in X
if any([len(y[y == item]) <= X.shape[1] for item in set(y)]):
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function
from __future__ import unicode_literals

import re
from collections import namedtuple

import numpy as np
Expand All @@ -19,23 +20,30 @@
CONFIDENCE,
)
from utils.algorithm_utils import ExaremeError
from utils.algorithm_utils import PrivacyError
from utils.algorithm_utils import PRIVACY_MAGIC_NUMBER


class LogisticRegression(Algorithm):
def __init__(self, cli_args):
super(LogisticRegression, self).__init__(__file__, cli_args)

def local_init(self):
y, X = self.data.variables.iloc[:, 1], self.data.covariables
negl = self.parameters.negative_level
posl = self.parameters.positive_level
X, y = self.data.covariables, self.data.variables
X, y = keep_levels(X, y, positive_level=posl, negative_level=negl)

n_obs = len(y) # todo make these variables automatically available on global
n_obs = len(y)
n_cols = len(X.columns)
y_name = y.name
x_names = list(X.columns)

n_y_pos = len(y[y == 1])
n_y_neg = len(y[y == 0])

self.store(y=y)
self.store(X=X)
self.push_and_add(n_obs=n_obs)
self.push_and_add(n_y_pos=n_y_pos)
self.push_and_add(n_y_neg=n_y_neg)
Expand Down Expand Up @@ -63,7 +71,7 @@ def global_init(self):
self.push(coeff=coeff)

def local_step(self):
y, X = self.data.variables.iloc[:, 1], self.data.covariables
y, X = self.load("y"), self.load("X")
coeff = self.fetch("coeff")

grad, hess, ll = update_local_model_parameters(X, y, coeff)
Expand Down Expand Up @@ -95,10 +103,10 @@ def global_step(self):
self.push(coeff=coeff)

def local_final(self):
y = self.data.variables.iloc[:, 1]
y, X = self.load("y"), self.load("X")

thresholds = np.linspace(1.0, 0.0, num=2 ** 7 + 1) # odd otherwise no half_idx
yhats = np.array([self.predict(threshold=thr) for thr in thresholds])
yhats = np.array([self.predict(x=X, threshold=thr) for thr in thresholds])
fn, fp, tn, tp = compute_classification_results(y, yhats)
half_idx = np.where(thresholds == 0.5)[0][0]

Expand Down Expand Up @@ -232,6 +240,28 @@ def predict(self, x=None, coeff=None, threshold=0.5):
)


def _level_column_index(columns, level):
    """Return the position of the first column whose name encodes `level`.

    A column name encodes a level when it contains ``<prefix>[<level>]``,
    where ``<prefix>`` is one or more characters other than ``[``.

    Raises
    ------
    ValueError
        If no column name matches, with a message naming the missing level.
    """
    # re.escape keeps levels containing regex metacharacters literal.
    pattern = r"[^\[]+\[{lvl}\]".format(lvl=re.escape(level))
    for idx, colname in enumerate(columns):
        if re.search(pattern, colname) is not None:
            return idx
    raise ValueError(
        "No column found for level '{lvl}' of the target variable.".format(
            lvl=level
        )
    )


def keep_levels(X, y, positive_level, negative_level):
    """Keep only the rows that belong to the positive or the negative level.

    Parameters
    ----------
    X : table of covariables, indexable by a boolean row mask
        (presumably a pandas DataFrame row-aligned with `y` -- confirm
        against the caller).
    y : table exposing ``.columns`` and ``.iloc`` with one indicator column
        per level of the target variable, named like ``name[level]``; a row
        belongs to a level when that column equals 1.0.
    positive_level : str
        Level of the target variable treated as the positive outcome.
    negative_level : str
        Level of the target variable treated as the negative outcome.

    Returns
    -------
    tuple
        ``(X, y)`` where both are restricted to the kept rows and `y` is
        reduced to the single indicator column of the positive level.

    Raises
    ------
    PrivacyError
        If `y` is empty, or if fewer than PRIVACY_MAGIC_NUMBER rows remain
        after filtering.
    ValueError
        If either level has no matching column in `y`.
    """
    # Guard clause: nothing to filter, and an empty result is not allowed.
    if len(y) == 0:
        raise PrivacyError("Query results in illegal number of datapoints.")

    posl_idx = _level_column_index(y.columns, positive_level)
    negl_idx = _level_column_index(y.columns, negative_level)

    keep_rows = np.logical_or(
        y.iloc[:, negl_idx] == 1.0, y.iloc[:, posl_idx] == 1.0
    )
    X, y = X[keep_rows], y[keep_rows]
    # After filtering, the positive-level indicator alone encodes the
    # outcome: 1.0 for positive rows, anything else for negative rows.
    y = y.iloc[:, posl_idx]

    if y.shape[0] < PRIVACY_MAGIC_NUMBER:
        raise PrivacyError("Query results in illegal number of datapoints.")
    return X, y


def init_model(n_cols, n_obs):
ll = -2 * n_obs * np.log(2)
coeff = np.zeros(n_cols)
Expand Down Expand Up @@ -429,35 +459,16 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives
"-dataset",
"adni",
"-filter",
"""
{
"condition": "OR",
"rules": [
{
"id": "alzheimerbroadcategory",
"field": "alzheimerbroadcategory",
"type": "string",
"input": "text",
"operator": "equal",
"value": "AD"
},
{
"id": "alzheimerbroadcategory",
"field": "alzheimerbroadcategory",
"type": "string",
"input": "text",
"operator": "equal",
"value": "CN"
}
],
"valid": true
}
""",
"",
"-formula",
"",
"-positive_level",
"AD",
"-negative_level",
"CN",
]
runner = create_runner(
LogisticRegression, num_workers=1, algorithm_args=algorithm_args,
LogisticRegression, num_workers=10, algorithm_args=algorithm_args,
)
start = time.time()
runner.run()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
"label": "x",
"desc": "A list of variables from database. The variable should be Real, Integer. It cannot be empty",
"type": "column",
"columnValuesSQLType": "real, integer, text",
"columnValuesIsCategorical": "",
"columnValuesSQLType": "real, integer",
"columnValuesIsCategorical": "false",
"value": "leftaccumbensarea, leftacgganteriorcingulategyrus, leftainsanteriorinsula, rightaccumbensarea, rightacgganteriorcingulategyrus, rightainsanteriorinsula",
"valueNotBlank": true,
"valueMultiple": true,
Expand Down Expand Up @@ -68,6 +68,28 @@
"valueNotBlank": false,
"valueMultiple": false,
"valueType": "string"
},
{
"name": "positive_level",
"label": "other",
"desc": "Level of the target variable to assign to the positive outcome.",
"type": "other",
"value": "",
"defaultValue": "",
"valueNotBlank": true,
"valueMultiple": false,
"valueType": "string"
},
{
"name": "negative_level",
"label": "other",
"desc": "Level of the target variable to assign to the negative outcome.",
"type": "other",
"value": "",
"defaultValue": "",
"valueNotBlank": true,
"valueMultiple": false,
"valueType": "string"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ class AlgorithmTest(object):
__metaclass__ = abc.ABCMeta
"""
A base class for generating random test-cases for algorithm testing.
The test-cases are generated based on specifications gathered from
the algorithm's properties.json file, uniformly at random whenever
possible. The class must be subclassed for each algorithm and the
The test-cases are generated based on specifications gathered from
the algorithm's properties.json file, uniformly at random whenever
possible. The class must be subclassed for each algorithm and the
`get_expected` method must be implemented by the subclass using some
standard library for computing the expected results.
"""
Expand Down
Loading

0 comments on commit 357ab73

Please sign in to comment.