Merge pull request #297 from madgik/bug/issue324

Bug/issue324
madgik · Jan 5, 2021 · 357ab73 · 357ab73
2 parents f7fd099 + 70724f0
commit 357ab73
Show file tree

Hide file tree

Showing 13 changed files with 2,669 additions and 1,816 deletions.
diff --git a/...e-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/generate_testcases_logistic_regression.py b/...e-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/generate_testcases_logistic_regression.py
@@ -3,7 +3,6 @@
 sklearn doesn't support Logistic Regression without regularization.
 """
 import json
-
 from pathlib import Path
 from random import shuffle
 
@@ -19,13 +18,9 @@ def get_expected(self, alg_input):
         x_names = alg_input[0]["value"]
         y_name = alg_input[1]["value"]
         variables = x_names + "," + y_name
-        data = self.get_data(variables)
+        datasets = alg_input[3]["value"]
+        data = self.get_data(variables, datasets=datasets)
         data = data.dropna()
-        n_obs = len(data)
-
-        # If n_obs < n_cols reject
-        if n_obs == 0 or data.shape[0] < data.shape[1]:
-            return None
 
         # Select two categories at random for y
         categories = list(set(data[y_name]))
@@ -35,35 +30,43 @@ def get_expected(self, alg_input):
         cat_0, cat_1 = categories[:2]
 
         # Build filter
-        filter_ = {
-            "condition": "OR",
-            "rules": [
-                {
-                    "id": y_name,
-                    "field": y_name,
-                    "type": "string",
-                    "input": "text",
-                    "operator": "equal",
-                    "value": cat_0,
-                },
-                {
-                    "id": y_name,
-                    "field": y_name,
-                    "type": "string",
-                    "input": "text",
-                    "operator": "equal",
-                    "value": cat_1,
-                },
-            ],
-            "valid": True,
-        }
-        alg_input[4]["value"] = json.dumps(filter_)
+        #  filter_ = {
+        #      "condition": "OR",
+        #      "rules": [
+        #          {
+        #              "id": y_name,
+        #              "field": y_name,
+        #              "type": "string",
+        #              "input": "text",
+        #              "operator": "equal",
+        #              "value": cat_0,
+        #          },
+        #          {
+        #              "id": y_name,
+        #              "field": y_name,
+        #              "type": "string",
+        #              "input": "text",
+        #              "operator": "equal",
+        #              "value": cat_1,
+        #          },
+        #      ],
+        #      "valid": True,
+        #  }
+        #  alg_input[4]["value"] = json.dumps(filter_)
+        alg_input[4]["value"] = ""
+        alg_input[5]["value"] = cat_0
+        alg_input[6]["value"] = cat_1
 
         # Filter data according to above filter
         data = data[(data[y_name] == cat_0) | (data[y_name] == cat_1)]
         y = data[y_name]
         X = data[x_names.split(",")]
 
+        # If n_obs < n_cols reject
+        n_obs = len(data)
+        if n_obs == 0 or data.shape[0] < data.shape[1]:
+            return None
+
         # Reject when any class has no more observations than the number of columns in X
         if any([len(y[y == item]) <= X.shape[1] for item in set(y)]):
             return None

diff --git a/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py b/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py
@@ -2,6 +2,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import re
 from collections import namedtuple
 
 import numpy as np
@@ -19,23 +20,30 @@
     CONFIDENCE,
 )
 from utils.algorithm_utils import ExaremeError
+from utils.algorithm_utils import PrivacyError
+from utils.algorithm_utils import PRIVACY_MAGIC_NUMBER
 
 
 class LogisticRegression(Algorithm):
     def __init__(self, cli_args):
         super(LogisticRegression, self).__init__(__file__, cli_args)
 
     def local_init(self):
-        y, X = self.data.variables.iloc[:, 1], self.data.covariables
+        negl = self.parameters.negative_level
+        posl = self.parameters.positive_level
+        X, y = self.data.covariables, self.data.variables
+        X, y = keep_levels(X, y, positive_level=posl, negative_level=negl)
 
-        n_obs = len(y)  # todo make these variables automatically available on global
+        n_obs = len(y)
         n_cols = len(X.columns)
         y_name = y.name
         x_names = list(X.columns)
 
         n_y_pos = len(y[y == 1])
         n_y_neg = len(y[y == 0])
 
+        self.store(y=y)
+        self.store(X=X)
         self.push_and_add(n_obs=n_obs)
         self.push_and_add(n_y_pos=n_y_pos)
         self.push_and_add(n_y_neg=n_y_neg)
@@ -63,7 +71,7 @@ def global_init(self):
         self.push(coeff=coeff)
 
     def local_step(self):
-        y, X = self.data.variables.iloc[:, 1], self.data.covariables
+        y, X = self.load("y"), self.load("X")
         coeff = self.fetch("coeff")
 
         grad, hess, ll = update_local_model_parameters(X, y, coeff)
@@ -95,10 +103,10 @@ def global_step(self):
         self.push(coeff=coeff)
 
     def local_final(self):
-        y = self.data.variables.iloc[:, 1]
+        y, X = self.load("y"), self.load("X")
 
         thresholds = np.linspace(1.0, 0.0, num=2 ** 7 + 1)  # odd otherwise no half_idx
-        yhats = np.array([self.predict(threshold=thr) for thr in thresholds])
+        yhats = np.array([self.predict(x=X, threshold=thr) for thr in thresholds])
         fn, fp, tn, tp = compute_classification_results(y, yhats)
         half_idx = np.where(thresholds == 0.5)[0][0]
 
@@ -232,6 +240,28 @@ def predict(self, x=None, coeff=None, threshold=0.5):
         )
 
 
+def keep_levels(X, y, positive_level, negative_level):
+    if len(y) > 0:
+        posl_pattern = r"[^\[]+\[{pl}\]".format(pl=re.escape(positive_level))
+        posl_idx = [
+            re.search(posl_pattern, colname) is not None for colname in y.columns
+        ].index(True)
+        negl_pattern = r"[^\[]+\[{nl}\]".format(nl=re.escape(negative_level))
+        negl_idx = [
+            re.search(negl_pattern, colname) is not None for colname in y.columns
+        ].index(True)
+        keep_rows = np.logical_or(
+            y.iloc[:, negl_idx] == 1.0, y.iloc[:, posl_idx] == 1.0
+        )
+        X, y = X[keep_rows], y[keep_rows]
+        y = y.iloc[:, posl_idx]
+        if y.shape[0] < PRIVACY_MAGIC_NUMBER:
+            raise PrivacyError("Query results in illegal number of datapoints.")
+    else:
+        raise PrivacyError("Query results in illegal number of datapoints.")
+    return X, y
+
+
 def init_model(n_cols, n_obs):
     ll = -2 * n_obs * np.log(2)
     coeff = np.zeros(n_cols)
@@ -429,35 +459,16 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives
         "-dataset",
         "adni",
         "-filter",
-        """
-        {
-            "condition": "OR",
-            "rules": [
-                {
-                    "id": "alzheimerbroadcategory",
-                    "field": "alzheimerbroadcategory",
-                    "type": "string",
-                    "input": "text",
-                    "operator": "equal",
-                    "value": "AD"
-                },
-                {
-                    "id": "alzheimerbroadcategory",
-                    "field": "alzheimerbroadcategory",
-                    "type": "string",
-                    "input": "text",
-                    "operator": "equal",
-                    "value": "CN"
-                }
-            ],
-            "valid": true
-        }
-        """,
+        "",
         "-formula",
         "",
+        "-positive_level",
+        "AD",
+        "-negative_level",
+        "CN",
     ]
     runner = create_runner(
-        LogisticRegression, num_workers=1, algorithm_args=algorithm_args,
+        LogisticRegression, num_workers=10, algorithm_args=algorithm_args,
     )
     start = time.time()
     runner.run()

diff --git a/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/properties.json b/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/properties.json
@@ -9,8 +9,8 @@
             "label": "x",
             "desc": "A list of variables from database. The variable should be Real, Integer. It cannot be empty",
             "type": "column",
-            "columnValuesSQLType": "real, integer, text",
-            "columnValuesIsCategorical": "",
+            "columnValuesSQLType": "real, integer",
+            "columnValuesIsCategorical": "false",
             "value": "leftaccumbensarea, leftacgganteriorcingulategyrus, leftainsanteriorinsula, rightaccumbensarea, rightacgganteriorcingulategyrus, rightainsanteriorinsula",
             "valueNotBlank": true,
             "valueMultiple": true,
@@ -68,6 +68,28 @@
             "valueNotBlank": false,
             "valueMultiple": false,
             "valueType": "string"
+        },
+        {
+            "name": "positive_level",
+            "label": "other",
+            "desc": "Level of the target variable to assign to the positive outcome.",
+            "type": "other",
+            "value": "",
+            "defaultValue": "",
+            "valueNotBlank": true,
+            "valueMultiple": false,
+            "valueType": "string"
+        },
+        {
+            "name": "negative_level",
+            "label": "other",
+            "desc": "Level of the target variable to assign to the negative outcome.",
+            "type": "other",
+            "value": "",
+            "defaultValue": "",
+            "valueNotBlank": true,
+            "valueMultiple": false,
+            "valueType": "string"
         }
     ]
 }
diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/algorithmtest.py b/Exareme-Docker/src/mip-algorithms/mipframework/algorithmtest.py
@@ -31,9 +31,9 @@ class AlgorithmTest(object):
     __metaclass__ = abc.ABCMeta
     """
     A base class for generating random test-cases for algorithm testing.
-    The test-cases are generated based on specifications gathered from 
-    the algorithm's properties.json file, uniformly at random whenever 
-    possible. The class must be subclassed for each algorithm and the 
+    The test-cases are generated based on specifications gathered from
+    the algorithm's properties.json file, uniformly at random whenever
+    possible. The class must be subclassed for each algorithm and the
     `get_expected` method must be implemented by the subclass using some
     standard library for computing the expected results.
     """