From 6fc5808a1d9801522aa3a1b99f8f8d245cb94faf Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Sun, 3 Jan 2021 16:18:33 +0200 Subject: [PATCH 01/14] Implement Gaussian and Categorical additive NB classes --- .../NAIVE_BAYES/naive_bayes_nocv.py | 307 ++++++++++++++++++ .../NAIVE_BAYES/properties.json | 80 +++++ 2 files changed, 387 insertions(+) create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py new file mode 100644 index 000000000..ca07e4c1b --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py @@ -0,0 +1,307 @@ +from __future__ import print_function +from __future__ import division +from __future__ import unicode_literals + +from collections import Counter +import warnings + +import numpy as np +from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import BaseDiscreteNB + +from mipframework import Algorithm +from mipframework import AlgorithmResult + + +class CategoricalNaiveBayesTrain(Algorithm): + def __init__(self, cli_args): + super(CategoricalNaiveBayesTrain, self).__init__( + __file__, cli_args, intercept=False + ) + + def local_(self): + data = self.data.full + y, X = data[self.parameters.y], data[self.parameters.x] + y, X = np.array(y), np.array(X) + cnb = AdditiveCategoricalNB() + cnb.fit(X, y) + self.push_and_add(cnb=cnb) + + def global_(self): + cnb = self.fetch("cnb") + + self.result = AlgorithmResult( + raw_data={"category_count": [cc.tolist() for cc in cnb.category_count_]} + ) + + +class AdditiveCategoricalNB(BaseDiscreteNB): + def __init__(self, alpha=1.0): + self.alpha = alpha + self._class_log_prior_ = np.array([]) + self._feature_log_prob_ = [] + + def fit(self, X, y): + self.n_obs_, self.n_features_ = X.shape + self.classes_, 
self.class_count_ = np.unique(y, return_counts=True) + self.n_classes_ = len(self.classes_) + self.categories_, self.category_per_feat_count_ = list( + zip(*[np.unique(col, return_counts=True) for col in X.T]) + ) + self.n_categories_ = np.array([len(c) for c in self.categories_]) + self.category_count_ = [ + np.empty((self.n_classes_, self.n_categories_[f])) + for f in xrange(self.n_features_) + ] + for ci, c in enumerate(self.classes_): + X_where_x = X[np.where(y == c)[0]] + for fi, feature in enumerate(X_where_x.T): + counter = Counter(feature) + self.category_count_[fi][ci, :] = np.array( + [counter[cat] for cat in self.categories_[fi]] + ) + + def __add__(self, other): + def sum_elementwise(x, y): + return [xi + yi for xi, yi in zip(x, y)] + + if self.alpha != other.alpha: + raise ValueError("alphas do not agree") + result = AdditiveCategoricalNB(alpha=self.alpha) + + result.n_obs_ = self.n_obs_ + other.n_obs_ + + if self.n_features_ != other.n_features_: + raise ValueError("n_features_ do not agree") + result.n_features_ = self.n_features_ + + if (self.classes_ != other.classes_).all(): + raise ValueError("classes_ do not agree") + result.classes_ = self.classes_ + + result.class_count_ = self.class_count_ + other.class_count_ + + if self.n_classes_ != other.n_classes_: + raise ValueError("n_classes_ do not agree") + result.n_classes_ = self.n_classes_ + + result.category_per_feat_count_ = sum_elementwise( + self.category_per_feat_count_, other.category_per_feat_count_ + ) + + if not all( + [(c1 == c2).all() for c1, c2 in zip(self.categories_, other.categories_)] + ): + raise ValueError("catefories_ do not agree") + result.categories_ = self.categories_ + + result.n_categories_ = sum_elementwise(self.n_categories_, other.n_categories_) + + result.category_count_ = sum_elementwise( + self.category_count_, other.category_count_ + ) + + return result + + @property + def class_log_prior_(self): + if not self._class_log_prior_.any(): + with 
warnings.catch_warnings(): + # silence the warning when count is 0 because class was not yet + # observed + warnings.simplefilter("ignore", RuntimeWarning) + log_class_count = np.log(self.class_count_) + self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) + return self._class_log_prior_ + + @property + def feature_log_prob_(self): + if not self._feature_log_prob_: + feature_log_prob = [] + for i in range(self.n_features_): + smoothed_cat_count = self.category_count_[i] + self.alpha + smoothed_class_count = smoothed_cat_count.sum(axis=1) + feature_log_prob.append( + np.log(smoothed_cat_count) + - np.log(smoothed_class_count.reshape(-1, 1)) + ) + self._feature_log_prob_ = feature_log_prob + return self._feature_log_prob_ + + def _joint_log_likelihood(self, X): + if not X.shape[1] == self.n_features_: + raise ValueError( + "Expected input with %d features, got %d instead" + % (self.n_features_, X.shape[1]) + ) + jll = np.zeros((X.shape[0], self.class_count_.shape[0])) + for i in range(self.n_features_): + categories = X[:, i] + indices = [np.where(self.categories_[i] == cat)[0][0] for cat in categories] + jll += self.feature_log_prob_[i][:, indices].T + total_ll = jll + self.class_log_prior_ + return total_ll + + def __eq__(self, other): + pass + + +def run_categorical(): + import time + from mipframework import create_runner + + algorithm_args = [ + "-x", + "gender,apoe4,agegroup", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "1", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + runner = create_runner( + CategoricalNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=10, + ) + start = time.time() + runner.run() + end = time.time() + print("Completed in ", end - start) + + +class GaussianNaiveBayesTrain(Algorithm): + def __init__(self, cli_args): + super(GaussianNaiveBayesTrain, self).__init__( + __file__, cli_args, intercept=False + ) + + def local_(self): + data = self.data.full + y, X = 
data[self.parameters.y], data[self.parameters.x] + y, X = np.array(y), np.array(X) + gnb = AdditiveGaussianNB() + gnb.fit(X, y) + self.push_and_add(gnb=gnb) + + def global_(self): + gnb = self.fetch("gnb") + + self.result = AlgorithmResult( + raw_data={"theta": gnb.theta_.tolist(), "sigma": gnb.sigma_.tolist()} + ) + + +class AdditiveGaussianNB(GaussianNB): + def fit(self, X, y): + self.n_obs_, self.n_feats_ = X.shape + super(AdditiveGaussianNB, self).fit(X, y) + + def __add__(self, other): + if self.var_smoothing != other.var_smoothing: + raise ValueError("var_smoothing values do not agree") + if self.priors != other.priors: + raise ValueError("priors do not agree") + if (self.classes_ != other.classes_).all(): + raise ValueError("classes_ do not agree") + + class_count_1 = self.class_count_[:, np.newaxis] + class_count_2 = other.class_count_[:, np.newaxis] + n_obs_total = self.n_obs_ + other.n_obs_ + class_count_total = class_count_1 + class_count_2 + + theta_total = ( + class_count_1 * self.theta_ + class_count_2 * other.theta_ + ) / class_count_total + + self.sigma_[:, :] -= self.epsilon_ + other.sigma_[:, :] -= other.epsilon_ + epsilon_total = max(self.epsilon_, other.epsilon_) + ssd_1 = class_count_1 * self.sigma_ + ssd_2 = class_count_2 * other.sigma_ + total_ssd = ( + ssd_1 + + ssd_2 + + (class_count_1 * class_count_2 / class_count_total) + * (self.theta_ - other.theta_) ** 2 + ) + sigma_total = total_ssd / class_count_total + sigma_total += epsilon_total + + result = AdditiveGaussianNB(self.priors, self.var_smoothing) + result.n_obs_ = n_obs_total + result.classes_ = self.classes_ + result.sigma_ = sigma_total + result.theta_ = theta_total + result.epsilon_ = epsilon_total + result.class_count_ = class_count_total.flatten() + result.class_prior_ = result.class_count_ / n_obs_total + return result + + def __eq__(self, other): + if self.var_smoothing != other.var_smoothing: + return False + if self.priors != other.priors: + return False + if (self.classes_ 
!= other.classes_).all(): + return False + if not np.isclose(self.theta_, other.theta_).all(): + return False + if not np.isclose(self.sigma_, other.sigma_).all(): + return self.sigma_, other.sigma_ + if (self.class_count_ != other.class_count_).all(): + return False + if (self.class_prior_ != other.class_prior_).all(): + return False + if self.n_obs_ != other.n_obs_: + return False + if self.n_feats_ != other.n_feats_: + return False + return True + + +def run_gaussian(): + import time + from mipframework import create_runner + + algorithm_args = [ + "-x", + "lefthippocampus,righthippocampus,leftaccumbensarea," + "leftacgganteriorcingulategyrus,leftainsanteriorinsula,leftamygdala", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "1", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + runner = create_runner( + GaussianNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, + ) + start = time.time() + runner.run() + end = time.time() + print("Completed in ", end - start) + runner = create_runner( + GaussianNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, + ) + start = time.time() + runner.run() + end = time.time() + print("Completed in ", end - start) + + +if __name__ == "__main__": + # run_gaussian() + run_categorical() diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json new file mode 100644 index 000000000..a6cfdb528 --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json @@ -0,0 +1,80 @@ +{ + "name": "NAIVE_BAYES", + "desc": "Naive Bayes classifier for numerical data (Gaussian NB) and nominal data (Categorical NB).", + "label": "Naive Bayes classifier", + "type": "python_multiple_local_global", + "status": "enabled", + "parameters": [ + { + "name": "x", + "label": "x", + "desc": "Independent variables: A list of variables from database.", + "type": "column", + "columnValuesSQLType": "", 
+ "columnValuesIsCategorical": "", + "columnValuesNumOfEnumerations": "", + "value": "righthippocampus,lefthippocampus", + "valueNotBlank": true, + "valueMultiple": true, + "valueType": "string" + }, { + "name": "y", + "label": "y", + "desc": "Dependent variable: A categorical variable form database.", + "type": "column", + "columnValuesSQLType": "", + "columnValuesIsCategorical": "true", + "columnValuesNumOfEnumerations": "", + "value": "alzheimerbroadcategory", + "valueNotBlank": true, + "valueMultiple": false, + "valueType": "string" + }, { + "name": "alpha", + "label": "alpha", + "desc": "Additive smoothing parameter (0 for no smoothing)", + "type": "other", + "value": 0.1, + "valueNotBlank": true, + "valueMultiple": false, + "valueType": "real" + }, { + "name": "k", + "label": "number of batches", + "desc": "The number of batches that will be used in k-fold crossvalidation.", + "type": "other", + "value": 10, + "valueNotBlank": true, + "valueMultiple": false, + "valueType": "int", + "valueMin": 2 + }, { + "name": "pathology", + "label": "pathology", + "desc": "The name of the pathology in which the dataset belongs to.", + "type": "pathology", + "value": "dementia", + "valueNotBlank": true, + "valueMultiple": false, + "valueType": "string" + }, { + "name": "dataset", + "label": "dataset", + "desc": "The names of one or more datasets, in which the algorithm will be executed.", + "type": "dataset", + "value": "desd-synthdata", + "valueNotBlank": true, + "valueMultiple": true, + "valueType": "string" + }, { + "name": "filter", + "label": "filter", + "desc": "", + "type": "filter", + "value": "", + "valueNotBlank": false, + "valueMultiple": true, + "valueType": "string" + } + ] +} From 30ee1a533127acff2c3f171866f237e90962ecb5 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Sun, 3 Jan 2021 22:30:57 +0200 Subject: [PATCH 02/14] Add Mixed NB classifier with joint predict method --- .../NAIVE_BAYES/naive_bayes_nocv.py | 80 ++++++++++++++++++- 
.../mipframework/runner/runner.py | 5 ++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py index ca07e4c1b..93bf3f15e 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py @@ -13,6 +13,69 @@ from mipframework import AlgorithmResult +class MixedAdditiveNB(object): + def __init__(self, alpha=1.0): + self.alpha = alpha + + def fit(self, X_num, X_cat, y): + self.gnb = AdditiveGaussianNB() + self.gnb.fit(X_num, y) + self.cnb = AdditiveCategoricalNB(alpha=self.alpha) + self.cnb.fit(X_cat, y) + + def predict(self, X_num, X_cat): + jll = ( + self.gnb.predict_log_proba(X_num) + + self.cnb.predict_log_proba(X_cat) + - self.gnb.class_log_prior_ + ) + return np.array([self.gnb.classes_[i] for i in jll.argmax(axis=1)]) + + +class MixedNaiveBayesTrain(Algorithm): + def __init__(self, cli_args): + super(MixedNaiveBayesTrain, self).__init__(__file__, cli_args, intercept=False) + + def local_(self): + data = self.data.full + y, X = data[self.parameters.y], data[self.parameters.x] + X_num = np.array(X.iloc[:, :3]) + X_cat = np.array(X.iloc[:, 3:]) + y = np.array(y) + mnb = MixedAdditiveNB() + mnb.fit(X_num, X_cat, y) + pass + + +def run_mixed(): + import time + from mipframework import create_runner + + algorithm_args = [ + "-x", + "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "1", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + runner = create_runner( + MixedNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, + ) + start = time.time() + runner.run() + end = time.time() + print("Completed in ", end - start) + + class CategoricalNaiveBayesTrain(Algorithm): def __init__(self, cli_args): 
super(CategoricalNaiveBayesTrain, self).__init__( @@ -143,7 +206,7 @@ def _joint_log_likelihood(self, X): return total_ll def __eq__(self, other): - pass + raise NotImplementedError def run_categorical(): @@ -200,8 +263,20 @@ def global_(self): class AdditiveGaussianNB(GaussianNB): def fit(self, X, y): self.n_obs_, self.n_feats_ = X.shape + self._class_log_prior_ = np.array([]) super(AdditiveGaussianNB, self).fit(X, y) + @property + def class_log_prior_(self): + if not self._class_log_prior_.any(): + with warnings.catch_warnings(): + # silence the warning when count is 0 because class was not yet + # observed + warnings.simplefilter("ignore", RuntimeWarning) + log_class_count = np.log(self.class_count_) + self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) + return self._class_log_prior_ + def __add__(self, other): if self.var_smoothing != other.var_smoothing: raise ValueError("var_smoothing values do not agree") @@ -304,4 +379,5 @@ def run_gaussian(): if __name__ == "__main__": # run_gaussian() - run_categorical() + # run_categorical() + run_mixed() diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/runner/runner.py b/Exareme-Docker/src/mip-algorithms/mipframework/runner/runner.py index 2477926d5..c29a403b4 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/runner/runner.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/runner/runner.py @@ -20,6 +20,11 @@ "DescriptiveStats": "local-global", "KaplanMeier": "local-global", "ThreeC": "local", + "NaiveBayes": "multiple-local-global", + "NaiveBayesTrain": "local-global", + "GaussianNaiveBayesTrain": "local-global", + "CategoricalNaiveBayesTrain": "local-global", + "MixedNaiveBayesTrain": "local-global", } From 2094b7b50a98e81dfbe9b90057899f60eb27e09b Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Mon, 4 Jan 2021 11:23:14 +0200 Subject: [PATCH 03/14] Add correct fit, predict and __add__ methods to MixedAdditiveNB --- .../NAIVE_BAYES/naive_bayes_nocv.py | 40 
++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py index 93bf3f15e..77a1cf001 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py @@ -16,20 +16,38 @@ class MixedAdditiveNB(object): def __init__(self, alpha=1.0): self.alpha = alpha + self.gnb = None + self.cnb = None - def fit(self, X_num, X_cat, y): - self.gnb = AdditiveGaussianNB() - self.gnb.fit(X_num, y) - self.cnb = AdditiveCategoricalNB(alpha=self.alpha) - self.cnb.fit(X_cat, y) + def fit(self, X_num=None, X_cat=None, y=None): + if X_num is not None: + self.gnb = AdditiveGaussianNB() + self.gnb.fit(X_num, y) + if X_cat is not None: + self.cnb = AdditiveCategoricalNB(alpha=self.alpha) + self.cnb.fit(X_cat, y) def predict(self, X_num, X_cat): - jll = ( - self.gnb.predict_log_proba(X_num) - + self.cnb.predict_log_proba(X_cat) - - self.gnb.class_log_prior_ - ) - return np.array([self.gnb.classes_[i] for i in jll.argmax(axis=1)]) + if X_num is not None and X_cat is not None: + jll = ( + self.gnb.predict_log_proba(X_num) + + self.cnb.predict_log_proba(X_cat) + - self.gnb.class_log_prior_ + ) + return np.array([self.gnb.classes_[i] for i in jll.argmax(axis=1)]) + elif X_num is not None: + return self.gnb.predict(X_num) + elif X_cat is not None: + return self.cnb.predict(X_cat) + + def __add__(self, other): + result = MixedAdditiveNB() + if self.gnb and other.gnb: + result.gnb = self.gnb + other.gnb + if self.cnb and other.cnb: + result.alpha = self.alpha + result.cnb = self.cnb + other.cnb + return result class MixedNaiveBayesTrain(Algorithm): From eaad3b19153cc631896e4ac2506610a7566e8972 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Mon, 4 Jan 2021 11:24:08 +0200 Subject: [PATCH 04/14] Implement NB with prediction (multi-local-global) without CV
--- .../NAIVE_BAYES/naive_bayes-2.py | 322 ++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py new file mode 100644 index 000000000..e6ad6ae3a --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py @@ -0,0 +1,322 @@ +from __future__ import print_function +from __future__ import division +from __future__ import unicode_literals + +from collections import Counter +import warnings + +import numpy as np +import scipy +import sklearn.metrics +from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import BaseDiscreteNB + +from mipframework import Algorithm +from mipframework import AlgorithmResult + + +class NaiveBayes(Algorithm): + def __init__(self, cli_args): + super(NaiveBayes, self).__init__(__file__, cli_args, intercept=False) + + def local_init(self): + data = self.data.full + y, X = data[self.parameters.y], data[self.parameters.x] + categ_names = [k for k, v in self.metadata.is_categorical.items() if v == 1] + categ_names.remove(self.parameters.y[0]) + numer_names = [k for k, v in self.metadata.is_categorical.items() if v == 0] + X_cat = np.array(X[categ_names]) if categ_names else None + X_num = np.array(X[numer_names]) if numer_names else None + y = np.array(y) + # n_splits = int(self.parameters.k) + nb_model = MixedAdditiveNB(float(self.parameters.alpha)) + nb_model.fit(y, X_num, X_cat) + + self.store(X_cat=X_cat) + self.store(X_num=X_num) + self.store(y=y) + self.push_and_add(nb_model=nb_model) + + def global_init(self): + nb_model = self.fetch("nb_model") + + self.push(nb_model=nb_model) + + def local_final(self): + y = self.load("y") + X_num = self.load("X_num") + X_cat = self.load("X_cat") + nb_model = self.fetch("nb_model") + + y_pred = nb_model.predict(X_num, X_cat) + n_hits = sum(y_pred == 
np.array(y).flatten()) + n_miss = len(y) - n_hits + + self.push_and_add(n_hits=n_hits) + self.push_and_add(n_miss=n_miss) + + def global_final(self): + n_hits = self.fetch("n_hits") + n_miss = self.fetch("n_miss") + + self.result = AlgorithmResult( + raw_data={"precision": n_hits / (n_hits + n_miss)} + ) + + +class MixedAdditiveNB(object): + def __init__(self, alpha=1.0): + self.alpha = alpha + self.gnb = None + self.cnb = None + + def fit(self, y, X_num=None, X_cat=None): + if X_num is not None: + self.gnb = AdditiveGaussianNB() + self.gnb.fit(X_num, y) + if X_cat is not None: + self.cnb = AdditiveCategoricalNB(alpha=self.alpha) + self.cnb.fit(X_cat, y) + + def predict(self, X_num, X_cat): + if X_num is not None and X_cat is not None: + jll = ( + self.gnb.predict_log_proba(X_num) + + self.cnb.predict_log_proba(X_cat) + - self.gnb.class_log_prior_ + ) + return np.array([self.gnb.classes_[i] for i in jll.argmax(axis=1)]) + elif X_num is not None: + return self.gnb.predict(X_num) + elif X_cat is not None: + return self.cnb.predict(X_cat) + + def __add__(self, other): + result = MixedAdditiveNB() + if self.gnb and other.gnb: + result.gnb = self.gnb + other.gnb + if self.cnb and other.cnb: + result.alpha = self.alpha + result.cnb = self.cnb + other.cnb + return result + + +class AdditiveCategoricalNB(BaseDiscreteNB): + def __init__(self, alpha=1.0): + self.alpha = alpha + self._class_log_prior_ = np.array([]) + self._feature_log_prob_ = [] + + def fit(self, X, y): + self.n_obs_, self.n_features_ = X.shape + self.classes_, self.class_count_ = np.unique(y, return_counts=True) + self.n_classes_ = len(self.classes_) + self.categories_, self.category_per_feat_count_ = list( + zip(*[np.unique(col, return_counts=True) for col in X.T]) + ) + self.n_categories_ = np.array([len(c) for c in self.categories_]) + self.category_count_ = [ + np.empty((self.n_classes_, self.n_categories_[f])) + for f in xrange(self.n_features_) + ] + for ci, c in enumerate(self.classes_): + X_where_x 
= X[np.where(y == c)[0]] + for fi, feature in enumerate(X_where_x.T): + counter = Counter(feature) + self.category_count_[fi][ci, :] = np.array( + [counter[cat] for cat in self.categories_[fi]] + ) + + def __add__(self, other): + def sum_elementwise(x, y): + return [xi + yi for xi, yi in zip(x, y)] + + if self.alpha != other.alpha: + raise ValueError("alphas do not agree") + result = AdditiveCategoricalNB(alpha=self.alpha) + + result.n_obs_ = self.n_obs_ + other.n_obs_ + + if self.n_features_ != other.n_features_: + raise ValueError("n_features_ do not agree") + result.n_features_ = self.n_features_ + + if (self.classes_ != other.classes_).all(): + raise ValueError("classes_ do not agree") + result.classes_ = self.classes_ + + result.class_count_ = self.class_count_ + other.class_count_ + + if self.n_classes_ != other.n_classes_: + raise ValueError("n_classes_ do not agree") + result.n_classes_ = self.n_classes_ + + result.category_per_feat_count_ = sum_elementwise( + self.category_per_feat_count_, other.category_per_feat_count_ + ) + + if not all( + [(c1 == c2).all() for c1, c2 in zip(self.categories_, other.categories_)] + ): + raise ValueError("catefories_ do not agree") + result.categories_ = self.categories_ + + result.n_categories_ = sum_elementwise(self.n_categories_, other.n_categories_) + + result.category_count_ = sum_elementwise( + self.category_count_, other.category_count_ + ) + + return result + + @property + def class_log_prior_(self): + if not self._class_log_prior_.any(): + with warnings.catch_warnings(): + # silence the warning when count is 0 because class was not yet + # observed + warnings.simplefilter("ignore", RuntimeWarning) + log_class_count = np.log(self.class_count_) + self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) + return self._class_log_prior_ + + @property + def feature_log_prob_(self): + if not self._feature_log_prob_: + feature_log_prob = [] + for i in range(self.n_features_): + smoothed_cat_count = 
self.category_count_[i] + self.alpha + smoothed_class_count = smoothed_cat_count.sum(axis=1) + feature_log_prob.append( + np.log(smoothed_cat_count) + - np.log(smoothed_class_count.reshape(-1, 1)) + ) + self._feature_log_prob_ = feature_log_prob + return self._feature_log_prob_ + + def _joint_log_likelihood(self, X): + if not X.shape[1] == self.n_features_: + raise ValueError( + "Expected input with %d features, got %d instead" + % (self.n_features_, X.shape[1]) + ) + jll = np.zeros((X.shape[0], self.class_count_.shape[0])) + for i in range(self.n_features_): + categories = X[:, i] + indices = [np.where(self.categories_[i] == cat)[0][0] for cat in categories] + jll += self.feature_log_prob_[i][:, indices].T + total_ll = jll + self.class_log_prior_ + return total_ll + + def __eq__(self, other): + raise NotImplementedError + + +class AdditiveGaussianNB(GaussianNB): + def __init__(self, priors=None, var_smoothing=1e-9): + self._class_log_prior_ = np.array([]) + super(AdditiveGaussianNB, self).__init__(priors, var_smoothing) + + def fit(self, X, y): + self.n_obs_, self.n_feats_ = X.shape + super(AdditiveGaussianNB, self).fit(X, y) + + @property + def class_log_prior_(self): + if not self._class_log_prior_.any(): + with warnings.catch_warnings(): + # silence the warning when count is 0 because class was not yet + # observed + warnings.simplefilter("ignore", RuntimeWarning) + log_class_count = np.log(self.class_count_) + self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) + return self._class_log_prior_ + + def __add__(self, other): + if self.var_smoothing != other.var_smoothing: + raise ValueError("var_smoothing values do not agree") + if self.priors != other.priors: + raise ValueError("priors do not agree") + if (self.classes_ != other.classes_).all(): + raise ValueError("classes_ do not agree") + + class_count_1 = self.class_count_[:, np.newaxis] + class_count_2 = other.class_count_[:, np.newaxis] + n_obs_total = self.n_obs_ + other.n_obs_ + 
class_count_total = class_count_1 + class_count_2 + + theta_total = ( + class_count_1 * self.theta_ + class_count_2 * other.theta_ + ) / class_count_total + + self.sigma_[:, :] -= self.epsilon_ + other.sigma_[:, :] -= other.epsilon_ + epsilon_total = max(self.epsilon_, other.epsilon_) + ssd_1 = class_count_1 * self.sigma_ + ssd_2 = class_count_2 * other.sigma_ + total_ssd = ( + ssd_1 + + ssd_2 + + (class_count_1 * class_count_2 / class_count_total) + * (self.theta_ - other.theta_) ** 2 + ) + sigma_total = total_ssd / class_count_total + sigma_total += epsilon_total + + result = AdditiveGaussianNB(self.priors, self.var_smoothing) + result.n_obs_ = n_obs_total + result.classes_ = self.classes_ + result.sigma_ = sigma_total + result.theta_ = theta_total + result.epsilon_ = epsilon_total + result.class_count_ = class_count_total.flatten() + result.class_prior_ = result.class_count_ / n_obs_total + return result + + def __eq__(self, other): + if self.var_smoothing != other.var_smoothing: + return False + if self.priors != other.priors: + return False + if (self.classes_ != other.classes_).all(): + return False + if not np.isclose(self.theta_, other.theta_).all(): + return False + if not np.isclose(self.sigma_, other.sigma_).all(): + return self.sigma_, other.sigma_ + if (self.class_count_ != other.class_count_).all(): + return False + if (self.class_prior_ != other.class_prior_).all(): + return False + if self.n_obs_ != other.n_obs_: + return False + if self.n_feats_ != other.n_feats_: + return False + return True + + +if __name__ == "__main__": + import time + from mipframework import create_runner + + algorithm_args = [ + "-x", + "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "1", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + runner = create_runner(NaiveBayes, algorithm_args=algorithm_args, num_workers=2,) + start = time.time() + runner.run() 
+ end = time.time() + print("Completed in ", end - start) From 194945a5ac533db0ff82557c0ee4cd566ddc5390 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Tue, 5 Jan 2021 22:48:36 +0200 Subject: [PATCH 05/14] Add NB with CV, CM Highchart, route in hc server Also, first commit for a common library of useful functions, located in mipframework/funclib --- .../{naive_bayes-2.py => naive_bayes.py} | 86 ++++++++++++------ .../mipframework/funclib/__init__.py | 0 .../mipframework/funclib/crossvalidation.py | 89 +++++++++++++++++++ .../mipframework/hichart_server/app.py | 30 +++++++ .../mipframework/highcharts/user_defined.py | 40 +++++++++ 5 files changed, 220 insertions(+), 25 deletions(-) rename Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/{naive_bayes-2.py => naive_bayes.py} (80%) create mode 100644 Exareme-Docker/src/mip-algorithms/mipframework/funclib/__init__.py create mode 100644 Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py similarity index 80% rename from Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py rename to Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py index e6ad6ae3a..00e2690d8 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes-2.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py @@ -6,13 +6,14 @@ import warnings import numpy as np -import scipy -import sklearn.metrics +from sklearn import metrics from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BaseDiscreteNB from mipframework import Algorithm from mipframework import AlgorithmResult +from mipframework.funclib.crossvalidation import kfold_split_design_matrices +from mipframework.highcharts.user_defined import MultilabelConfisionMatrix class NaiveBayes(Algorithm): @@ -28,41 +29,58 @@ def local_init(self): X_cat = np.array(X[categ_names]) if
categ_names else None X_num = np.array(X[numer_names]) if numer_names else None y = np.array(y) - # n_splits = int(self.parameters.k) - nb_model = MixedAdditiveNB(float(self.parameters.alpha)) - nb_model.fit(y, X_num, X_cat) + n_splits = int(self.parameters.k) - self.store(X_cat=X_cat) - self.store(X_num=X_num) + train_sets, test_sets = kfold_split_design_matrices(n_splits, y, X_num, X_cat) + models = [MixedAdditiveNB(float(self.parameters.alpha))] * n_splits + [m.fit(yt, Xnt, Xct) for m, (yt, Xnt, Xct) in zip(models, train_sets)] + + self.store(train_sets=train_sets) + self.store(test_sets=test_sets) self.store(y=y) - self.push_and_add(nb_model=nb_model) + self.push_and_agree(n_splits=n_splits) + for k in range(n_splits): + self.push_and_add(**{"model" + str(k): models[k]}) def global_init(self): - nb_model = self.fetch("nb_model") + n_splits = self.fetch("n_splits") + models = [self.fetch("model" + str(k)) for k in range(n_splits)] - self.push(nb_model=nb_model) + self.store(classes=models[0].gnb.classes_) + for k in range(n_splits): + self.push_and_add(**{"model" + str(k): models[k]}) def local_final(self): + n_splits = int(self.parameters.k) y = self.load("y") - X_num = self.load("X_num") - X_cat = self.load("X_cat") - nb_model = self.fetch("nb_model") + n_obs = len(y) + test_sets = self.load("test_sets") + models = [self.fetch("model" + str(k)) for k in range(n_splits)] + + y_preds = [m.predict(Xnt, Xct) for m, (_, Xnt, Xct) in zip(models, test_sets)] + y_pred = np.array(y).flatten() + idx = 0 + for yp in y_preds: + y_pred[idx : idx + len(yp)] = yp + idx += len(yp) - y_pred = nb_model.predict(X_num, X_cat) - n_hits = sum(y_pred == np.array(y).flatten()) - n_miss = len(y) - n_hits + confusion_matrix = metrics.confusion_matrix(y, y_pred) + accuracy = metrics.accuracy_score(y, y_pred) - self.push_and_add(n_hits=n_hits) - self.push_and_add(n_miss=n_miss) + self.push_and_add(confusion_matrix=confusion_matrix) + self.push_and_add(accuracy=Mediant(accuracy * n_obs, 
n_obs)) def global_final(self): - n_hits = self.fetch("n_hits") - n_miss = self.fetch("n_miss") + classes = self.load("classes") + confusion_matrix = self.fetch("confusion_matrix") + # accuracy = self.fetch("accuracy").get_value() - self.result = AlgorithmResult( - raw_data={"precision": n_hits / (n_hits + n_miss)} + cm_chart = MultilabelConfisionMatrix( + "Confusion Matrix", confusion_matrix, classes.tolist() ) + self.result = AlgorithmResult(raw_data={}, highcharts=[cm_chart]) + class MixedAdditiveNB(object): def __init__(self, alpha=1.0): @@ -100,6 +118,9 @@ def __add__(self, other): result.cnb = self.cnb + other.cnb return result + def __repr__(self): + return repr({"gnb": self.gnb.__dict__, "cnb": self.cnb.__dict__}) + class AdditiveCategoricalNB(BaseDiscreteNB): def __init__(self, alpha=1.0): @@ -158,7 +179,7 @@ def sum_elementwise(x, y): if not all( [(c1 == c2).all() for c1, c2 in zip(self.categories_, other.categories_)] ): - raise ValueError("catefories_ do not agree") + raise ValueError("categories_ do not agree") result.categories_ = self.categories_ result.n_categories_ = sum_elementwise(self.n_categories_, other.n_categories_) @@ -295,6 +316,21 @@ def __eq__(self, other): return True +class Mediant(object): + def __init__(self, num, den): + self.num = num + self.den = den + + def __add__(self, other): + return Mediant(self.num + other.num, self.den + other.den) + + def __repr__(self): + return str(self.get_value()) + + def get_value(self): + return float(self.num) / float(self.den) + + if __name__ == "__main__": import time from mipframework import create_runner @@ -307,7 +343,7 @@ def __eq__(self, other): "-alpha", "1", "-k", - "1", + "2", "-pathology", "dementia", "-dataset", @@ -315,7 +351,7 @@ def __eq__(self, other): "-filter", "", ] - runner = create_runner(NaiveBayes, algorithm_args=algorithm_args, num_workers=2,) + runner = create_runner(NaiveBayes, algorithm_args=algorithm_args, num_workers=1,) start = time.time() runner.run() end = 
time.time() diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/__init__.py b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py new file mode 100644 index 000000000..d6ff0adaa --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py @@ -0,0 +1,89 @@ +from collections import namedtuple + +import numpy as np +from sklearn.model_selection import KFold + + +def kfold_split_design_matrices(n_splits, *matrices): + train_sets = [] + test_sets = [] + kf = KFold(n_splits=n_splits, random_state=0) + for train_idx, test_idx in kf.split(matrices[0]): + train_sets.append([m[train_idx] for m in matrices]) + test_sets.append([m[test_idx] for m in matrices]) + return train_sets, test_sets + + +def kfold_split_design_matrix(X, n_splits): + kf = KFold(n_splits=n_splits) + train_sets = [] + test_sets = [] + for train_idx, test_idx in kf.split(X): + train_sets.append(X[train_idx]) + test_sets.append(X[test_idx]) + return train_sets, test_sets + + +def compute_classification_results(y, yhats): + true_positives = np.array( + [sum(1 if yi == yhi == 1 else 0 for yi, yhi in zip(y, yhat)) for yhat in yhats] + ) + true_negatives = np.array( + [sum(1 if yi == yhi == 0 else 0 for yi, yhi in zip(y, yhat)) for yhat in yhats] + ) + false_positives = np.array( + [ + sum(1 if yi == 0 and yhi == 1 else 0 for yi, yhi in zip(y, yhat)) + for yhat in yhats + ] + ) + false_negatives = np.array( + [ + sum(1 if yi == 1 and yhi == 0 else 0 for yi, yhi in zip(y, yhat)) + for yhat in yhats + ] + ) + return false_negatives, false_positives, true_negatives, true_positives + + +ConfusionMatrixSummary = namedtuple( + "ConfusionMatrixSummary", "accuracy precision recall confusion_mat f1" +) + + +def compute_confusion_matrix(tp, tn, fp, 
fn): + confusion_mat = { + "True Positives": tp, + "True Negatives": tn, + "False Positives": fp, + "False Negatives": fn, + } + accuracy = (tp + tn) / (tp + tn + fp + fn) + try: + precision = tp / (tp + fp) + except ZeroDivisionError: + precision = 1 + try: + recall = tp / (tp + fn) + except ZeroDivisionError: + recall = 1 + try: + f1 = 2 * (precision * recall) / (precision + recall) + except ZeroDivisionError: + f1 = 2 + return ConfusionMatrixSummary(accuracy, precision, recall, confusion_mat, f1) + + +def compute_roc(true_positives, true_negatives, false_positives, false_negatives): + fp_rate = [ + fp / (fp + tn) if fp != 0 or tn != 0 else 1 + for fp, tn in zip(false_positives, true_negatives) + ] + tp_rate = [ + tp / (tp + fn) if tp != 0 or fn != 0 else 1 + for tp, fn in zip(true_positives, false_negatives) + ] + roc_curve = list(zip(fp_rate, tp_rate)) + auc = np.trapz(tp_rate, fp_rate) + gini = 2 * auc - 1 + return roc_curve, auc, gini diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py index e96b751c4..409391028 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py @@ -7,6 +7,7 @@ from LOGISTIC_REGRESSION import LogisticRegression from CALIBRATION_BELT import CalibrationBelt from KAPLAN_MEIER import KaplanMeier +from NAIVE_BAYES import NaiveBayes app = Flask(__name__) @@ -34,6 +35,10 @@ "title": "Kaplan-Meier Survival Curves", "url": "kaplan_meier_survival", }, + "naive_bayes_confusion_matrix": { + "title": "NaiveBayes CM", + "url": "naive_bayes_confusion_matrix", + }, } @@ -291,5 +296,30 @@ def kaplan_meier_survival(): return render_template("highchart_layout.html", title="Kaplan Meier", data=result,) +@app.route("/naive_bayes_confusion_matrix") +def naive_bayes_confusion_matrix(): + nb_args = [ + "-x", + 
"lefthippocampus,righthippocampus,leftaccumbensarea,gender,agegroup,apoe4", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "2", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + result = get_algorithm_result(NaiveBayes, nb_args) + result = result["result"][1]["data"] + return render_template( + "highchart_layout.html", title="NaiveBayes Confusion Martix", data=result + ) + + if __name__ == "__main__": app.run(debug=True) diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py index c5019d4d7..20594394b 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py @@ -132,6 +132,46 @@ def __init__(self, title, confusion_matrix): ) +class MultilabelConfisionMatrix(HighchartTemplate): + def __init__(self, title, confusion_matrix, classes): + min_val = 0 + max_val = confusion_matrix.max() + data = [ + { + "name": str(confusion_matrix[i, j]), + "x": i, + "y": j, + # "y": confusion_matrix.shape[1] - j - 1, + "value": confusion_matrix[i, j], + } + for i in range(confusion_matrix.shape[0]) + for j in range(confusion_matrix.shape[1]) + ] + data_labels = DataLabels( + format="{point.name}", + enabled=True, + color="#222222", + borderRadius=3, + backgroundColor="rgba(245, 255, 255, 0.5)", + borderWidth=2, + borderColor="#AAA", + padding=5, + ) + self.chart = ( + Heatmap_(title=Title(text=title)) + .set(xAxis=Axis(categories=classes)) + .set(yAxis=Axis(categories=list(classes), title=None,)) + .set( + colorAxis=ColorAxis( + min=min_val, max=max_val, minColor="#ffffff", maxColor="#0000ff" + ) + ) + .set(series=Series(data=data, borderWidth=1, dataLabels=data_labels)) + .set(legend=Legend(enabled=False)) + .set(tooltip=Tooltip(enabled=False)) + ) + + class ROC(HighchartTemplate): def __init__(self, title, roc_curve, auc, 
gini): self.chart = ( From ab9c645ab5a4c41b3044a3e808a544bcb0114bee Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Wed, 6 Jan 2021 18:37:28 +0200 Subject: [PATCH 06/14] Add a AdditiveMulticlassROCCurve class This class, like all other Additive... classes, takes care of correctly computing the quantity in question, here the ROC curve, when multiple instances are sent from the locals to the global node. --- .../mipframework/funclib/crossvalidation.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py index d6ff0adaa..fdcf7973b 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py @@ -1,3 +1,7 @@ +from __future__ import print_function +from __future__ import division +from __future__ import unicode_literals + from collections import namedtuple import numpy as np @@ -87,3 +91,83 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives auc = np.trapz(tp_rate, fp_rate) gini = 2 * auc - 1 return roc_curve, auc, gini + + +class AdditiveMulticlassROCCurve(object): + def __init__( + self, + y_true=None, + y_pred_proba_per_class=None, + classes=None, + tp=None, + tn=None, + fp=None, + fn=None, + ): + if (tp, tn, fp, fn) == (None, None, None, None): + if len(y_true.shape) > 1: + y_true = y_true.flatten() + self.tp = [] + self.tn = [] + self.fp = [] + self.fn = [] + self.classes = classes + for ci, c in enumerate(classes): + y_pred_proba = y_pred_proba_per_class[:, ci] + thres = np.linspace(1.0, 0.0, num=2 ** 7 + 1) + self.tp.append( + ((y_true == c) & (y_pred_proba >= thres[:, None])).sum(axis=1) + ) + self.tn.append( + ((y_true != c) & (y_pred_proba < thres[:, None])).sum(axis=1) + ) + self.fp.append( + ((y_true != c) & (y_pred_proba >= thres[:, 
None])).sum(axis=1) + ) + self.fn.append( + ((y_true == c) & (y_pred_proba < thres[:, None])).sum(axis=1) + ) + elif tp and tn and fp and fn: + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn + + def __add__(self, other): + result = AdditiveMulticlassROCCurve( + tp=[tp_1 + tp_2 for tp_1, tp_2 in zip(self.tp, other.tp)], + tn=[tn_1 + tn_2 for tn_1, tn_2 in zip(self.tn, other.tn)], + fp=[fp_1 + fp_2 for fp_1, fp_2 in zip(self.fp, other.fp)], + fn=[fn_1 + fn_2 for fn_1, fn_2 in zip(self.fn, other.fn)], + ) + if (self.classes == other.classes).all(): + result.classes = self.classes + else: + raise ValueError("classes do not agree") + return result + + def get_curves(self): + curves = [] + for ci, c in enumerate(self.classes): + tpr = self.tp[ci] / (self.tp[ci] + self.fn[ci]) + tpr[np.isnan(tpr)] = 1.0 + fpr = self.fp[ci] / (self.fp[ci] + self.tn[ci]) + fpr[np.isnan(fpr)] = 1.0 + curves.append((fpr.tolist(), tpr.tolist())) + return curves + + +if __name__ == "__main__": + classes = np.array(["AD", "CN", "Other"]) + y_true = np.array(["AD", "AD", "Other", "CN", "CN", "AD"]) + y_pred_proba_per_class = np.array( + [ + [0.6, 0.2, 0.1], + [0.38, 0.42, 0.2], + [0.4, 0.1, 0.5], + [0.3, 0.3, 0.4], + [0.2, 0.45, 0.35], + [0.5, 0.3, 0.2], + ] + ) + roc = MulticlassROCCurve(y_true, y_pred_proba_per_class, classes) From 7eeb8a450abdd8d9819043c8072e674b68b2bb18 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Wed, 6 Jan 2021 18:39:56 +0200 Subject: [PATCH 07/14] Add ROC curve highchart to NB Also, fix behaviour for input that might have only numerical or only categorical X matrix (very bad fix, must implement better one). 
--- .../mip-algorithms/NAIVE_BAYES/naive_bayes.py | 118 +++++++++++++++--- .../mipframework/highcharts/__init__.py | 4 + .../mipframework/highcharts/user_defined.py | 34 +++++ 3 files changed, 142 insertions(+), 14 deletions(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py index 00e2690d8..402b8f072 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py @@ -13,7 +13,9 @@ from mipframework import Algorithm from mipframework import AlgorithmResult from mipframework.funclib.crossvalidation import kfold_split_design_matrices +from mipframework.funclib.crossvalidation import AdditiveMulticlassROCCurve from mipframework.highcharts.user_defined import MultilabelConfisionMatrix +from mipframework.highcharts.user_defined import MulticlassROCCurve class NaiveBayes(Algorithm): @@ -28,16 +30,35 @@ def local_init(self): numer_names = [k for k, v in self.metadata.is_categorical.items() if v == 0] X_cat = np.array(X[categ_names]) if categ_names else None X_num = np.array(X[numer_names]) if numer_names else None + if X_num is not None and X_cat is not None: + xtypes = "both" + elif X_num is not None: + xtypes = "numerical" + elif X_cat is not None: + xtypes = "categorical" y = np.array(y) n_splits = int(self.parameters.k) - train_sets, test_sets = kfold_split_design_matrices(n_splits, y, X_num, X_cat) + matrices_to_split = [y] + if X_num is not None: + matrices_to_split.append(X_num) + if X_cat is not None: + matrices_to_split.append(X_cat) + train_sets, test_sets = kfold_split_design_matrices( + n_splits, *matrices_to_split + ) models = [MixedAdditiveNB(float(self.parameters.alpha))] * n_splits - [m.fit(yt, Xnt, Xct) for m, (yt, Xnt, Xct) in zip(models, train_sets)] + if xtypes == "numerical": + [m.fit(yt, X_num=Xt) for m, (yt, Xt) in zip(models, train_sets)] + elif xtypes == "categorical": + [m.fit(yt, 
X_cat=Xt) for m, (yt, Xt) in zip(models, train_sets)] + elif xtypes == "both": + [m.fit(yt, Xnt, Xct) for m, (yt, Xnt, Xct) in zip(models, train_sets)] self.store(train_sets=train_sets) self.store(test_sets=test_sets) self.store(y=y) + self.store(xtypes=xtypes) self.push_and_agree(n_splits=n_splits) for k in range(n_splits): self.push_and_add(**{"model" + str(k): models[k]}) @@ -45,8 +66,12 @@ def local_init(self): def global_init(self): n_splits = self.fetch("n_splits") models = [self.fetch("model" + str(k)) for k in range(n_splits)] + if models[0].gnb: + classes = models[0].gnb.classes_ + else: + classes = models[0].cnb.classes_ - self.store(classes=models[0].gnb.classes_) + self.store(classes=classes) for k in range(n_splits): self.push_and_add(**{"model" + str(k): models[k]}) @@ -55,31 +80,74 @@ def local_final(self): y = self.load("y") n_obs = len(y) test_sets = self.load("test_sets") + xtypes = self.load("xtypes") models = [self.fetch("model" + str(k)) for k in range(n_splits)] - - y_preds = [m.predict(Xnt, Xct) for m, (_, Xnt, Xct) in zip(models, test_sets)] + classes = models[0].classes_ + n_classes = len(classes) + + if xtypes == "numerical": + y_preds = [m.predict(X_num=Xt) for m, (_, Xt) in zip(models, test_sets)] + elif xtypes == "categorical": + y_preds = [m.predict(X_cat=Xt) for m, (_, Xt) in zip(models, test_sets)] + else: + y_preds = [ + m.predict(Xnt, Xct) for m, (_, Xnt, Xct) in zip(models, test_sets) + ] y_pred = np.array(y).flatten() idx = 0 for yp in y_preds: y_pred[idx : idx + len(yp)] = yp idx += len(yp) + if xtypes == "numerical": + y_pred_proba_per_class_kfold = [ + m.predict_proba(X_num=Xt) for m, (_, Xt) in zip(models, test_sets) + ] + elif xtypes == "categorical": + y_pred_proba_per_class_kfold = [ + m.predict_proba(X_cat=Xt) for m, (_, Xt) in zip(models, test_sets) + ] + else: + y_pred_proba_per_class_kfold = [ + m.predict_proba(Xnt, Xct) for m, (_, Xnt, Xct) in zip(models, test_sets) + ] + y_pred_proba_per_class = np.empty((n_obs, 
n_classes)) + idx = 0 + for yp in y_pred_proba_per_class_kfold: + y_pred_proba_per_class[idx : idx + len(yp)] = yp + idx += len(yp) + confusion_matrix = metrics.confusion_matrix(y, y_pred) accuracy = metrics.accuracy_score(y, y_pred) + roc_curve = AdditiveMulticlassROCCurve( + y_true=y, y_pred_proba_per_class=y_pred_proba_per_class, classes=classes + ) + self.push_and_add(confusion_matrix=confusion_matrix) self.push_and_add(accuracy=Mediant(accuracy * n_obs, n_obs)) + self.push_and_add(roc_curve=roc_curve) def global_final(self): classes = self.load("classes") confusion_matrix = self.fetch("confusion_matrix") - # accuracy = self.fetch("accuracy").get_value() + accuracy = self.fetch("accuracy").get_value() + roc_curves = self.fetch("roc_curve").get_curves() cm_chart = MultilabelConfisionMatrix( "Confusion Matrix", confusion_matrix, classes.tolist() ) - - self.result = AlgorithmResult(raw_data={}, highcharts=[cm_chart]) + roc_chart = MulticlassROCCurve("ROC", roc_curves, classes) + + self.result = AlgorithmResult( + raw_data={ + "accuracy": accuracy, + "confusion_matrix": confusion_matrix.tolist(), + "roc_curve": roc_curves, + "classes": classes.tolist(), + }, + highcharts=[cm_chart, roc_chart], + ) class MixedAdditiveNB(object): @@ -88,6 +156,15 @@ def __init__(self, alpha=1.0): self.gnb = None self.cnb = None + @property + def classes_(self): + if self.gnb: + return self.gnb.classes_ + elif self.cnb: + return self.cnb.classes_ + else: + raise ValueError("model hasn't been trained yet") + def fit(self, y, X_num=None, X_cat=None): if X_num is not None: self.gnb = AdditiveGaussianNB() @@ -96,7 +173,7 @@ def fit(self, y, X_num=None, X_cat=None): self.cnb = AdditiveCategoricalNB(alpha=self.alpha) self.cnb.fit(X_cat, y) - def predict(self, X_num, X_cat): + def predict(self, X_num=None, X_cat=None): if X_num is not None and X_cat is not None: jll = ( self.gnb.predict_log_proba(X_num) @@ -109,6 +186,17 @@ def predict(self, X_num, X_cat): elif X_cat is not None: return 
self.cnb.predict(X_cat) + def predict_proba(self, X_num=None, X_cat=None): + if X_num is not None and X_cat is not None: + probs_num = self.gnb.predict_proba(X_num) + probs_cat = self.cnb.predict_proba(X_cat) + normalizations = (probs_num * probs_cat).sum(axis=1)[:, np.newaxis] + return probs_num * probs_cat / normalizations + elif X_num is not None: + return self.gnb.predict_proba(X_num) + elif X_cat is not None: + return self.cnb.predict_proba(X_cat) + def __add__(self, other): result = MixedAdditiveNB() if self.gnb and other.gnb: @@ -118,8 +206,8 @@ def __add__(self, other): result.cnb = self.cnb + other.cnb return result - def __repr__(self): - return repr({"gnb": self.gnb.__dict__, "cnb": self.cnb.__dict__}) + # def __repr__(self): + # return repr({"gnb": self.gnb.__dict__, "cnb": self.cnb.__dict__}) class AdditiveCategoricalNB(BaseDiscreteNB): @@ -337,7 +425,9 @@ def get_value(self): algorithm_args = [ "-x", - "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", + "lefthippocampus,righthippocampus,leftaccumbensarea", + # "gender,apoe4,agegroup", + # "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", "-y", "alzheimerbroadcategory", "-alpha", @@ -351,8 +441,8 @@ def get_value(self): "-filter", "", ] - runner = create_runner(NaiveBayes, algorithm_args=algorithm_args, num_workers=1,) + runner = create_runner(NaiveBayes, algorithm_args=algorithm_args, num_workers=3,) start = time.time() runner.run() end = time.time() - print("Completed in ", end - start) + # print("Completed in ", end - start) diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/__init__.py b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/__init__.py index 7a49ef348..af010916f 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/__init__.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/__init__.py @@ -5,6 +5,8 @@ ScreePlot, CalibrationBeltPlot, SurvivalCurves, + 
MultilabelConfisionMatrix, + MulticlassROCCurve, ) __all__ = [ @@ -14,4 +16,6 @@ "ScreePlot", "CalibrationBeltPlot", "SurvivalCurves", + "MultilabelConfisionMatrix", + "MulticlassROCCurve", ] diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py index 20594394b..97d4dc766 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/highcharts/user_defined.py @@ -194,6 +194,40 @@ def __init__(self, title, roc_curve, auc, gini): ) +class MulticlassROCCurve(HighchartTemplate): + def __init__(self, title, roc_curves, classes): + self.chart = ( + Line_(title=Title(text=title)) + .set( + xAxis=Axis(min=-0.05, max=1.05, title=Title(text="False Positive Rate")) + ) + .set( + yAxis=Axis(min=-0.05, max=1.05, title=Title(text="True Positive Rate")) + ) + .set(legend=Legend(enabled=True)) + ) + series = RenderableList( + [ + Series(data=map(list, zip(*curve)), name=class_) + for class_, curve in zip(classes, roc_curves) + ] + ) + series.append( + Series( + name="Bisector", + data=[[0, 0], [1, 1]], + zIndex=2, + color="#fc7938", + lineWidth=1.5, + dashStyle="Dash", + allowPointSelect=False, + marker={"enabled": False}, + label={"enabled": False}, + ) + ) + self.chart.set(series=series) + + class ScreePlot(HighchartTemplate): def __init__(self, title, data, xtitle): self.chart = ( From 9e8f33594ee4577d5d8470012f9135502d64413a Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Wed, 6 Jan 2021 18:46:45 +0200 Subject: [PATCH 08/14] Add NB ROC graph to hichart server --- .../mipframework/hichart_server/app.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py index 409391028..ed4f075f8 100644 --- 
a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py @@ -29,7 +29,7 @@ "title": "Logistic Regression Confusion Matrix", "url": "logistic_confmat", }, - "logistic_roc": {"title": "Logistic Regression ROC", "url": "logistic_roc",}, + "logistic_roc": {"title": "Logistic Regression ROC", "url": "logistic_roc"}, "calibration_belt": {"title": "Calibration Belt", "url": "calibration_belt"}, "kaplan_meier_survival": { "title": "Kaplan-Meier Survival Curves", @@ -39,6 +39,7 @@ "title": "NaiveBayes CM", "url": "naive_bayes_confusion_matrix", }, + "naive_bayes_roc": {"title": "NaiveBayes ROC", "url": "naive_bayes_roc",}, } @@ -321,5 +322,30 @@ def naive_bayes_confusion_matrix(): ) +@app.route("/naive_bayes_roc") +def naive_bayes_roc(): + nb_args = [ + "-x", + # "lefthippocampus,righthippocampus,leftaccumbensarea", + # "gender,apoe4,agegroup", + "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", + "-y", + "alzheimerbroadcategory", + "-alpha", + "1", + "-k", + "2", + "-pathology", + "dementia", + "-dataset", + "adni", + "-filter", + "", + ] + result = get_algorithm_result(NaiveBayes, nb_args) + result = result["result"][2]["data"] + return render_template("highchart_layout.html", title="NaiveBayes ROC", data=result) + + if __name__ == "__main__": app.run(debug=True) From 59d3923b7e63636382c3c50089c8251279bfa4c9 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Thu, 7 Jan 2021 18:14:27 +0200 Subject: [PATCH 09/14] Add various output stats and graphs to NB --- .../mip-algorithms/NAIVE_BAYES/naive_bayes.py | 84 ++++++++++++- .../mipframework/funclib/crossvalidation.py | 114 ++++++++++++++++-- .../mipframework/hichart_server/app.py | 58 ++++----- 3 files changed, 203 insertions(+), 53 deletions(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py index 402b8f072..35514275d 
100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py @@ -12,8 +12,10 @@ from mipframework import Algorithm from mipframework import AlgorithmResult +from mipframework import TabularDataResource from mipframework.funclib.crossvalidation import kfold_split_design_matrices from mipframework.funclib.crossvalidation import AdditiveMulticlassROCCurve +from mipframework.funclib.crossvalidation import AdditiveMulticlassClassificationReport from mipframework.highcharts.user_defined import MultilabelConfisionMatrix from mipframework.highcharts.user_defined import MulticlassROCCurve @@ -47,7 +49,9 @@ def local_init(self): train_sets, test_sets = kfold_split_design_matrices( n_splits, *matrices_to_split ) - models = [MixedAdditiveNB(float(self.parameters.alpha))] * n_splits + models = [ + MixedAdditiveNB(float(self.parameters.alpha)) for _ in range(n_splits) + ] if xtypes == "numerical": [m.fit(yt, X_num=Xt) for m, (yt, Xt) in zip(models, train_sets)] elif xtypes == "categorical": @@ -124,28 +128,100 @@ def local_final(self): y_true=y, y_pred_proba_per_class=y_pred_proba_per_class, classes=classes ) + classification_report = AdditiveMulticlassClassificationReport( + y_true=y, y_pred=y_pred, classes=classes + ) + + self.push_and_add(n_obs=n_obs) self.push_and_add(confusion_matrix=confusion_matrix) self.push_and_add(accuracy=Mediant(accuracy * n_obs, n_obs)) self.push_and_add(roc_curve=roc_curve) + self.push_and_add(classification_report=classification_report) def global_final(self): classes = self.load("classes") confusion_matrix = self.fetch("confusion_matrix") + accuracy = self.fetch("accuracy").get_value() + n_obs = self.fetch("n_obs") + accuracy_ci = 1.96 * np.sqrt((accuracy * (1 - accuracy)) / n_obs) + roc_curves = self.fetch("roc_curve").get_curves() + ( + precision, + recall, + specificity, + f_score, + precision_avgs, + recall_avgs, + specificity_avgs, + f_score_avgs, + ) = 
self.fetch("classification_report").get_values() + precision = precision.tolist() + recall = recall.tolist() + specificity = specificity.tolist() + f_score = f_score.tolist() cm_chart = MultilabelConfisionMatrix( "Confusion Matrix", confusion_matrix, classes.tolist() ) + + aucs = [] + ginis = [] + for tpr, fpr in roc_curves: + auc = np.trapz(tpr, fpr) + gini = 2 * auc - 1 + aucs.append(auc) + ginis.append(gini) + roc_chart = MulticlassROCCurve("ROC", roc_curves, classes) + accuracy_report = TabularDataResource( + fields=["Statistic", "Value"], + data=list( + zip( + *[ + ["Accuracy", "Lower c.i.", "Upper c.i."], + [accuracy, accuracy - accuracy_ci, accuracy + accuracy_ci], + ] + ) + ), + title="Overall classification statistics", + ) + + clf_report = TabularDataResource( + fields=["", "Precision", "Recall", "Specificity", "F score"], + data=list( + zip( + *[ + classes.tolist() + ["micro avg", "macro avg", "weighted avg"], + precision + precision_avgs, + recall + recall_avgs, + specificity + specificity_avgs, + f_score + f_score_avgs, + ] + ) + ), + title="Classification Report", + ) + + roc_report = TabularDataResource( + fields=["Class", "AUC", "Gini coefficient"], + data=list(zip(*[classes.tolist(), aucs, ginis])), + title="ROC report", + ) + self.result = AlgorithmResult( raw_data={ "accuracy": accuracy, "confusion_matrix": confusion_matrix.tolist(), "roc_curve": roc_curves, "classes": classes.tolist(), + "precision": precision, + "recall": recall, + "f_score": f_score, }, + tables=[clf_report, roc_report, accuracy_report], highcharts=[cm_chart, roc_chart], ) @@ -425,15 +501,15 @@ def get_value(self): algorithm_args = [ "-x", - "lefthippocampus,righthippocampus,leftaccumbensarea", + # "lefthippocampus,righthippocampus,leftaccumbensarea", # "gender,apoe4,agegroup", - # "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", + "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", "-y", "alzheimerbroadcategory", "-alpha", 
"1", "-k", - "2", + "10", "-pathology", "dementia", "-dataset", diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py index fdcf7973b..9a90d8386 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/funclib/crossvalidation.py @@ -157,17 +157,105 @@ def get_curves(self): return curves -if __name__ == "__main__": - classes = np.array(["AD", "CN", "Other"]) - y_true = np.array(["AD", "AD", "Other", "CN", "CN", "AD"]) - y_pred_proba_per_class = np.array( - [ - [0.6, 0.2, 0.1], - [0.38, 0.42, 0.2], - [0.4, 0.1, 0.5], - [0.3, 0.3, 0.4], - [0.2, 0.45, 0.35], - [0.5, 0.3, 0.2], +class AdditiveMulticlassClassificationReport(object): + def __init__( + self, + y_true=None, + y_pred=None, + classes=None, + class_count=None, + tp=None, + tn=None, + fp=None, + fn=None, + ): + if tp is not None and tn is not None and fp is not None and fn is not None: + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn + self.classes = classes + self.class_count = class_count + + elif (tp, tn, fp, fn) == (None, None, None, None): + if len(y_true.shape) > 1: + y_true = y_true.flatten() + self.classes = classes[:, np.newaxis] + _, self.class_count = np.unique(y_true, return_counts=True) + self.tp = ((y_true == self.classes) & (y_pred == self.classes)).sum(axis=1) + self.tn = ((y_true != self.classes) & (y_pred != self.classes)).sum(axis=1) + self.fp = ((y_true != self.classes) & (y_pred == self.classes)).sum(axis=1) + self.fn = ((y_true == self.classes) & (y_pred != self.classes)).sum(axis=1) + + def __add__(self, other): + return AdditiveMulticlassClassificationReport( + tp=self.tp + other.tp, + tn=self.tn + other.tn, + fp=self.fp + other.fp, + fn=self.fn + other.fn, + classes=self.classes, + class_count=self.class_count + other.class_count, + ) + + def get_values(self): + tp_sum = sum(self.tp) + 
tn_sum = sum(self.tn) + fp_sum = sum(self.fp) + fn_sum = sum(self.fn) + + precision = self.tp / (self.tp + self.fp) + precision[np.isnan(precision)] = 1.0 + precision_micro_avg = tp_sum / (tp_sum + fp_sum) + precision_micro_avg = ( + 1.0 if np.isnan(precision_micro_avg) else precision_micro_avg + ) + precicion_avgs = [ + precision_micro_avg, + precision.mean(), + np.average(precision, weights=self.class_count), ] - ) - roc = MulticlassROCCurve(y_true, y_pred_proba_per_class, classes) + + recall = self.tp / (self.tp + self.fn) + recall[np.isnan(recall)] = 1.0 + recall_micro_avg = tp_sum / (tp_sum + fn_sum) + recall_micro_avg = 1.0 if np.isnan(recall_micro_avg) else recall_micro_avg + recall_avgs = [ + recall_micro_avg, + recall.mean(), + np.average(recall, weights=self.class_count), + ] + + specificity = self.tn / (self.tn + self.fp) + specificity[np.isnan(specificity)] = 1.0 + specificity_micro_avg = tn_sum / (tn_sum + fp_sum) + specificity_micro_avg = ( + 1.0 if np.isnan(specificity_micro_avg) else specificity_micro_avg + ) + specificity_avgs = [ + specificity_micro_avg, + specificity.mean(), + np.average(specificity, weights=self.class_count), + ] + + f_score = 2.0 * (precision * recall) / (precision + recall) + f_score_micro_avg = ( + 2.0 + * (precision_micro_avg * recall_micro_avg) + / (precision_micro_avg + recall_micro_avg) + ) + f_score_avgs = [ + f_score_micro_avg, + f_score.mean(), + np.average(f_score, weights=self.class_count), + ] + + return ( + precision, + recall, + specificity, + f_score, + precicion_avgs, + recall_avgs, + specificity_avgs, + f_score_avgs, + ) diff --git a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py index ed4f075f8..e34a24417 100644 --- a/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py +++ b/Exareme-Docker/src/mip-algorithms/mipframework/hichart_server/app.py @@ -297,26 +297,30 @@ def kaplan_meier_survival(): return 
render_template("highchart_layout.html", title="Kaplan Meier", data=result,) +nb_args = [ + "-x", + # "lefthippocampus,righthippocampus,leftaccumbensarea", + # "gender,alzheimerbroadcategory,agegroup", + "lefthippocampus,righthippocampus,leftaccumbensarea,apoe4,alzheimerbroadcategory", + "-y", + "agegroup", + "-alpha", + "1", + "-k", + "10", + "-pathology", + "dementia", + "-dataset", + "adni, ppmi", + "-filter", + "", +] + + @app.route("/naive_bayes_confusion_matrix") def naive_bayes_confusion_matrix(): - nb_args = [ - "-x", - "lefthippocampus,righthippocampus,leftaccumbensarea,gender,agegroup,apoe4", - "-y", - "alzheimerbroadcategory", - "-alpha", - "1", - "-k", - "2", - "-pathology", - "dementia", - "-dataset", - "adni", - "-filter", - "", - ] result = get_algorithm_result(NaiveBayes, nb_args) - result = result["result"][1]["data"] + result = result["result"][4]["data"] return render_template( "highchart_layout.html", title="NaiveBayes Confusion Martix", data=result ) @@ -324,26 +328,8 @@ def naive_bayes_confusion_matrix(): @app.route("/naive_bayes_roc") def naive_bayes_roc(): - nb_args = [ - "-x", - # "lefthippocampus,righthippocampus,leftaccumbensarea", - # "gender,apoe4,agegroup", - "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", - "-y", - "alzheimerbroadcategory", - "-alpha", - "1", - "-k", - "2", - "-pathology", - "dementia", - "-dataset", - "adni", - "-filter", - "", - ] result = get_algorithm_result(NaiveBayes, nb_args) - result = result["result"][2]["data"] + result = result["result"][5]["data"] return render_template("highchart_layout.html", title="NaiveBayes ROC", data=result) From efbf53d75c6f2c25ae5ccfacc984b784147232e2 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Fri, 8 Jan 2021 10:05:20 +0200 Subject: [PATCH 10/14] Full implementation of NB --- .../src/mip-algorithms/NAIVE_BAYES/1/__init__.py | 0 .../src/mip-algorithms/NAIVE_BAYES/1/global.py | 10 ++++++++++ .../src/mip-algorithms/NAIVE_BAYES/1/local.py | 10 
++++++++++ .../src/mip-algorithms/NAIVE_BAYES/2/__init__.py | 0 .../src/mip-algorithms/NAIVE_BAYES/2/global.py | 10 ++++++++++ .../src/mip-algorithms/NAIVE_BAYES/2/local.py | 10 ++++++++++ .../src/mip-algorithms/NAIVE_BAYES/__init__.py | 3 +++ 7 files changed, 43 insertions(+) create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/__init__.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/global.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/local.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/__init__.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/global.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/local.py create mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/__init__.py diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/__init__.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/global.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/global.py new file mode 100644 index 000000000..9bff44ede --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/global.py @@ -0,0 +1,10 @@ +import sys +from NAIVE_BAYES import NaiveBayes + + +def main(args): + NaiveBayes(args[1:]).global_init() + + +if __name__ == "__main__": + NaiveBayes(sys.argv[1:]).global_init() diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/local.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/local.py new file mode 100644 index 000000000..efa3c8cae --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/1/local.py @@ -0,0 +1,10 @@ +import sys +from NAIVE_BAYES import NaiveBayes + + +def main(args): + NaiveBayes(args[1:]).local_init() + + +if __name__ == "__main__": + NaiveBayes(sys.argv[1:]).local_init() diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/__init__.py 
b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/global.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/global.py new file mode 100644 index 000000000..03107ca0e --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/global.py @@ -0,0 +1,10 @@ +import sys +from NAIVE_BAYES import NaiveBayes + + +def main(args): + NaiveBayes(args[1:]).global_final() + + +if __name__ == "__main__": + NaiveBayes(sys.argv[1:]).global_final() diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/local.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/local.py new file mode 100644 index 000000000..dc7fa2a4b --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/2/local.py @@ -0,0 +1,10 @@ +import sys +from NAIVE_BAYES import NaiveBayes + + +def main(args): + NaiveBayes(args[1:]).local_final() + + +if __name__ == "__main__": + NaiveBayes(sys.argv[1:]).local_final() diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/__init__.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/__init__.py new file mode 100644 index 000000000..1561642fd --- /dev/null +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/__init__.py @@ -0,0 +1,3 @@ +from .naive_bayes import NaiveBayes + +__all__ = ["NaiveBayes"] From 3189c7dd8cd2aa6f567bde535d3fe38df30b9fd0 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 08:01:43 +0000 Subject: [PATCH 11/14] Naive Bayes properties fixes. 
--- .../src/mip-algorithms/NAIVE_BAYES/properties.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json index a6cfdb528..2390d1b18 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json @@ -34,7 +34,7 @@ "label": "alpha", "desc": "Additive smoothing parameter (0 for no smoothing)", "type": "other", - "value": 0.1, + "value": "0.1", "valueNotBlank": true, "valueMultiple": false, "valueType": "real" @@ -43,10 +43,10 @@ "label": "number of batches", "desc": "The number of batches that will be used in k-fold crossvalidation.", "type": "other", - "value": 10, + "value": "10", "valueNotBlank": true, "valueMultiple": false, - "valueType": "int", + "valueType": "integer", "valueMin": 2 }, { "name": "pathology", From 780fecebc5505698a0549f6196ec0cb7d3add1a3 Mon Sep 17 00:00:00 2001 From: ThanKarab Date: Mon, 11 Jan 2021 08:15:48 +0000 Subject: [PATCH 12/14] Changed default NAIVE BAYES dataset. 
--- Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json index 2390d1b18..334684d05 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/properties.json @@ -62,7 +62,7 @@ "label": "dataset", "desc": "The names of one or more datasets, in which the algorithm will be executed.", "type": "dataset", - "value": "desd-synthdata", + "value": "edsd", "valueNotBlank": true, "valueMultiple": true, "valueType": "string" From d2bee671c60ae632b0db11e99cc1f41735212d07 Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Sat, 16 Jan 2021 10:39:54 +0200 Subject: [PATCH 13/14] Fix wrong AUC calculation --- Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py index 35514275d..31e81b875 100644 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py +++ b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes.py @@ -168,7 +168,7 @@ def global_final(self): aucs = [] ginis = [] - for tpr, fpr in roc_curves: + for fpr, tpr in roc_curves: auc = np.trapz(tpr, fpr) gini = 2 * auc - 1 aucs.append(auc) From 3be424832e40161de5eab7566e295f70fb7e38fb Mon Sep 17 00:00:00 2001 From: Jason Sakellariou Date: Mon, 18 Jan 2021 10:30:54 +0200 Subject: [PATCH 14/14] Remove naive_bayes_nocv from repo --- .../NAIVE_BAYES/naive_bayes_nocv.py | 401 ------------------ 1 file changed, 401 deletions(-) delete mode 100644 Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py diff --git a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py b/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py 
deleted file mode 100644 index 77a1cf001..000000000 --- a/Exareme-Docker/src/mip-algorithms/NAIVE_BAYES/naive_bayes_nocv.py +++ /dev/null @@ -1,401 +0,0 @@ -from __future__ import print_function -from __future__ import division -from __future__ import unicode_literals - -from collections import Counter -import warnings - -import numpy as np -from sklearn.naive_bayes import GaussianNB -from sklearn.naive_bayes import BaseDiscreteNB - -from mipframework import Algorithm -from mipframework import AlgorithmResult - - -class MixedAdditiveNB(object): - def __init__(self, alpha=1.0): - self.alpha = alpha - self.gnb = None - self.cnb = None - - def fit(self, X_num=None, X_cat=None, y=None): - if X_num is not None: - self.gnb = AdditiveGaussianNB() - self.gnb.fit(X_num, y) - if X_cat is not None: - self.cnb = AdditiveCategoricalNB(alpha=self.alpha) - self.cnb.fit(X_cat, y) - - def predict(self, X_num, X_cat): - if X_num is not None and X_cat is not None: - jll = ( - self.gnb.predict_log_proba(X_num) - + self.cnb.predict_log_proba(X_cat) - - self.gnb.class_log_prior_ - ) - return np.array([self.gnb.classes_[i] for i in jll.argmax(axis=1)]) - elif X_num is not None: - return self.gnb.predict(X_num) - elif X_cat is not None: - return self.cnb.predict(X_cat) - - def __add__(self, other): - result = MixedAdditiveNB() - if self.gnb and other.gnb: - result.gnb = self.gnb + other.gnb - if self.cnb and other.cnb: - result.alpha = self.alpha - result.cnb = self.cnb + other.cnb - return result - - -class MixedNaiveBayesTrain(Algorithm): - def __init__(self, cli_args): - super(MixedNaiveBayesTrain, self).__init__(__file__, cli_args, intercept=False) - - def local_(self): - data = self.data.full - y, X = data[self.parameters.y], data[self.parameters.x] - X_num = np.array(X.iloc[:, :3]) - X_cat = np.array(X.iloc[:, 3:]) - y = np.array(y) - mnb = MixedAdditiveNB() - mnb.fit(X_num, X_cat, y) - pass - - -def run_mixed(): - import time - from mipframework import create_runner - - 
algorithm_args = [ - "-x", - "lefthippocampus,righthippocampus,leftaccumbensarea,gender,apoe4,agegroup", - "-y", - "alzheimerbroadcategory", - "-alpha", - "1", - "-k", - "1", - "-pathology", - "dementia", - "-dataset", - "adni", - "-filter", - "", - ] - runner = create_runner( - MixedNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, - ) - start = time.time() - runner.run() - end = time.time() - print("Completed in ", end - start) - - -class CategoricalNaiveBayesTrain(Algorithm): - def __init__(self, cli_args): - super(CategoricalNaiveBayesTrain, self).__init__( - __file__, cli_args, intercept=False - ) - - def local_(self): - data = self.data.full - y, X = data[self.parameters.y], data[self.parameters.x] - y, X = np.array(y), np.array(X) - cnb = AdditiveCategoricalNB() - cnb.fit(X, y) - self.push_and_add(cnb=cnb) - - def global_(self): - cnb = self.fetch("cnb") - - self.result = AlgorithmResult( - raw_data={"category_count": [cc.tolist() for cc in cnb.category_count_]} - ) - - -class AdditiveCategoricalNB(BaseDiscreteNB): - def __init__(self, alpha=1.0): - self.alpha = alpha - self._class_log_prior_ = np.array([]) - self._feature_log_prob_ = [] - - def fit(self, X, y): - self.n_obs_, self.n_features_ = X.shape - self.classes_, self.class_count_ = np.unique(y, return_counts=True) - self.n_classes_ = len(self.classes_) - self.categories_, self.category_per_feat_count_ = list( - zip(*[np.unique(col, return_counts=True) for col in X.T]) - ) - self.n_categories_ = np.array([len(c) for c in self.categories_]) - self.category_count_ = [ - np.empty((self.n_classes_, self.n_categories_[f])) - for f in xrange(self.n_features_) - ] - for ci, c in enumerate(self.classes_): - X_where_x = X[np.where(y == c)[0]] - for fi, feature in enumerate(X_where_x.T): - counter = Counter(feature) - self.category_count_[fi][ci, :] = np.array( - [counter[cat] for cat in self.categories_[fi]] - ) - - def __add__(self, other): - def sum_elementwise(x, y): - return [xi + yi for xi, 
yi in zip(x, y)] - - if self.alpha != other.alpha: - raise ValueError("alphas do not agree") - result = AdditiveCategoricalNB(alpha=self.alpha) - - result.n_obs_ = self.n_obs_ + other.n_obs_ - - if self.n_features_ != other.n_features_: - raise ValueError("n_features_ do not agree") - result.n_features_ = self.n_features_ - - if (self.classes_ != other.classes_).all(): - raise ValueError("classes_ do not agree") - result.classes_ = self.classes_ - - result.class_count_ = self.class_count_ + other.class_count_ - - if self.n_classes_ != other.n_classes_: - raise ValueError("n_classes_ do not agree") - result.n_classes_ = self.n_classes_ - - result.category_per_feat_count_ = sum_elementwise( - self.category_per_feat_count_, other.category_per_feat_count_ - ) - - if not all( - [(c1 == c2).all() for c1, c2 in zip(self.categories_, other.categories_)] - ): - raise ValueError("catefories_ do not agree") - result.categories_ = self.categories_ - - result.n_categories_ = sum_elementwise(self.n_categories_, other.n_categories_) - - result.category_count_ = sum_elementwise( - self.category_count_, other.category_count_ - ) - - return result - - @property - def class_log_prior_(self): - if not self._class_log_prior_.any(): - with warnings.catch_warnings(): - # silence the warning when count is 0 because class was not yet - # observed - warnings.simplefilter("ignore", RuntimeWarning) - log_class_count = np.log(self.class_count_) - self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) - return self._class_log_prior_ - - @property - def feature_log_prob_(self): - if not self._feature_log_prob_: - feature_log_prob = [] - for i in range(self.n_features_): - smoothed_cat_count = self.category_count_[i] + self.alpha - smoothed_class_count = smoothed_cat_count.sum(axis=1) - feature_log_prob.append( - np.log(smoothed_cat_count) - - np.log(smoothed_class_count.reshape(-1, 1)) - ) - self._feature_log_prob_ = feature_log_prob - return self._feature_log_prob_ - - def 
_joint_log_likelihood(self, X): - if not X.shape[1] == self.n_features_: - raise ValueError( - "Expected input with %d features, got %d instead" - % (self.n_features_, X.shape[1]) - ) - jll = np.zeros((X.shape[0], self.class_count_.shape[0])) - for i in range(self.n_features_): - categories = X[:, i] - indices = [np.where(self.categories_[i] == cat)[0][0] for cat in categories] - jll += self.feature_log_prob_[i][:, indices].T - total_ll = jll + self.class_log_prior_ - return total_ll - - def __eq__(self, other): - raise NotImplementedError - - -def run_categorical(): - import time - from mipframework import create_runner - - algorithm_args = [ - "-x", - "gender,apoe4,agegroup", - "-y", - "alzheimerbroadcategory", - "-alpha", - "1", - "-k", - "1", - "-pathology", - "dementia", - "-dataset", - "adni", - "-filter", - "", - ] - runner = create_runner( - CategoricalNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=10, - ) - start = time.time() - runner.run() - end = time.time() - print("Completed in ", end - start) - - -class GaussianNaiveBayesTrain(Algorithm): - def __init__(self, cli_args): - super(GaussianNaiveBayesTrain, self).__init__( - __file__, cli_args, intercept=False - ) - - def local_(self): - data = self.data.full - y, X = data[self.parameters.y], data[self.parameters.x] - y, X = np.array(y), np.array(X) - gnb = AdditiveGaussianNB() - gnb.fit(X, y) - self.push_and_add(gnb=gnb) - - def global_(self): - gnb = self.fetch("gnb") - - self.result = AlgorithmResult( - raw_data={"theta": gnb.theta_.tolist(), "sigma": gnb.sigma_.tolist()} - ) - - -class AdditiveGaussianNB(GaussianNB): - def fit(self, X, y): - self.n_obs_, self.n_feats_ = X.shape - self._class_log_prior_ = np.array([]) - super(AdditiveGaussianNB, self).fit(X, y) - - @property - def class_log_prior_(self): - if not self._class_log_prior_.any(): - with warnings.catch_warnings(): - # silence the warning when count is 0 because class was not yet - # observed - warnings.simplefilter("ignore", 
RuntimeWarning) - log_class_count = np.log(self.class_count_) - self._class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) - return self._class_log_prior_ - - def __add__(self, other): - if self.var_smoothing != other.var_smoothing: - raise ValueError("var_smoothing values do not agree") - if self.priors != other.priors: - raise ValueError("priors do not agree") - if (self.classes_ != other.classes_).all(): - raise ValueError("classes_ do not agree") - - class_count_1 = self.class_count_[:, np.newaxis] - class_count_2 = other.class_count_[:, np.newaxis] - n_obs_total = self.n_obs_ + other.n_obs_ - class_count_total = class_count_1 + class_count_2 - - theta_total = ( - class_count_1 * self.theta_ + class_count_2 * other.theta_ - ) / class_count_total - - self.sigma_[:, :] -= self.epsilon_ - other.sigma_[:, :] -= other.epsilon_ - epsilon_total = max(self.epsilon_, other.epsilon_) - ssd_1 = class_count_1 * self.sigma_ - ssd_2 = class_count_2 * other.sigma_ - total_ssd = ( - ssd_1 - + ssd_2 - + (class_count_1 * class_count_2 / class_count_total) - * (self.theta_ - other.theta_) ** 2 - ) - sigma_total = total_ssd / class_count_total - sigma_total += epsilon_total - - result = AdditiveGaussianNB(self.priors, self.var_smoothing) - result.n_obs_ = n_obs_total - result.classes_ = self.classes_ - result.sigma_ = sigma_total - result.theta_ = theta_total - result.epsilon_ = epsilon_total - result.class_count_ = class_count_total.flatten() - result.class_prior_ = result.class_count_ / n_obs_total - return result - - def __eq__(self, other): - if self.var_smoothing != other.var_smoothing: - return False - if self.priors != other.priors: - return False - if (self.classes_ != other.classes_).all(): - return False - if not np.isclose(self.theta_, other.theta_).all(): - return False - if not np.isclose(self.sigma_, other.sigma_).all(): - return self.sigma_, other.sigma_ - if (self.class_count_ != other.class_count_).all(): - return False - if (self.class_prior_ != 
other.class_prior_).all(): - return False - if self.n_obs_ != other.n_obs_: - return False - if self.n_feats_ != other.n_feats_: - return False - return True - - -def run_gaussian(): - import time - from mipframework import create_runner - - algorithm_args = [ - "-x", - "lefthippocampus,righthippocampus,leftaccumbensarea," - "leftacgganteriorcingulategyrus,leftainsanteriorinsula,leftamygdala", - "-y", - "alzheimerbroadcategory", - "-alpha", - "1", - "-k", - "1", - "-pathology", - "dementia", - "-dataset", - "adni", - "-filter", - "", - ] - runner = create_runner( - GaussianNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, - ) - start = time.time() - runner.run() - end = time.time() - print("Completed in ", end - start) - runner = create_runner( - GaussianNaiveBayesTrain, algorithm_args=algorithm_args, num_workers=1, - ) - start = time.time() - runner.run() - end = time.time() - print("Completed in ", end - start) - - -if __name__ == "__main__": - # run_gaussian() - # run_categorical() - run_mixed()