diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/AlgorithmProperties.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/AlgorithmProperties.java
index f25435f31..b2c0af8ea 100644
--- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/AlgorithmProperties.java
+++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/AlgorithmProperties.java
@@ -264,6 +264,8 @@ private static void validateAlgorithmParameterValueType(
             ParameterProperties parameterProperties
     ) throws AlgorithmException, BadUserInputException {
         if (parameterProperties.getValueType().equals(ParameterProperties.ParameterValueType.json)) {
+            if (value.equals(""))
+                return;
             try {
                 new JSONObject(value);
             } catch (JSONException ex) {
diff --git a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/ParameterProperties.java b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/ParameterProperties.java
index fd172ab2a..4308f62fa 100644
--- a/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/ParameterProperties.java
+++ b/Exareme-Docker/src/exareme/exareme-master/src/main/java/madgik/exareme/master/queryProcessor/HBP/ParameterProperties.java
@@ -21,6 +21,7 @@ public class ParameterProperties {
     public enum ParameterType {
         column,                 // used for selecting specific columns from the database
         formula,                // used for parsing the input as a formula of R, '+ - * : 0' are allowed.
+        formula_description,    // used for providing a formula description to the algorithm in json format
         filter,                 // used for filtering on the database input
         dataset,                // used for choosing database input
         pathology,              // used for specifying what database to use
@@ -33,7 +34,6 @@ public enum ParameterValueType {
         real,
         json
     }
-
     public ParameterProperties() {
     }
 
@@ -50,13 +50,15 @@ public void validateParameterPropertiesInitialization(String algorithmName) throws AlgorithmException {
         if (type == null) {
             throw new AlgorithmException(algorithmName, "The parameter field 'type' was not initialized in the properties.json file.");
         } else if (type.equals(ParameterType.column) || type.equals(ParameterType.formula)) {
-            if (columnValuesSQLType == null) {
-            }
-
             if (columnValuesIsCategorical == null) {
                 throw new AlgorithmException(algorithmName, "The parameter field 'columnValuesIsCategorical' was not initialized in the properties.json file.");
             }
-        } else if (valueType.equals(ParameterValueType.json)){
+        } else if (type.equals(ParameterType.formula_description)) {
+            if (!valueType.equals(ParameterValueType.json)) {
+                throw new AlgorithmException(algorithmName, "The parameter field 'valueType' must be json since the 'type' is formula_description.");
+            }
+        }
+        if (valueType.equals(ParameterValueType.json)) {
             if(valueMultiple) {
                 throw new AlgorithmException(algorithmName, "The parameter field 'valueMultiple' cannot be true because the 'valueType' is json.");
             }
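Note on the formula_description plumbing above: the formula now arrives as a JSON description (validated as JSON, and allowed to be empty so the parameter stays optional) instead of a raw patsy string. Below is a minimal, hypothetical sketch of how such a description could be turned into a patsy formula; the real converter is mipframework.formula.generate_formula, which is not part of this diff, so the exact output format is an assumption.

    import json

    def sketch_generate_formula(formula_data):
        # Hypothetical stand-in for mipframework.formula.generate_formula,
        # for illustration only. Assumes every "single" term carries a
        # var_name and a unary_operation, and that each interaction is a
        # list of variable names.
        description = json.loads(formula_data)
        terms = [
            "np.{0}({1})".format(term["unary_operation"], term["var_name"])
            for term in description["single"]
        ]
        terms += [":".join(interaction) for interaction in description["interactions"]]
        return " + ".join(terms)

    # Example value taken from DESCRIPTIVE_STATS/properties.json below:
    formula_json = (
        '{"single":[{"var_name":"lefthippocampus","unary_operation":"log"},'
        '{"var_name":"righthippocampus","unary_operation":"exp"}],"interactions":[]}'
    )
    print(sketch_generate_formula(formula_json))
    # -> np.log(lefthippocampus) + np.exp(righthippocampus)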
diff --git a/Exareme-Docker/src/mip-algorithms/.gitignore b/Exareme-Docker/src/mip-algorithms/.gitignore
index 26f176ada..884353c99 100644
--- a/Exareme-Docker/src/mip-algorithms/.gitignore
+++ b/Exareme-Docker/src/mip-algorithms/.gitignore
@@ -8,7 +8,7 @@
 
 # db files
 *.db
-# !mipframework/runner/dbs/datasets.db
+!mipframework/runner/dbs/datasets.db
 # !mipframework/runner/dbs/1LocalDBs/local_dataset0.db
 # !mipframework/runner/dbs/10LocalDBs/local_dataset0.db
 # !mipframework/runner/dbs/10LocalDBs/local_dataset1.db
diff --git a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/__init__.py b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/__init__.py
index 94aeba881..4bd104ba0 100644
--- a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/__init__.py
+++ b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/__init__.py
@@ -1,3 +1,3 @@
-from descriptive_stats import DescriptiveStats
+from .descriptive_stats import DescriptiveStats
 
 __all__ = ["DescriptiveStats"]
diff --git a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/descriptive_stats.py b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/descriptive_stats.py
index 7e083eb35..0d9f9b159 100644
--- a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/descriptive_stats.py
+++ b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/descriptive_stats.py
@@ -2,11 +2,16 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from itertools import ifilterfalse, ifilter
 from collections import Counter
+import re
+
+import numpy as np  # type: ignore XXX needed for patsy to know how to take logs and exps
+import pandas as pd
+import patsy
 
 from mipframework import Algorithm, AlgorithmResult
 from mipframework.constants import PRIVACY_THRESHOLD
+from mipframework.formula import generate_formula
 
 
 class DescriptiveStats(Algorithm):
@@ -16,205 +21,297 @@ def __init__(self, cli_args):
         )
 
     def local_(self):
-        numericals = list(
-            ifilterfalse(
-                lambda var: self.metadata.is_categorical[var], self.parameters.y
-            )
-        )
-        categoricals = list(
-            ifilter(lambda var: self.metadata.is_categorical[var], self.parameters.y)
-        )
-        self.push_and_agree(numericals=numericals)
-        self.push_and_agree(categoricals=categoricals)
-        self.push_and_agree(labels=self.metadata.label)
-        # Single variables
-        df = self.data.full
-        var_names = self.parameters.var_names
-        datasets = self.parameters.dataset
-
-        for var_name in var_names:
+        is_categorical = self.metadata.is_categorical  # type: ignore
+        var_names = self.parameters.var_names  # type: ignore
+        datasets = self.parameters.dataset  # type: ignore
+        data = self.data.full  # type: ignore
+        labels = self.metadata.label  # type: ignore
+
+        all_single_stats = MonoidMapping()
+        for varname in var_names:
             for dataset in datasets:
-                if var_name != "dataset":
-                    varlst = [var_name, "dataset"]
-                else:
-                    varlst = [var_name]
-                single_df = df[varlst]
-                single_df = single_df.dropna()
-                single_df = single_df[single_df.dataset == dataset]
-                single_df = single_df[var_name]
-                n_obs = len(single_df)
-                kwarg = {"single__" + "n_obs_" + var_name + "_" + dataset: n_obs}
-                self.push_and_add(**kwarg)
-                if var_name in numericals:
-                    X = single_df
-                    if n_obs <= PRIVACY_THRESHOLD:
-                        sx, sxx, min_, max_ = 0, 0, int(1e9), -int(1e9)
-                    else:
-                        sx = X.sum()
-                        sxx = (X * X).sum()
-                        min_ = X.min()
-                        max_ = X.max()
-                    kwarg = {"single__" + "sx_" + var_name + "_" + dataset: sx}
-                    self.push_and_add(**kwarg)
-                    kwarg = {"single__" + "sxx_" + var_name + "_" + dataset: sxx}
-                    self.push_and_add(**kwarg)
-                    kwarg = {"single__" + "min_" + var_name + "_" + dataset: min_}
-                    self.push_and_min(**kwarg)
-                    kwarg = {"single__" + "max_" + var_name + "_" + dataset: max_}
-                    self.push_and_max(**kwarg)
-                elif var_name in categoricals:
-                    if n_obs <= PRIVACY_THRESHOLD:
-                        counter = Counter()
-                    else:
-                        counter = Counter(single_df)
-                    kwarg = {
-                        "single__" + "counter_" + var_name + "_" + dataset: counter
-                    }
-                    self.push_and_add(**kwarg)
-
-        # Set of variables
-        data = self.data.full.dropna()
+                var_df = get_df_for_single_var(varname, data, dataset)
+                stats = get_single_stats_monoid(var_df, is_categorical[varname])
+                all_single_stats[varname, dataset] = stats
+        self.push_and_add(all_single_stats=all_single_stats)
+
+        if self.parameters.formula:  # type: ignore
+            formula = generate_formula(formula_data=self.parameters.formula)  # type: ignore
+            dummy = get_formula_transformed_data(formula, data)
+            included_columns = [
+                column for column in dummy.columns if column != "dataset"
+            ]
+            group_var_names = included_columns
+            is_categorical = get_extended_iscategorical(
+                is_categorical,
+                included_columns,
+            )
+            labels = get_extended_labels(labels, included_columns)
+        else:
+            formula = None
+            group_var_names = var_names
+
+        all_group_stats = MonoidMapping()
         for dataset in datasets:
             data_group = data[data.dataset == dataset]
-            n_obs = len(data_group)
-            self.push_and_add(**{"model__" + "n_obs_" + dataset: n_obs})
-            for var in numericals + categoricals:
-                if var in numericals:
-                    numerical = var
-                    numvar = data_group[numerical]
-                    if n_obs <= PRIVACY_THRESHOLD:
-                        sx, sxx, min_, max_ = 0, 0, int(1e9), -int(1e9)
-                    else:
-                        sx = numvar.sum()
-                        sxx = (numvar * numvar).sum()
-                        min_ = numvar.min()
-                        max_ = numvar.max()
-                    kwarg = {"model__" + "sx_" + numerical + "_" + dataset: sx}
-                    self.push_and_add(**kwarg)
-                    kwarg = {"model__" + "sxx_" + numerical + "_" + dataset: sxx}
-                    self.push_and_add(**kwarg)
-                    kwarg = {"model__" + "min_" + numerical + "_" + dataset: min_}
-                    self.push_and_min(**kwarg)
-                    kwarg = {"model__" + "max_" + numerical + "_" + dataset: max_}
-                    self.push_and_max(**kwarg)
-                elif var in categoricals:
-                    categorical = var
-                    if n_obs <= PRIVACY_THRESHOLD:
-                        counter = Counter()
-                    else:
-                        counter = Counter(data_group[categorical])
-                    kwarg = {
-                        "model__" + "counter_" + categorical + "_" + dataset: counter
-                    }
-                    self.push_and_add(**kwarg)
+            for varname in group_var_names:
+                if formula:
+                    stats = get_model_stats_monoid_from_formula(
+                        varname,
+                        data_group,
+                        is_categorical[varname],
+                        formula,
+                    )
+                else:
+                    stats = get_model_stats_monoid(
+                        varname,
+                        data_group,
+                        is_categorical[varname],
+                    )
+                all_group_stats[dataset, varname] = stats
+        self.push_and_add(all_group_stats=all_group_stats)
+        self.push_and_agree(var_names=var_names)
+        self.push_and_agree(is_categorical=is_categorical)
+        self.push_and_agree(labels=labels)
 
     def global_(self):
-        numericals = self.fetch("numericals")
-        categoricals = self.fetch("categoricals")
+        var_names = self.fetch("var_names")
+        is_categorical = self.fetch("is_categorical")
+        labels = self.fetch("labels")
+        datasets = self.parameters.dataset  # type: ignore
 
-        # global fields
-        raw_out = dict()
-        datasets = self.parameters.dataset
-
-        # Single variables
-        raw_out["single"] = dict()
-        for numerical in numericals:
-            raw_out["single"][numerical] = dict()
-            for dataset in datasets:
-                raw_out["single"][numerical][dataset] = dict()
-                n_obs = self.fetch("single__" + "n_obs_" + numerical + "_" + dataset)
-                if n_obs <= PRIVACY_THRESHOLD:
-                    raw_out["single"][numerical][dataset]["num_datapoints"] = n_obs
-                    raw_out["single"][numerical][dataset]["data"] = "NOT ENOUGH DATA"
-                else:
-                    sx = self.fetch("single__" + "sx_" + numerical + "_" + dataset)
-                    sxx = self.fetch("single__" + "sxx_" + numerical + "_" + dataset)
-                    min_ = self.fetch("single__" + "min_" + numerical + "_" + dataset)
-                    max_ = self.fetch("single__" + "max_" + numerical + "_" + dataset)
-                    mean = sx / n_obs
-                    std = ((sxx - n_obs * (mean ** 2)) / (n_obs - 1)) ** 0.5
-                    upper_ci = mean + std
-                    lower_ci = mean - std
-                    raw_out["single"][numerical][dataset]["num_datapoints"] = n_obs
-                    raw_out["single"][numerical][dataset]["data"] = {
-                        "mean": mean,
-                        "std": std,
-                        "min": min_,
-                        "max": max_,
-                        "upper_confidence": upper_ci,
-                        "lower_confidence": lower_ci,
-                    }
-        for categorical in categoricals:
-            raw_out["single"][categorical] = dict()
-            for dataset in datasets:
-                raw_out["single"][categorical][dataset] = dict()
-                n_obs = self.fetch("single__" + "n_obs_" + categorical + "_" + dataset)
-                if n_obs <= PRIVACY_THRESHOLD:
-                    raw_out["single"][categorical][dataset]["num_datapoints"] = n_obs
-                    raw_out["single"][categorical][dataset]["data"] = "NOT ENOUGH DATA"
-                else:
-                    counter = self.fetch(
-                        "single__" + "counter_" + categorical + "_" + dataset
-                    )
-                    raw_out["single"][categorical][dataset]["num_datapoints"] = n_obs
-                    raw_out["single"][categorical][dataset]["data"] = dict(counter)
+        raw_out = init_raw_out([labels[var] for var in var_names], datasets)
 
-        # Model
-        raw_out["model"] = dict()
-        for dataset in datasets:
-            n_obs = self.fetch("model__" + "n_obs_" + dataset)
-            raw_out["model"][dataset] = dict()
-            raw_out["model"][dataset]["data"] = dict()
-            raw_out["model"][dataset]["num_datapoints"] = n_obs
-            for numerical in numericals:
-                if n_obs <= PRIVACY_THRESHOLD:
-                    raw_out["model"][dataset]["data"][numerical] = "NOT ENOUGH DATA"
-                    continue
-                sx = self.fetch("model__" + "sx_" + numerical + "_" + dataset)
-                sxx = self.fetch("model__" + "sxx_" + numerical + "_" + dataset)
-                min_ = self.fetch("model__" + "min_" + numerical + "_" + dataset)
-                max_ = self.fetch("model__" + "max_" + numerical + "_" + dataset)
-                mean = sx / n_obs
-                std = ((sxx - n_obs * (mean ** 2)) / (n_obs - 1)) ** 0.5
-                upper_ci = mean + std
-                lower_ci = mean - std
-                raw_out["model"][dataset]["data"][numerical] = {
-                    "mean": mean,
-                    "std": std,
-                    "min": min_,
-                    "max": max_,
-                    "upper_confidence": upper_ci,
-                    "lower_confidence": lower_ci,
+        single_out = raw_out["single"]
+        all_single_stats = self.fetch("all_single_stats")
+        for (varname, dataset), single_stats in all_single_stats.items():
+            current_out = single_out[labels[varname]][dataset]
+            current_out["num_datapoints"] = single_stats.n_obs
+            current_out["num_nulls"] = single_stats.n_nulls
+            current_out["num_total"] = single_stats.n_obs + single_stats.n_nulls
+            if not single_stats.enough_data:
+                current_out["data"] = "NOT ENOUGH DATA"
+                continue
+            if is_categorical[varname]:
+                current_out["data"] = get_counts_and_percentages(single_stats.counter)
+            else:
+                current_out["data"] = {
+                    "mean": round(single_stats.mean, 2),
+                    "std": round(single_stats.std, 2),
+                    "min": round(single_stats.min_, 2),
+                    "max": round(single_stats.max_, 2),
                 }
-            for categorical in categoricals:
-                if n_obs <= PRIVACY_THRESHOLD:
-                    raw_out["model"][dataset]["data"][categorical] = "NOT ENOUGH DATA"
-                    continue
-                counter = self.fetch(
-                    "model__" + "counter_" + categorical + "_" + dataset
+
+        group_out = raw_out["model"]
+        all_group_stats = self.fetch("all_group_stats")
+        for (dataset, varname), group_stats in all_group_stats.items():
+            current_out = group_out[dataset]
+            group_stats = all_group_stats[dataset, varname]
+            current_out["num_datapoints"] = group_stats.n_obs
+            current_out["num_nulls"] = group_stats.n_nulls
+            current_out["num_total"] = group_stats.n_obs + group_stats.n_nulls
+            if not group_stats.enough_data:
+                current_out["data"][labels[varname]] = "NOT ENOUGH DATA"
+                continue
+            if is_categorical[varname]:
+                current_out["data"][labels[varname]] = get_counts_and_percentages(
+                    group_stats.counter
                 )
-                raw_out["model"][dataset]["data"][categorical] = dict(counter)
+            else:
+                current_out["data"][labels[varname]] = {
+                    "mean": round(group_stats.mean, 2),
+                    "std": round(group_stats.std, 2),
+                    "min": round(group_stats.min_, 2),
+                    "max": round(group_stats.max_, 2),
+                }
+
         self.result = AlgorithmResult(raw_data=raw_out)
 
 
-if __name__ == "__main__":
-    import time
-    from mipframework import create_runner
-
-    algorithm_args = [
-        "-y",
-        "rightphgparahippocampalgyrus, gender, alzheimerbroadcategory, rs10498633_t",
-        "-pathology",
-        "dementia",
-        "-dataset",
-        "lille_simulation, lille_simulation1",
-        "-filter",
-        "",
-    ]
-    runner = create_runner(
-        DescriptiveStats, algorithm_args=algorithm_args, num_workers=2,
+def get_formula_transformed_data(formula, data):
+    processed_data = patsy.dmatrix(
+        formula,
+        data,
+        return_type="dataframe",
     )
-    start = time.time()
-    runner.run()
-    end = time.time()
-    print("Completed in ", end - start)
+    del processed_data["Intercept"]
+    return processed_data
+
+
+def init_raw_out(varnames, datasets):
+    raw_out = dict()
+
+    raw_out["single"] = dict()
+    for varname in varnames:
+        raw_out["single"][varname] = dict()
+        for dataset in datasets:
+            raw_out["single"][varname][dataset] = dict()
+
+    raw_out["model"] = dict()
+    for dataset in datasets:
+        raw_out["model"][dataset] = dict()
+        raw_out["model"][dataset]["data"] = dict()
+    return raw_out
+
+
+def get_df_for_single_var(var_name, df, dataset):
+    if var_name != "dataset":
+        varlst = [var_name, "dataset"]
+    else:
+        varlst = [var_name]
+    df = df[varlst]
+    df = df[df.dataset == dataset]
+    df = df[var_name]
+    return df
+
+
+def get_extended_labels(labels, new_columns):
+    labels = dict(labels)
+    for column in new_columns:
+        if column not in labels:
+            labels[column] = column
+    formatted_labels = {}
+    for column, label in labels.items():
+        # Format id labels
+        label = re.sub(r"I(\([^()]+\))", r"\g<1>", label)
+        # Format numpy labels
+        label = re.sub(r"np.(exp|log)(.+)", r"\g<1>\g<2>", label)
+        # Format patsy labels
+        label = re.sub(r"patsy.(center|standardize)(.+)", r"\g<1>\g<2>", label)
+        formatted_labels[column] = label
+    return formatted_labels
+
+
+def get_extended_iscategorical(is_categorical, new_columns):
+    is_categorical = dict(is_categorical)
+    for column in new_columns:
+        if column not in is_categorical:
+            is_categorical[column] = 0
+    return is_categorical
+
+
+class NumericalVarStats(object):
+    def __init__(self, n_obs, n_nulls, sx, sxx, min_, max_):
+        self.n_obs = n_obs
+        self.n_nulls = n_nulls
+        self.enough_data = n_obs >= PRIVACY_THRESHOLD
+        self.sx = sx if self.enough_data else 0
+        self.sxx = sxx if self.enough_data else 0
+        self.min_ = min_ if self.enough_data else int(1e9)
+        self.max_ = max_ if self.enough_data else -int(1e9)
+
+    @property
+    def mean(self):
+        return self.sx / self.n_obs
+
+    @property
+    def std(self):
+        return ((self.sxx - self.n_obs * (self.mean ** 2)) / (self.n_obs - 1)) ** 0.5
+
+    @property
+    def upper_ci(self):
+        return self.mean + self.std
+
+    @property
+    def lower_ci(self):
+        return self.mean - self.std
+
+    def __add__(self, other):
+        return NumericalVarStats(
+            n_obs=self.n_obs + other.n_obs,
+            n_nulls=self.n_nulls + other.n_nulls,
+            sx=self.sx + other.sx,
+            sxx=self.sxx + other.sxx,
+            min_=min(self.min_, other.min_),
+            max_=max(self.max_, other.max_),
+        )
+
+
+class CategoricalVarStats(object):
+    def __init__(self, n_obs, n_nulls, counter):
+        self.n_obs = n_obs
+        self.n_nulls = n_nulls
+        self.enough_data = n_obs >= PRIVACY_THRESHOLD
+        self.counter = counter if self.enough_data else Counter()
+
+    def __add__(self, other):
+        return CategoricalVarStats(
+            n_obs=self.n_obs + other.n_obs,
+            n_nulls=self.n_nulls + other.n_nulls,
+            counter=self.counter + other.counter,
+        )
+
+
+def get_single_stats_monoid(df, is_categorical):
+    n_tot = len(df)
+    df = df.dropna()
+    n_obs = len(df)
+    n_nulls = n_tot - n_obs
+    if is_categorical:
+        return get_categorical_stats_monoid(df, n_obs, n_nulls)
+    return get_numerical_stats_monoid(df, n_obs, n_nulls)
+
+
+def get_numerical_stats_monoid(df, n_obs, n_nulls):
+    sx = df.sum()
+    sxx = (df * df).sum()
+    min_ = df.min()
+    max_ = df.max()
+    return NumericalVarStats(n_obs, n_nulls, sx, sxx, min_, max_)
+
+
+def get_categorical_stats_monoid(df, n_obs, n_nulls):
+    counter = Counter(df)
+    return CategoricalVarStats(n_obs, n_nulls, counter)
+
+
+def get_model_stats_monoid_from_formula(
+    varname,
+    data_group,
+    is_categorical,
+    formula,
+):
+    n_tot = len(data_group)
+    data_group = data_group.dropna()
+    data_group = get_formula_transformed_data(formula, data_group)
+    n_obs = len(data_group)
+    n_nulls = n_tot - n_obs
+    df = data_group[varname]
+    if is_categorical:
+        return get_categorical_stats_monoid(df, n_obs, n_nulls)
+    return get_numerical_stats_monoid(df, n_obs, n_nulls)
+
+
+def get_model_stats_monoid(varname, data_group, is_categorical):
+    n_tot = len(data_group)
+    data_group = data_group.dropna()
+    n_obs = len(data_group)
+    n_nulls = n_tot - n_obs
+    df = data_group[varname]
+    if is_categorical:
+        return get_categorical_stats_monoid(df, n_obs, n_nulls)
+    return get_numerical_stats_monoid(df, n_obs, n_nulls)
+
+
+class MonoidMapping(dict):
+    def __add__(self, other):
+        all_keys = set(self.keys()) | set(other.keys())
+        result = {}
+        for key in all_keys:
+            if key in self and key in other:
+                result[key] = self[key] + other[key]
+            elif key in self and key not in other:
+                result[key] = self[key]
+            else:
+                result[key] = other[key]
+        return MonoidMapping(result)
+
+
+def get_counts_and_percentages(counter):
+    if isinstance(counter, pd.Series):
+        counter = {key: counter[key] for key in counter.index}
+    total = sum(counter.values())
+    return {
+        key: {"count": value, "percentage": round(100 * value / total, ndigits=2)}
+        for key, value in counter.items()
+    }
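The helper classes above are deliberately monoidal: NumericalVarStats and CategoricalVarStats merge with '+', and MonoidMapping lifts that merge to dicts keyed by (variable, dataset), which is what lets push_and_add combine the partial statistics coming from the local nodes. A small usage sketch with invented numbers, assuming both partial n_obs values exceed PRIVACY_THRESHOLD:

    # Partial statistics for the same key from two local nodes (made-up data).
    a = MonoidMapping()
    a["lefthippocampus", "adni"] = NumericalVarStats(
        n_obs=100, n_nulls=5, sx=310.0, sxx=1001.0, min_=1.1, max_=4.9
    )
    b = MonoidMapping()
    b["lefthippocampus", "adni"] = NumericalVarStats(
        n_obs=80, n_nulls=2, sx=250.0, sxx=800.0, min_=0.9, max_=5.2
    )

    merged = a + b  # push_and_add presumably folds node results this way
    stats = merged["lefthippocampus", "adni"]
    print(stats.n_obs)           # 180
    print(stats.min_)            # 0.9, element-wise min across nodes
    print(round(stats.mean, 3))  # (310.0 + 250.0) / 180 = 3.111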
diff --git a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/generate_testcases_descrstats.py b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/generate_testcases_descrstats.py
index a79fa225e..416516a09 100644
--- a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/generate_testcases_descrstats.py
+++ b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/generate_testcases_descrstats.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 from mipframework.algorithmtest import AlgorithmTest
+from DESCRIPTIVE_STATS.descriptive_stats import get_counts_and_percentages
 
 
 class DescriptiveStatisticsTest(AlgorithmTest):
@@ -20,11 +21,14 @@ def get_expected(self, alg_input):
         # Single
         out["single"] = dict()
         for numerical in numericals:
-            out["single"][numerical] = dict()
-            vartab = self.get_data(numerical + ",dataset", datasets)
+            varlabel = metadata.label[numerical]
+            out["single"][varlabel] = dict()
+            data_df = self.get_data(numerical + ",dataset", datasets)
             for dataset in datasets.split(","):
-                numvar = vartab[vartab.dataset == dataset][numerical]
-                out["single"][numerical][dataset] = dict()
+                numvar = data_df[data_df.dataset == dataset][numerical]
+                out["single"][varlabel][dataset] = dict()
+                n_total = len(numvar)
+                numvar = numvar.dropna()
                 n_obs = len(numvar)
                 if n_obs <= 0:
                     return None
@@ -32,27 +36,34 @@ def get_expected(self, alg_input):
                 std = numvar.std()
                 min_ = numvar.min()
                 max_ = numvar.max()
-                out["single"][numerical][dataset]["num_datapoints"] = n_obs
-                out["single"][numerical][dataset]["data"] = {
+                out["single"][varlabel][dataset]["num_total"] = n_total
+                out["single"][varlabel][dataset]["num_datapoints"] = n_obs
+                out["single"][varlabel][dataset]["num_nulls"] = n_total - n_obs
+                out["single"][varlabel][dataset]["data"] = {
                     "mean": mean,
                     "std": std,
                     "min": min_,
                     "max": max_,
-                    "upper_confidence": mean + std,
-                    "lower_confidence": mean - std,
                 }
         for categorical in categoricals:
-            out["single"][categorical] = dict()
-            vartab = self.get_data(categorical + ",dataset", datasets)
+            varlabel = metadata.label[categorical]
+            out["single"][varlabel] = dict()
+            data_df = self.get_data(categorical + ",dataset", datasets)
             for dataset in datasets.split(","):
-                out["single"][categorical][dataset] = dict()
-                catvar = vartab[vartab.dataset == dataset][categorical]
+                out["single"][varlabel][dataset] = dict()
+                catvar = data_df[data_df.dataset == dataset][categorical]
+                n_total = len(catvar)
+                catvar = catvar.dropna()
                 n_obs = len(catvar)
                 if n_obs <= 0:
                     return None
                 counts = catvar.value_counts()
-                out["single"][categorical][dataset]["num_datapoints"] = n_obs
-                out["single"][categorical][dataset]["data"] = dict(counts)
+                out["single"][varlabel][dataset]["num_total"] = n_total
+                out["single"][varlabel][dataset]["num_datapoints"] = n_obs
+                out["single"][varlabel][dataset]["num_nulls"] = n_total - n_obs
+                out["single"][varlabel][dataset]["data"] = get_counts_and_percentages(
+                    counts
+                )
         #
         # Model
         data = self.get_data(y_names + ",dataset", datasets)
@@ -61,8 +72,11 @@ def get_expected(self, alg_input):
         out["model"] = dict()
         for dataset in datasets.split(","):
             data_group = data[data.dataset == dataset]
-            # if len(data_group) == 0:
-            #     continue
+            n_total = len(data_group)
+            data_group = data_group.dropna()
+            n_obs = len(data_group)
+            if n_obs == 0:
+                return None
             df_num = data_group[numericals]
             df_cat = data_group[categoricals]
             means = df_num.mean()
@@ -74,25 +88,31 @@ def get_expected(self, alg_input):
             ]
             out["model"][dataset] = dict()
             out["model"][dataset]["data"] = dict()
-            out["model"][dataset]["num_datapoints"] = len(data_group)
+            out["model"][dataset]["num_datapoints"] = n_obs
+            out["model"][dataset]["num_total"] = n_total
+            out["model"][dataset]["num_nulls"] = n_total - n_obs
             for numerical in numericals:
-                out["model"][dataset]["data"][numerical] = {
+                varlabel = metadata.label[numerical]
+                out["model"][dataset]["data"][varlabel] = {
                     "mean": means[numerical],
                     "std": stds[numerical],
                     "min": mins[numerical],
                     "max": maxs[numerical],
-                    "upper_confidence": means[numerical] + stds[numerical],
-                    "lower_confidence": means[numerical] - stds[numerical],
                 }
             for i, categorical in enumerate(categoricals):
+                varlabel = metadata.label[categorical]
                 if counts[i].name != categorical:
                     raise ValueError("WAT??")
-                out["model"][dataset]["data"][categorical] = dict(counts[i])
+                out["model"][dataset]["data"][varlabel] = get_counts_and_percentages(
+                    counts[i]
+                )
         return out
 
 
 if __name__ == "__main__":
     prop_path = dbs_folder = Path(__file__).parent / "properties.json"
-    descriptive_stats_test = DescriptiveStatisticsTest(prop_path.as_posix())
+    descriptive_stats_test = DescriptiveStatisticsTest(
+        prop_path.as_posix(), dropna=False
+    )
     descriptive_stats_test.generate_test_cases(num_tests=100)
     descriptive_stats_test.to_json("descriptive_stats_expected.json")
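Both the algorithm and this test-case generator now format categorical results through the shared get_counts_and_percentages helper. A quick illustration with made-up counts:

    from collections import Counter
    from DESCRIPTIVE_STATS.descriptive_stats import get_counts_and_percentages

    counter = Counter({"AD": 60, "CN": 40})
    print(get_counts_and_percentages(counter))
    # {'AD': {'count': 60, 'percentage': 60.0},
    #  'CN': {'count': 40, 'percentage': 40.0}}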
diff --git a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/properties.json b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/properties.json
index e703d7025..7517cbab1 100644
--- a/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/properties.json
+++ b/Exareme-Docker/src/mip-algorithms/DESCRIPTIVE_STATS/properties.json
@@ -11,7 +11,7 @@
       "columnValuesSQLType": "real, integer, text",
       "columnValuesIsCategorical": "",
       "columnValuesNumOfEnumerations": "",
-      "value": "rightcuncuneus,rightaorganteriororbitalgyrus,leftpogpostcentralgyrus,leftmcggmiddlecingulategyrus,leftsmcsupplementarymotorcortex,leftsogsuperioroccipitalgyrus,leftmtgmiddletemporalgyrus,rightpoparietaloperculum",
+      "value": "lefthippocampus,righthippocampus",
       "valueNotBlank": true,
       "valueMultiple": true,
       "valueType": "string"
@@ -20,11 +20,23 @@
       "label": "dataset",
       "desc": "It contains the names of one or more datasets, in which the algorithm will be executed. It cannot be empty",
       "type": "dataset",
-      "value": "adni,edsd",
+      "value": "adni,edsd,ppmi",
       "valueNotBlank": true,
       "valueMultiple": true,
       "valueType": "string"
-    }, {
+    },
+    {
+      "name": "formula",
+      "label": "formula",
+      "desc": "Patsy formula (R language syntax).",
+      "type": "formula_description",
+      "value": "{\"single\":[{\"var_name\":\"lefthippocampus\",\"unary_operation\":\"log\"},{\"var_name\":\"righthippocampus\",\"unary_operation\":\"exp\"}],\"interactions\":[]}",
+      "defaultValue": "",
+      "valueNotBlank": false,
+      "valueMultiple": false,
+      "valueType": "json"
+    },
+    {
       "name": "filter",
       "label": "filter",
       "desc": "",
diff --git a/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py b/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py
index 2849eb34f..d8e87403b 100644
--- a/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py
+++ b/Exareme-Docker/src/mip-algorithms/LOGISTIC_REGRESSION/logistic_regression.py
@@ -118,7 +118,7 @@ def local_final(self):
         self.push_and_agree(half_idx=half_idx)
 
     def global_final(self):
-        x_names = self.load("x_names")
+        x_names = remove_prefix_from_varnames(self.load("x_names"))
         coeff = self.load("coeff")
         ll = self.load("ll")
         hess = self.load("hess")
@@ -424,6 +424,16 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives):
     return roc_curve, auc, gini
 
 
+def remove_prefix_from_varnames(varnames):
+    new_varnames = []
+    for varname in varnames:
+        varname = re.sub(r"^np.", "", varname)
+        varname = re.sub(r"^patsy.", "", varname)
+        varname = re.sub(r"I(\([^()]+\))", r"\g<1>", varname)
+        new_varnames.append(varname)
+    return new_varnames
+
+
 LogisticRegressionSummary = namedtuple(
     "LogisticRegressionSummary",
     [
@@ -468,7 +478,9 @@ def compute_roc(true_positives, true_negatives, false_positives, false_negatives):
         "CN",
     ]
     runner = create_runner(
-        LogisticRegression, num_workers=10, algorithm_args=algorithm_args,
+        LogisticRegression,
+        num_workers=10,
+        algorithm_args=algorithm_args,
     )
     start = time.time()
     runner.run()
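remove_prefix_from_varnames strips the np./patsy./I(...) decoration that patsy leaves on transformed design-matrix column names, so the coefficient table shows readable variable names. Illustrative inputs and outputs:

    # Assumes logistic_regression.py (above) is importable; the inputs are
    # typical patsy-generated column names, not taken from a real run.
    print(remove_prefix_from_varnames([
        "np.log(lefthippocampus)",
        "patsy.standardize(righthippocampus)",
        "I(age + 1)",
        "gender",
    ]))
    # ['log(lefthippocampus)', 'standardize(righthippocampus)', '(age + 1)', 'gender']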
"valueNotBlank": false, "valueMultiple": false, - "valueType": "string" + "valueType": "json" }, { "name": "positive_level", diff --git a/Exareme-Docker/src/mip-algorithms/README.md b/Exareme-Docker/src/mip-algorithms/README.md index 379b01a34..2412908c2 100644 --- a/Exareme-Docker/src/mip-algorithms/README.md +++ b/Exareme-Docker/src/mip-algorithms/README.md @@ -41,7 +41,8 @@ The parameter has the following properties: