From 40d76ba84f02fb1aab15a2eb06c53b053cc39758 Mon Sep 17 00:00:00 2001 From: CamDavidsonPilon Date: Thu, 14 May 2020 11:25:48 -0400 Subject: [PATCH 01/10] adding npmle --- lifelines/fitters/npmle.py | 173 +++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 lifelines/fitters/npmle.py diff --git a/lifelines/fitters/npmle.py b/lifelines/fitters/npmle.py new file mode 100644 index 000000000..07187657c --- /dev/null +++ b/lifelines/fitters/npmle.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict, namedtuple +import numpy as np +from numpy.linalg import norm +import pandas as pd + +interval = namedtuple("Interval", ["left", "right"]) + + +def E_step_M_step(observation_intervals, p_old, turnball_interval_lookup): + + N = len(observation_intervals) + T = p_old.shape[0] + + p_new = np.zeros_like(p_old) + + for observation_interval in observation_intervals: + p_temp = np.zeros_like(p_old) + + # find all turnball intervals, t, that are contained in (ol, or). Call this set T + # the denominator is sum of p_old[T] probabilities + # the numerator is p_old[t] + for ix_t in turnball_interval_lookup[observation_interval]: + t_p = p_old[ix_t] + p_temp[ix_t] = t_p + + p_new = p_new + p_temp / p_temp.sum() + + return p_new / N + + +def create_turnball_intervals(left, right): + """ + TIHI X 10000 + """ + + left = [[l, 0, "1l"] for l in left] + right = [[r, 0, "0r"] for r in right] + + for l, r in zip(left, right): + if l[0] == r[0]: + l[1] -= 0.01 + r[1] += 0.01 + + import copy + + union = sorted(list(left) + list(right)) + + """ + # fix ties + for k in range(len(union)-1): + e_, e__ = union[k], union[k+1] + if e_[2] == "1l" and e__[2] == "0r" and e_[0] == e__[0]: + union_[k][1] += 0.01 + """ + intervals = [] + + for k in range(len(union) - 1): + e_, e__ = union[k], union[k + 1] + if e_[2] == "1l" and e__[2] == "0r": + intervals.append(interval(e_[0], e__[0])) + + return sorted(set(intervals)) + + +def is_subset(query_interval, super_interval): + return super_interval.left <= query_interval.left and query_interval.right <= super_interval.right + + +def create_turnball_lookup(turnball_intervals, observation_intervals): + + turnball_lookup = defaultdict(set) + + for i, turnball_interval in enumerate(turnball_intervals): + # ask: which observations is this t_interval part of? + for observation_interval in observation_intervals: + # since left and right are sorted by left, we can stop after left > turnball_interval[1] value + if observation_interval.left > turnball_interval.right: + break + if is_subset(turnball_interval, observation_interval): + turnball_lookup[observation_interval].add(i) + + return turnball_lookup + + +def check_convergence(p_new, p_old, tol, i, verbose=False): + if verbose: + print("Iteration %d: norm(p_new - p_old): %.6f" % (i, norm(p_new - p_old))) + if norm(p_new - p_old) < tol: + return True + return False + + +def create_observation_intervals(left, right): + return [interval(l, r) for l, r in zip(left, right)] + + +def npmle(left, right, tol=1e-5, verbose=False): + + left, right = np.asarray(left, dtype=float), np.asarray(right, dtype=float) + assert left.shape == right.shape + + # sort left, right arrays by (left, right). 
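+    # note: np.lexsort sorts by its *last* key first, so np.lexsort((right, left))
+    # orders primarily by `left` and breaks ties by `right`.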
+ ix = np.lexsort((right, left)) + left = left[ix] + right = right[ix] + + turnball_intervals = list(create_turnball_intervals(left, right)) + observation_intervals = create_observation_intervals(left, right) + turnball_lookup = create_turnball_lookup(turnball_intervals, sorted(set(observation_intervals))) + + T = len(turnball_intervals) + + converged = False + + # initialize to equal weight + p = 1 / T * np.ones(T) + i = 0 + while not converged: + i += 1 + p_new = E_step_M_step(observation_intervals, p, turnball_lookup) + converged = check_convergence(p_new, p, tol, i, verbose=verbose) + p = p_new + + return p, turnball_intervals + + +def reconstruct_survival_function(probabilities, turnball_intervals, timeline, label="NPMLE"): + index = [0.0] + values = [1.0] + + for p, interval in zip(probabilities, turnball_intervals): + if interval.left != index[-1]: + index.append(interval.left) + values.append(values[-1]) + + if interval.left == interval.right: + values[-1] -= p + else: + index.append(interval.right) + values.append(values[-1] - p) + + full_dataframe = pd.DataFrame(index=timeline, columns=[label + "_upper"]) + + turnball_dataframe = pd.DataFrame(values, index=index, columns=[label + "_upper"]) + + dataframe = full_dataframe.combine_first(turnball_dataframe).ffill() + dataframe[label + "_lower"] = dataframe[label + "_upper"].shift(1).fillna(1) + return dataframe + + +def npmle_compute_confidence_intervals(left, right, mle_, alpha=0.05, samples=1000): + """ + uses basic bootstrap + """ + left, right = np.asarray(left, dtype=float), np.asarray(right, dtype=float) + all_times = np.unique(np.concatenate((left, right, [0]))) + + N = left.shape[0] + + bootstrapped_samples = np.empty((all_times.shape[0], samples)) + + for i in range(samples): + ix = np.random.randint(low=0, high=N, size=N) + left_ = left[ix] + right_ = right[ix] + + bootstrapped_samples[:, i] = reconstruct_survival_function(*npmle(left_, right_), all_times).values[:, 0] + + return ( + 2 * mle_.squeeze() - pd.Series(np.percentile(bootstrapped_samples, (alpha / 2) * 100, axis=1), index=all_times), + 2 * mle_.squeeze() - pd.Series(np.percentile(bootstrapped_samples, (1 - alpha / 2) * 100, axis=1), index=all_times), + ) From dd34b2749a4904b00eed8cb17cad4e25bab386e4 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 14 May 2020 11:34:05 -0400 Subject: [PATCH 02/10] Revert "Revert "Npmle"" --- experiments/working_npmle.py | 187 +++++++++++++++++++++++ lifelines/fitters/kaplan_meier_fitter.py | 70 ++++++++- lifelines/plotting.py | 2 +- 3 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 experiments/working_npmle.py diff --git a/experiments/working_npmle.py b/experiments/working_npmle.py new file mode 100644 index 000000000..28cb32774 --- /dev/null +++ b/experiments/working_npmle.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +from matplotlib import pyplot as plt +from collections import defaultdict, namedtuple +from numpy.linalg import norm + +interval = namedtuple("Interval", ["left", "right"]) + + +def E_step_M_step(observation_intervals, p_old, turnball_interval_lookup): + + N = len(observation_intervals) + T = p_old.shape[0] + + p_new = np.zeros_like(p_old) + + for observation_interval in observation_intervals: + p_temp = np.zeros_like(p_old) + + # find all turnball intervals, t, that are contained in (ol, or). 
Call this set T + # the denominator is sum of p_old[T] probabilities + # the numerator is p_old[t] + for ix_t in turnball_interval_lookup[observation_interval]: + t_p = p_old[ix_t] + p_temp[ix_t] = t_p + + p_new = p_new + p_temp / p_temp.sum() + + return p_new / N + + +def create_turnball_intervals(left, right): + """ + TIHI X 10000 + """ + + left = [[l, 0, "1l"] for l in left] + right = [[r, 0, "0r"] for r in right] + + for l, r in zip(left, right): + if l[0] == r[0]: + l[1] -= 0.01 + r[1] += 0.01 + + import copy + + union = sorted(list(left) + list(right)) + + """ + # fix ties + for k in range(len(union)-1): + e_, e__ = union[k], union[k+1] + if e_[2] == "1l" and e__[2] == "0r" and e_[0] == e__[0]: + union_[k][1] += 0.01 + """ + intervals = [] + + for k in range(len(union) - 1): + e_, e__ = union[k], union[k + 1] + if e_[2] == "1l" and e__[2] == "0r": + intervals.append(interval(e_[0], e__[0])) + + return sorted(set(intervals)) + + +def is_subset(query_interval, super_interval): + return super_interval.left <= query_interval.left and query_interval.right <= super_interval.right + + +def create_turnball_lookup(turnball_intervals, observation_intervals): + + turnball_lookup = defaultdict(set) + + for i, turnball_interval in enumerate(turnball_intervals): + # ask: which observations is this t_interval part of? + for observation_interval in observation_intervals: + # since left and right are sorted by left, we can stop after left > turnball_interval[1] value + if observation_interval.left > turnball_interval.right: + break + if is_subset(turnball_interval, observation_interval): + turnball_lookup[observation_interval].add(i) + + return turnball_lookup + + +def check_convergence(p_new, p_old): + if norm(p_new - p_old) < 1e-4: + return True + return False + + +def create_observation_intervals(left, right): + return [interval(l, r) for l, r in zip(left, right)] + + +def npmle(left, right): + + left, right = np.asarray(left, dtype=float), np.asarray(right, dtype=float) + assert left.shape == right.shape + + # sort left, right arrays by (left, right). 
+ ix = np.lexsort((right, left)) + left = left[ix] + right = right[ix] + + turnball_intervals = list(create_turnball_intervals(left, right)) + observation_intervals = create_observation_intervals(left, right) + turnball_lookup = create_turnball_lookup(turnball_intervals, sorted(set(observation_intervals))) + + T = len(turnball_intervals) + + converged = False + + # initialize to equal weight + p = 1 / T * np.ones(T) + + while not converged: + + p_new = E_step_M_step(observation_intervals, p, turnball_lookup) + converged = check_convergence(p_new, p) + p = p_new + + return p, turnball_intervals + + +def reconstruct_survival_function(probabilities, turnball_intervals, timeline): + index = [0.0] + values = [1.0] + + for p, interval in zip(probabilities, turnball_intervals): + if interval.left != index[-1]: + index.append(interval.left) + values.append(values[-1]) + + if interval.left == interval.right: + values[-1] -= p + else: + index.append(interval.right) + values.append(values[-1] - p) + + full_dataframe = pd.DataFrame(index=timeline, columns=["survival function"]) + + turnball_dataframe = pd.DataFrame(values, index=index, columns=["survival function"]) + + dataframe = full_dataframe.combine_first(turnball_dataframe).ffill() + return dataframe + + +def compute_confidence_intervals(left, right, mle_, alpha=0.05, samples=1000): + """ + uses basic bootstrap + """ + left, right = np.asarray(left, dtype=float), np.asarray(right, dtype=float) + all_times = np.unique(np.concatenate((left, right, [np.inf, 0]))) + + N = left.shape[0] + + bootstrapped_samples = np.empty((all_times.shape[0], samples)) + + for i in range(samples): + ix = np.random.randint(low=0, high=N, size=N) + left_ = left[ix] + right_ = right[ix] + + bootstrapped_samples[:, i] = reconstruct_survival_function(*npmle(left_, right_), all_times).values[:, 0] + + return ( + 2 * mle_.squeeze() - pd.Series(np.percentile(bootstrapped_samples, (alpha / 2) * 100, axis=1), index=all_times), + 2 * mle_.squeeze() - pd.Series(np.percentile(bootstrapped_samples, (1 - alpha / 2) * 100, axis=1), index=all_times), + ) + + +from lifelines.datasets import load_diabetes + +data = load_diabetes() + + +left = [1, 7, 8, 7, 7, 17, 37, 46, 46, 45] +right = [7, 8, 10, 16, 14, 100, 44, 100, 100, 100] + +# left, right = data['left'], data['right'] + +results = npmle(left, right) + + +timeline = np.unique(np.concatenate((left, right, [np.inf, 0]))) +df = reconstruct_survival_function(*results, timeline) +# CIs = compute_confidence_intervals(left, right, df) diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py index facbd39fa..4b2e0d3b5 100644 --- a/lifelines/fitters/kaplan_meier_fitter.py +++ b/lifelines/fitters/kaplan_meier_fitter.py @@ -17,8 +17,11 @@ StatisticalWarning, coalesce, CensoringType, + pass_for_numeric_dtypes_or_raise_array, + check_nans_or_infs, ) from lifelines.plotting import loglogs_plot, _plot_estimate +from lifelines.fitters.npmle import npmle, reconstruct_survival_function, npmle_compute_confidence_intervals class KaplanMeierFitter(UnivariateFitter): @@ -112,6 +115,62 @@ def fit( return self._fit(durations, event_observed, timeline, entry, label, alpha, ci_labels, weights) + @CensoringType.interval_censoring + def fit_interval_censoring( + self, + lower_bound, + upper_bound, + event_observed=None, + timeline=None, + label=None, + alpha=None, + ci_labels=None, + show_progress=False, + entry=None, + weights=None, + tol=1e-5, + ) -> "KaplanMeierFitter": + + if entry is not None or weights is not 
None: + raise NotImplementedError("entry / weights is not supported yet") + + self.upper_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(upper_bound)) + self.lower_bound = np.atleast_1d(pass_for_numeric_dtypes_or_raise_array(lower_bound)) + check_nans_or_infs(self.lower_bound) + + self.timeline = coalesce(timeline, np.unique(np.concatenate((self.upper_bound, self.lower_bound)))) + + if (self.upper_bound < self.lower_bound).any(): + raise ValueError("All upper_bound times must be greater than or equal to lower_bound times.") + + if event_observed is None: + event_observed = self.upper_bound == self.lower_bound + + if ((self.lower_bound == self.upper_bound) != event_observed).any(): + raise ValueError( + "For all rows, lower_bound == upper_bound if and only if event observed = 1 (uncensored). Likewise, lower_bound < upper_bound if and only if event observed = 0 (censored)" + ) + + self._label = coalesce(label, self._label, "KM_estimate") + + probs, t_intervals = npmle(self.lower_bound, self.upper_bound, verbose=show_progress) + self.survival_function_ = reconstruct_survival_function(probs, t_intervals, self.timeline, label=self._label) + self.cumulative_density_ = 1 - self.survival_function_ + + self._median = median_survival_times(self.survival_function_) + self.percentile = functools.partial(qth_survival_time, model_or_survival_function=self.survival_function_) + + """ + self.confidence_interval_ = npmle_compute_confidence_intervals(self.lower_bound, self.upper_bound, self.survival_function_, self.alpha) + self.confidence_interval_survival_function_ = self.confidence_interval_ + self.confidence_interval_cumulative_density_ = 1 - self.confidence_interval_ + """ + # estimation methods + self._estimation_method = "survival_function_" + self._estimate_name = "survival_function_" + self._update_docstrings() + return self + @CensoringType.left_censoring def fit_left_censoring( self, durations, event_observed=None, timeline=None, entry=None, label=None, alpha=None, ci_labels=None, weights=None @@ -147,6 +206,7 @@ def fit_left_censoring( self with new properties like ``survival_function_``, ``plot()``, ``median_survival_time_`` """ + # left censoring is then defined in CensoringType.is_left_censoring(self) return self._fit(durations, event_observed, timeline, entry, label, alpha, ci_labels, weights) def _fit( @@ -293,9 +353,17 @@ def cumulative_density_at_times(self, times, label=None) -> pd.Series: label = coalesce(label, self._label) return pd.Series(1 - self.predict(times), index=_to_1d_array(times), name=label) + def plot(self, **kwargs): + return self.plot_survival_function(**kwargs) + def plot_survival_function(self, **kwargs): """Alias of ``plot``""" - return _plot_estimate(self, estimate="survival_function_", **kwargs) + if not CensoringType.is_interval_censoring(self): + return _plot_estimate(self, estimate="survival_function_", **kwargs) + else: + # hack for now. 
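+            # interval-censored fits don't yet have the confidence intervals that
+            # _plot_estimate expects (the CI block in fit_interval_censoring is
+            # commented out), so draw the NPMLE survival function directly.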
+ color = coalesce(kwargs.get("c"), kwargs.get("color"), "k") + self.survival_function_.plot(drawstyle="steps", color=color, **kwargs) def plot_cumulative_density(self, **kwargs): """ diff --git a/lifelines/plotting.py b/lifelines/plotting.py index ec44f1518..720d9dbdc 100644 --- a/lifelines/plotting.py +++ b/lifelines/plotting.py @@ -732,7 +732,7 @@ def _plot_estimate( ): """ - Plots a pretty figure of {0}.{1} + Plots a pretty figure of estimates Matplotlib plot arguments can be passed in inside the kwargs, plus From 56c140ba98f2dc57d1fc18bc54abfb91a36b1a69 Mon Sep 17 00:00:00 2001 From: CamDavidsonPilon Date: Thu, 14 May 2020 11:35:11 -0400 Subject: [PATCH 03/10] adding AIC --- lifelines/fitters/__init__.py | 13 ++++++ lifelines/fitters/cox_time_varying_fitter.py | 37 +++++---------- lifelines/fitters/coxph_fitter.py | 4 +- lifelines/utils/printer.py | 47 +++++++++++++++----- 4 files changed, 62 insertions(+), 39 deletions(-) diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py index c17ef1539..ec49248a3 100644 --- a/lifelines/fitters/__init__.py +++ b/lifelines/fitters/__init__.py @@ -1214,6 +1214,15 @@ def predict_expectation(self, df, conditional_after=None) -> pd.Series: raise NotImplementedError() +class SemiParametricRegressionFittter(RegressionFitter): + @property + def AIC_partial_(self) -> float: + """ + "partial" because the log-likelihood is partial + """ + return -2 * self.log_likelihood_ + 2 * self.params_.shape[0] + + class ParametricRegressionFitter(RegressionFitter): _scipy_fit_method = "BFGS" @@ -2482,6 +2491,10 @@ def concordance_index_(self) -> float: return self.concordance_index_ return self._concordance_index_ + @property + def AIC_(self) -> float: + return -2 * self.log_likelihood_ + 2 * self.params_.shape[0] + class ParametericAFTRegressionFitter(ParametricRegressionFitter): diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index b27fe5657..a54151839 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -16,7 +16,7 @@ from autograd import numpy as anp -from lifelines.fitters import RegressionFitter +from lifelines.fitters import SemiParametricRegressionFittter from lifelines.fitters.mixins import ProportionalHazardMixin from lifelines.utils.printer import Printer from lifelines.statistics import _chisq_test_p_value, StatisticalResult @@ -47,7 +47,7 @@ matrix_axis_0_sum_to_1d_array = lambda m: np.sum(m, 0) -class CoxTimeVaryingFitter(RegressionFitter, ProportionalHazardMixin): +class CoxTimeVaryingFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): r""" This class implements fitting Cox's time-varying proportional hazard model: @@ -173,9 +173,7 @@ def fit( if weights_col is None: self.weights_col = None - assert ( - "__weights" not in df.columns - ), "__weights is an internal lifelines column, please rename your column first." + assert "__weights" not in df.columns, "__weights is an internal lifelines column, please rename your column first." 
df["__weights"] = 1.0 else: self.weights_col = weights_col @@ -217,9 +215,7 @@ def fit( ) self.params_ = pd.Series(params_, index=df.columns, name="coef") / self._norm_std - self.variance_matrix_ = pd.DataFrame( - -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std), index=df.columns - ) + self.variance_matrix_ = pd.DataFrame(-inv(self._hessian_) / np.outer(self._norm_std, self._norm_std), index=df.columns) self.standard_errors_ = self._compute_standard_errors( normalize(df, self._norm_mean, self._norm_std), events, start, stop, weights ) @@ -260,10 +256,9 @@ def _partition_by_strata(self, X, events, start, stop, weights): ), stratum def _partition_by_strata_and_apply(self, X, events, start, stop, weights, function, *args): - for ( - (stratified_X, stratified_events, stratified_start, stratified_stop, stratified_W), - _, - ) in self._partition_by_strata(X, events, start, stop, weights): + for ((stratified_X, stratified_events, stratified_start, stratified_stop, stratified_W), _) in self._partition_by_strata( + X, events, start, stop, weights + ): yield function(stratified_X, stratified_events, stratified_start, stratified_stop, stratified_W, *args) def _compute_z_values(self): @@ -375,9 +370,7 @@ def _newton_rhaphson( i += 1 if self.strata is None: - h, g, ll = self._get_gradients( - df.values, events.values, start.values, stop.values, weights.values, beta - ) + h, g, ll = self._get_gradients(df.values, events.values, start.values, stop.values, weights.values, beta) else: g = np.zeros_like(beta) h = np.zeros((d, d)) @@ -714,11 +707,7 @@ def log_likelihood_ratio_test(self): degrees_freedom = self.params_.shape[0] p_value = _chisq_test_p_value(test_stat, degrees_freedom=degrees_freedom) return StatisticalResult( - p_value, - test_stat, - name="log-likelihood ratio test", - degrees_freedom=degrees_freedom, - null_distribution="chi squared", + p_value, test_stat, name="log-likelihood ratio test", degrees_freedom=degrees_freedom, null_distribution="chi squared" ) def plot(self, columns=None, ax=None, **errorbar_kwargs): @@ -777,18 +766,14 @@ def plot(self, columns=None, ax=None, **errorbar_kwargs): return ax - def _compute_cumulative_baseline_hazard( - self, tv_data, events, start, stop, weights - ): # pylint: disable=too-many-locals + def _compute_cumulative_baseline_hazard(self, tv_data, events, start, stop, weights): # pylint: disable=too-many-locals with warnings.catch_warnings(): warnings.simplefilter("ignore") hazards = self.predict_partial_hazard(tv_data).values unique_death_times = np.unique(stop[events.values]) - baseline_hazard_ = pd.DataFrame( - np.zeros_like(unique_death_times), index=unique_death_times, columns=["baseline hazard"] - ) + baseline_hazard_ = pd.DataFrame(np.zeros_like(unique_death_times), index=unique_death_times, columns=["baseline hazard"]) for t in unique_death_times: ix = (start.values < t) & (t <= stop.values) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 90c0af63b..93fcbfdab 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -15,7 +15,7 @@ from autograd import numpy as anp from autograd import elementwise_grad -from lifelines.fitters import RegressionFitter, ParametricRegressionFitter +from lifelines.fitters import RegressionFitter, SemiParametricRegressionFittter, ParametricRegressionFitter from lifelines.fitters.mixins import SplineFitterMixin, ProportionalHazardMixin from lifelines.plotting import set_kwargs_drawstyle from lifelines.statistics import 
_chisq_test_p_value, StatisticalResult @@ -125,7 +125,7 @@ def decide(self, batch_mode: Optional[bool], n_unique: int, n_total: int, n_vars return self.SINGLE -class CoxPHFitter(RegressionFitter, ProportionalHazardMixin): +class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): r""" This class implements fitting Cox's proportional hazard model using Efron's method for ties. diff --git a/lifelines/utils/printer.py b/lifelines/utils/printer.py index 5206165d1..7363acb7d 100644 --- a/lifelines/utils/printer.py +++ b/lifelines/utils/printer.py @@ -6,11 +6,20 @@ class Printer: - def __init__(self, model, headers: List[Tuple[str, Any]], justify: Callable, decimals: int, header_kwargs: Dict): + def __init__( + self, + model, + headers: List[Tuple[str, Any]], + footers: List[Tuple[str, Any]], + justify: Callable, + decimals: int, + header_kwargs: Dict, + ): self.headers = headers self.model = model self.decimals = decimals self.justify = justify + self.footers = footers # TODO: use this variable for tuple_ in header_kwargs.items(): self.add_to_headers(tuple_) @@ -85,18 +94,25 @@ def to_html(self): except AttributeError: pass + try: + footers.append(("AIC", "{:.{prec}f}".format(self.model.AIC_, prec=decimals))) + except AttributeError: + pass + + try: + footers.append(("Partial AIC", "{:.{prec}f}".format(self.model.AIC_partial_, prec=decimals))) + except AttributeError: + pass + try: sr = self.model.log_likelihood_ratio_test() - footers.extend( - [ - ("Concordance", "{:.{prec}f}".format(self.model.concordance_index_, prec=decimals)), - ( - "Log-likelihood ratio test", - "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), - ), - ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), - ] + footers.append( + ( + "Log-likelihood ratio test", + "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), + ) ) + footers.append(("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals))) except AttributeError: pass @@ -177,12 +193,21 @@ def ascii_print(self): with np.errstate(invalid="ignore", divide="ignore"): + print("---") try: - print("---") if utils.CensoringType.is_right_censoring(self.model) and self.model._KNOWN_MODEL: print("Concordance = {:.{prec}f}".format(self.model.concordance_index_, prec=decimals)) except AttributeError: pass + try: + print("AIC = {:.{prec}f}".format(self.model.AIC_, prec=decimals)) + except AttributeError: + pass + + try: + print("Partial AIC = {:.{prec}f}".format(self.model.AIC_partial_, prec=decimals)) + except AttributeError: + pass try: sr = self.model.log_likelihood_ratio_test() From 06871e92de9d144302aed9b13919be8497e8098e Mon Sep 17 00:00:00 2001 From: CamDavidsonPilon Date: Thu, 14 May 2020 12:22:11 -0400 Subject: [PATCH 04/10] adding support for weighted penalty --- lifelines/fitters/__init__.py | 19 ++++--- lifelines/fitters/cox_time_varying_fitter.py | 14 +++-- lifelines/fitters/coxph_fitter.py | 20 ++++--- .../generalized_gamma_regression_fitter.py | 9 ++-- lifelines/fitters/log_logistic_aft_fitter.py | 5 +- lifelines/fitters/log_normal_aft_fitter.py | 7 +-- ...piecewise_exponential_regression_fitter.py | 2 +- lifelines/fitters/weibull_aft_fitter.py | 4 +- lifelines/tests/test_estimation.py | 53 +++++++++++++++++++ 9 files changed, 100 insertions(+), 33 deletions(-) diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py index ec49248a3..b103266e8 100644 --- a/lifelines/fitters/__init__.py +++ 
b/lifelines/fitters/__init__.py @@ -1228,7 +1228,7 @@ class ParametricRegressionFitter(RegressionFitter): _scipy_fit_method = "BFGS" _scipy_fit_options: Dict[str, Any] = dict() - def __init__(self, alpha=0.05, penalizer=0.0, l1_ratio=0.0): + def __init__(self, alpha: float = 0.05, penalizer: Union[float, np.array] = 0.0, l1_ratio: float = 0.0): super(ParametricRegressionFitter, self).__init__(alpha=alpha) self.penalizer = penalizer self.l1_ratio = l1_ratio @@ -1744,13 +1744,18 @@ def _add_penalty(self, params: Dict, neg_ll: float): params_array, _ = flatten(params) # remove intercepts from being penalized params_array = params_array[~self._constant_cols] - if self.penalizer > 0 and self.l1_ratio > 0: - penalty = self.l1_ratio * anp.abs(params_array).sum() + 0.5 * (1.0 - self.l1_ratio) * (params_array ** 2).sum() - elif self.penalizer > 0 and self.l1_ratio <= 0: - penalty = 0.5 * (params_array ** 2).sum() + if (isinstance(self.penalizer, np.ndarray) or self.penalizer > 0) and self.l1_ratio > 0: + penalty = ( + self.l1_ratio * (self.penalizer * anp.abs(params_array)).sum() + + 0.5 * (1.0 - self.l1_ratio) * (self.penalizer * (params_array) ** 2).sum() + ) + + elif (isinstance(self.penalizer, np.ndarray) or self.penalizer > 0) and self.l1_ratio <= 0: + penalty = 0.5 * (self.penalizer * (params_array) ** 2).sum() + else: penalty = 0 - return neg_ll + self.penalizer * penalty + return neg_ll + penalty def _create_neg_likelihood_with_penalty_function( self, params_array, Ts, E, weights, entries, Xs, likelihood=None, penalty=None @@ -2096,7 +2101,7 @@ def print_summary(self, decimals=2, style=None, **kwargs): headers.append(("weights col", "'%s'" % self.weights_col)) if self.entry_col: headers.append(("entry col", "'%s'" % self.entry_col)) - if self.penalizer > 0: + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: headers.append(("penalizer", self.penalizer)) if self.robust: headers.append(("robust variance", True)) diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index a54151839..cf104b2af 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -86,9 +86,6 @@ class CoxTimeVaryingFitter(SemiParametricRegressionFittter, ProportionalHazardMi def __init__(self, alpha=0.05, penalizer=0.0, l1_ratio: float = 0.0, strata=None): super(CoxTimeVaryingFitter, self).__init__(alpha=alpha) - if penalizer < 0: - raise ValueError("penalizer parameter must be >= 0.") - self.alpha = alpha self.penalizer = penalizer self.strata = strata @@ -338,14 +335,15 @@ def _newton_rhaphson( """ assert precision <= 1.0, "precision must be less than or equal to 1." - # soft penalizer functions, from https://www.cs.ubc.ca/cgi-bin/tr/2009/TR-2009-19.pdf # soft penalizer functions, from https://www.cs.ubc.ca/cgi-bin/tr/2009/TR-2009-19.pdf soft_abs = lambda x, a: 1 / a * (anp.logaddexp(0, -a * x) + anp.logaddexp(0, a * x)) penalizer = ( lambda beta, a: n * 0.5 - * self.penalizer - * (self.l1_ratio * soft_abs(beta, a).sum() + (1 - self.l1_ratio) * ((beta) ** 2).sum()) + * ( + self.l1_ratio * (self.penalizer * soft_abs(beta, a)).sum() + + (1 - self.l1_ratio) * (self.penalizer * beta ** 2).sum() + ) ) d_penalizer = elementwise_grad(penalizer) dd_penalizer = elementwise_grad(d_penalizer) @@ -388,7 +386,7 @@ def _newton_rhaphson( # if the user supplied a non-trivial initial point, we need to delay this. 
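                 # (at the default zero initial point, the first iteration's ll is
                 # exactly the null model's log-likelihood, which
                 # log_likelihood_ratio_test reuses later)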
self._log_likelihood_null = ll - if self.penalizer > 0: + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: ll -= penalizer(beta, 1.5 ** i) g -= d_penalizer(beta, 1.5 ** i) h[np.diag_indices(d)] -= dd_penalizer(beta, 1.5 ** i) @@ -653,7 +651,7 @@ def print_summary(self, decimals=2, style=None, **kwargs): headers.append(("event col", "'%s'" % self.event_col)) if self.weights_col: headers.append(("weights col", "'%s'" % self.weights_col)) - if self.penalizer > 0: + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: headers.append(("penalizer", self.penalizer)) if self.strata: headers.append(("strata", self.strata)) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 93fcbfdab..05113e99b 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -143,12 +143,15 @@ class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): baseline_estimation_method: string, optional specify how the fitter should estimate the baseline. ``"breslow"`` or ``"spline"`` - penalizer: float, optional (default=0.0) + penalizer: float or array, optional (default=0.0) Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. For example, this shrinks the magnitude value of :math:`\beta_i`. See ``l1_ratio`` below. The penalty term is :math:`\frac{1}{2} \text{penalizer} \left( (1-\text{l1_ratio}) ||\beta||_2^2 + \text{l1_ratio}||\beta||_1\right)`. + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` + l1_ratio: float, optional (default=0.0) Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See ``penalizer`` above. @@ -202,7 +205,7 @@ class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): def __init__( self, baseline_estimation_method: str = "breslow", - penalizer: float = 0.0, + penalizer: Union[float, np.ndarray] = 0.0, strata: Optional[Union[List[str], str]] = None, l1_ratio: float = 0.0, n_baseline_knots: int = 1, @@ -210,8 +213,7 @@ def __init__( ) -> None: super(CoxPHFitter, self).__init__(**kwargs) - if penalizer < 0: - raise ValueError("penalizer parameter must be >= 0.") + if l1_ratio < 0 or l1_ratio > 1: raise ValueError("l1_ratio parameter must in [0, 1].") @@ -555,8 +557,10 @@ def _newton_rhapson_for_efron_model( penalizer = ( lambda beta, a: n * 0.5 - * self.penalizer - * (self.l1_ratio * soft_abs(beta, a).sum() + (1 - self.l1_ratio) * ((beta) ** 2).sum()) + * ( + self.l1_ratio * (self.penalizer * soft_abs(beta, a)).sum() + + (1 - self.l1_ratio) * (self.penalizer * beta ** 2).sum() + ) ) d_penalizer = elementwise_grad(penalizer) dd_penalizer = elementwise_grad(d_penalizer) @@ -605,7 +609,7 @@ def _newton_rhapson_for_efron_model( # if the user supplied a non-trivial initial point, we need to delay this. 
self._ll_null_ = ll_ - if self.penalizer > 0: + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: ll_ -= penalizer(beta, 1.3 ** i) g -= d_penalizer(beta, 1.3 ** i) h[np.diag_indices(d)] -= dd_penalizer(beta, 1.3 ** i) @@ -1323,7 +1327,7 @@ def print_summary(self, decimals: int = 2, style: Optional[str] = None, **kwargs headers.append(("weights col", "'%s'" % self.weights_col)) if self.cluster_col: headers.append(("cluster col", "'%s'" % self.cluster_col)) - if self.penalizer > 0: + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: headers.append(("penalizer", self.penalizer)) headers.append(("l1 ratio", self.l1_ratio)) if self.robust or self.cluster_col: diff --git a/lifelines/fitters/generalized_gamma_regression_fitter.py b/lifelines/fitters/generalized_gamma_regression_fitter.py index a11f84e4d..f1af8e425 100644 --- a/lifelines/fitters/generalized_gamma_regression_fitter.py +++ b/lifelines/fitters/generalized_gamma_regression_fitter.py @@ -58,7 +58,10 @@ class GeneralizedGammaRegressionFitter(ParametricRegressionFitter): ----------- alpha: float, optional (default=0.05) the level in the confidence intervals. - + penalizer: float or array, optional (default=0.0) + the penalizer coefficient to the size of the coefficients. See `l1_ratio`. Must be equal to or greater than 0. + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` Examples -------- @@ -126,9 +129,7 @@ def _create_initial_point(self, Ts, E, entries, weights, Xs): # we may use this later in print_summary self._ll_null_ = uni_model.log_likelihood_ - default_point = super(GeneralizedGammaRegressionFitter, self)._create_initial_point( - Ts, E, entries, weights, Xs - ) + default_point = super(GeneralizedGammaRegressionFitter, self)._create_initial_point(Ts, E, entries, weights, Xs) nested_point = {} nested_point["mu_"] = np.array([0.0] * (len(Xs.mappings["mu_"]))) diff --git a/lifelines/fitters/log_logistic_aft_fitter.py b/lifelines/fitters/log_logistic_aft_fitter.py index 1502d8dc1..0d99d6bee 100644 --- a/lifelines/fitters/log_logistic_aft_fitter.py +++ b/lifelines/fitters/log_logistic_aft_fitter.py @@ -35,8 +35,11 @@ class LogLogisticAFTFitter(ParametericAFTRegressionFitter): fit_intercept: boolean, optional (default=True) Allow lifelines to add an intercept column of 1s to df, and ancillary_df if applicable. - penalizer: float, optional (default=0.0) + penalizer: float or array, optional (default=0.0) the penalizer coefficient to the size of the coefficients. See `l1_ratio`. Must be equal to or greater than 0. + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` + l1_ratio: float, optional (default=0.0) how much of the penalizer should be attributed to an l1 penalty (otherwise an l2 penalty). The penalty function looks like diff --git a/lifelines/fitters/log_normal_aft_fitter.py b/lifelines/fitters/log_normal_aft_fitter.py index 75d7d06e2..d8d4e3dca 100644 --- a/lifelines/fitters/log_normal_aft_fitter.py +++ b/lifelines/fitters/log_normal_aft_fitter.py @@ -37,8 +37,10 @@ class LogNormalAFTFitter(ParametericAFTRegressionFitter): fit_intercept: bool, optional (default=True) Allow lifelines to add an intercept column of 1s to df, and ancillary_df if applicable. 
- penalizer: float, optional (default=0.0) + penalizer: float or array, optional (default=0.0) the penalizer coefficient to the size of the coefficients. See `l1_ratio`. Must be equal to or greater than 0. + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` l1_ratio: float, optional (default=0.0) how much of the penalizer should be attributed to an l1 penalty (otherwise an l2 penalty). The penalty function looks like @@ -158,8 +160,7 @@ def predict_percentile( S = norm.sf(Z) return pd.Series( - exp_mu_ * np.exp(np.sqrt(2) * sigma_ * erfinv(2 * (1 - p * S) - 1)) - conditional_after, - index=_get_index(df), + exp_mu_ * np.exp(np.sqrt(2) * sigma_ * erfinv(2 * (1 - p * S) - 1)) - conditional_after, index=_get_index(df) ) def predict_expectation(self, df: pd.DataFrame, ancillary_df: Optional[pd.DataFrame] = None) -> pd.Series: diff --git a/lifelines/fitters/piecewise_exponential_regression_fitter.py b/lifelines/fitters/piecewise_exponential_regression_fitter.py index e96c3743c..5efca2634 100644 --- a/lifelines/fitters/piecewise_exponential_regression_fitter.py +++ b/lifelines/fitters/piecewise_exponential_regression_fitter.py @@ -53,13 +53,13 @@ def __init__(self, breakpoints, alpha=0.05, penalizer=0.0): self.breakpoints = breakpoints self.n_breakpoints = len(self.breakpoints) + assert isinstance(self.penalizer, float), "penalizer must be a float" self.penalizer = penalizer self._fitted_parameter_names = ["lambda_%d_" % i for i in range(self.n_breakpoints + 1)] def _add_penalty(self, params, neg_ll): params_stacked = np.stack(params.values()) coef_penalty = 0 - if self.penalizer > 0: for i in range(params_stacked.shape[1]): if not self._constant_cols[i]: diff --git a/lifelines/fitters/weibull_aft_fitter.py b/lifelines/fitters/weibull_aft_fitter.py index d9370fe46..ad627ba2d 100644 --- a/lifelines/fitters/weibull_aft_fitter.py +++ b/lifelines/fitters/weibull_aft_fitter.py @@ -43,8 +43,10 @@ class WeibullAFTFitter(ParametericAFTRegressionFitter, ProportionalHazardMixin): fit_intercept: boolean, optional (default=True) Allow lifelines to add an intercept column of 1s to df, and ancillary_df if applicable. - penalizer: float, optional (default=0.0) + penalizer: float or array, optional (default=0.0) the penalizer coefficient to the size of the coefficients. See `l1_ratio`. Must be equal to or greater than 0. + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` l1_ratio: float, optional (default=0.0) how much of the penalizer should be attributed to an l1 penalty (otherwise an l2 penalty). 
The penalty function looks like diff --git a/lifelines/tests/test_estimation.py b/lifelines/tests/test_estimation.py index 602dd85f8..b09239bfd 100644 --- a/lifelines/tests/test_estimation.py +++ b/lifelines/tests/test_estimation.py @@ -1481,6 +1481,21 @@ def rossi(self): rossi["_int"] = 1.0 return rossi + def test_penalizer_can_be_an_array(self, rossi): + + wf_array = WeibullAFTFitter(penalizer=0.01 * np.ones(7)).fit(rossi, "week", "arrest") + wf_float = WeibullAFTFitter(penalizer=0.01).fit(rossi, "week", "arrest") + + assert_frame_equal(wf_array.summary, wf_float.summary) + + def test_penalizer_can_be_an_array_and_check_it_behaves_as_expected(self, rossi): + + penalty = np.array([0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]) + wf_array = WeibullAFTFitter(penalizer=penalty).fit(rossi, "week", "arrest") + wf_float = WeibullAFTFitter(penalizer=0.01).fit(rossi, "week", "arrest") + + assert abs(wf_array.summary.loc[("lambda_", "fin"), "coef"]) > abs(wf_float.summary.loc[("lambda_", "fin"), "coef"]) + def test_custom_weibull_model_gives_the_same_data_as_implemented_weibull_model(self, rossi): class CustomWeibull(ParametricRegressionFitter): _scipy_fit_method = "SLSQP" @@ -2534,6 +2549,21 @@ def cph(self): def cph_spline(self): return CoxPHFitter(baseline_estimation_method="spline") + def test_penalizer_can_be_an_array(self, rossi): + + cph_array = CoxPHFitter(penalizer=0.01 * np.ones(7)).fit(rossi, "week", "arrest") + cph_float = CoxPHFitter(penalizer=0.01).fit(rossi, "week", "arrest") + + assert_frame_equal(cph_array.summary, cph_float.summary) + + def test_penalizer_can_be_an_array_and_check_it_behaves_as_expected(self, rossi): + + penalty = np.array([0, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]) + cph_array = CoxPHFitter(penalizer=penalty).fit(rossi, "week", "arrest") + cph_float = CoxPHFitter(penalizer=0.01).fit(rossi, "week", "arrest") + + assert abs(cph_array.summary.loc["fin", "coef"]) > abs(cph_float.summary.loc["fin", "coef"]) + def test_compute_followup_hazard_ratios(self, cph, cph_spline, rossi): cph.fit(rossi, "week", "arrest") cph.compute_followup_hazard_ratios(rossi, [15, 25, 35, 45]) @@ -4353,6 +4383,29 @@ def dfcv(self): def heart(self): return load_stanford_heart_transplants() + def test_penalizer_can_be_an_array(self, dfcv): + + cph_array = CoxTimeVaryingFitter(penalizer=0.01 * np.ones(2)).fit( + dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event" + ) + cph_float = CoxTimeVaryingFitter(penalizer=0.01).fit( + dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event" + ) + + assert_frame_equal(cph_array.summary, cph_float.summary) + + def test_penalizer_can_be_an_array_and_check_it_behaves_as_expected(self, dfcv): + + penalty = np.array([0, 0.01]) + cph_array = CoxTimeVaryingFitter(penalizer=penalty).fit( + dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event" + ) + cph_float = CoxTimeVaryingFitter(penalizer=0.01).fit( + dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event" + ) + + assert abs(cph_array.summary.loc["z", "coef"]) > abs(cph_float.summary.loc["z", "coef"]) + def test_model_can_accept_null_covariates(self, ctv, dfcv): ctv.fit(dfcv[["id", "start", "stop", "event"]], id_col="id", start_col="start", stop_col="stop", event_col="event") From 348281a21bb77b7d0bdd5de73d7731c0fbd1243a Mon Sep 17 00:00:00 2001 From: CamDavidsonPilon Date: Thu, 14 May 2020 13:42:07 -0400 Subject: [PATCH 05/10] refactor footer stuff in Printer --- docs/Survival Regression.rst | 57 ++++++++++++------ lifelines/fitters/__init__.py | 
16 ++++- lifelines/fitters/cox_time_varying_fitter.py | 15 ++++- lifelines/fitters/coxph_fitter.py | 15 ++++- lifelines/utils/printer.py | 63 ++------------------ 5 files changed, 88 insertions(+), 78 deletions(-) diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst index 3471d0fe6..480dd78ac 100644 --- a/docs/Survival Regression.rst +++ b/docs/Survival Regression.rst @@ -103,7 +103,9 @@ The implementation of the Cox model in *lifelines* is under :class:`~lifelines.f prio 3.19 <0.005 9.48 --- Concordance = 0.64 - Log-likelihood ratio test = 33.27 on 7 df, -log2(p)=15.37 + Partial AIC = 1331.50 + log-likelihood ratio test = 33.27 on 7 df + -log2(p) of ll-ratio test = 15.37 """ To access the coefficients and the baseline hazard directly, you can use :attr:`~lifelines.fitters.coxph_fitter.CoxPHFitter.params_` and :attr:`~lifelines.fitters.coxph_fitter.CoxPHFitter.baseline_hazard_` respectively. Taking a look at these coefficients for a moment, ``prio`` (the number of prior arrests) has a coefficient of about 0.09. Thus, a one unit increase in ``prio`` means the the baseline hazard will increase by a factor of :math:`\exp{(0.09)} = 1.10` - about a 10% increase. Recall, in the Cox proportional hazard model, a higher hazard means more at risk of the event occurring. The value :math:`\exp{(0.09)}` is called the *hazard ratio*, a name that will be clear with another example. @@ -181,7 +183,7 @@ Back to our original problem of predicting the event time of censored individual Penalties and sparse regression ----------------------------------------------- -It's possible to add a penalizer term to the Cox regression as well. One can use these to i) stabilize the coefficients, ii) shrink the estimates to 0, iii) encourages a Bayesian interpretation, and iv) create sparse coefficients. Regression models, including the Cox model, include both an L1 and L2 penalty: +It's possible to add a penalizer term to the Cox regression as well. One can use these to i) stabilize the coefficients, ii) shrink the estimates to 0, iii) encourages a Bayesian viewpoint, and iv) create sparse coefficients. Regression models, including the Cox model, include both an L1 and L2 penalty: .. math:: \frac{1}{2} \text{penalizer} \left((1-\text{l1\_ratio}) \cdot ||\beta||_2^2 + \text{l1\_ratio} \cdot ||\beta||_1\right) @@ -204,6 +206,29 @@ To use this in *lifelines*, both the ``penalizer`` and ``l1_ratio`` can be speci cph.print_summary() +Instead of a float, an *array* can be provided that is the same size as the number of estimated parameters. The values in the array +are specific penalty coefficients for each covariate. This is useful for more complicated covariate structure. Some examples: + +i) you have lots of confounders you wish to penalizer, but not the main treatment(s). + +.. code:: python + + from lifelines import CoxPHFitter + from lifelines.datasets import load_rossi + + rossi = load_rossi() + + # variable `fin` is the treatment of interest so don't penalize it at all + penalty = np.array([0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]) + + cph = CoxPHFitter(penalizer=penalty) + cph.fit(rossi, 'week', 'arrest') + cph.print_summary() + +ii) you have to `fuse categories together `_. + + + Plotting the coefficients ------------------------------ @@ -300,15 +325,7 @@ To specify variables to be used in stratification, we define them in the call to .. 
code:: python - from lifelines.datasets import load_rossi - from lifelines import CoxPHFitter - - rossi_dataset = load_rossi() - cph = CoxPHFitter() - cph.fit(rossi_dataset, 'week', event_col='arrest', strata=['race']) - - cph.print_summary() # access the results using cph.summary - +p """ duration col = 'week' @@ -328,8 +345,10 @@ To specify variables to be used in stratification, we define them in the call to paro -0.09 0.92 0.20 -0.44 0.66 0.60 -0.47 0.30 prio 0.09 1.10 0.03 3.21 <0.005 9.56 0.04 0.15 --- - Concordance = 0.64 - Likelihood ratio test = 109.63 on 6 df, -log2(p)=68.48 + Concordance = 0.63 + Partial AIC = 1253.13 + log-likelihood ratio test = 32.73 on 6 df + -log2(p) of ll-ratio test = 16.37 """ cph.baseline_cumulative_hazard_.shape @@ -492,7 +511,9 @@ The Weibull AFT model is implemented under :class:`~lifelines.fitters.weibull_af rho_ _intercept 0.339 1.404 0.089 3.809 <0.0005 12.808 0.165 0.514 --- Concordance = 0.640 - Log-likelihood ratio test = 33.416 on 7 df, -log2(p)=15.462 + AIC = 1377.833 + log-likelihood ratio test = 33.416 on 7 df + -log2(p) of ll-ratio test = 15.462 """ From above, we can see that ``prio``, which is the number of previous incarcerations, has a large negative coefficient. This means that each addition incarcerations changes a subject's mean/median survival time by :math:`\exp(-0.066) = 0.936`, approximately a 7% decrease in mean/median survival time. What is the mean/median survival time? @@ -646,7 +667,7 @@ When predicting time remaining for uncensored individuals, you can use the `cond aft.predict_percentile(X, p=0.9, ancillary_df=censored_X, conditional_after=censored_subjects_last_obs) -There are two hyper-parameters that can be used to to achieve a better test score. These are ``penalizer`` and ``l1_ratio`` in the call to :class:`~lifelines.fitters.weibull_aft_fitter.WeibullAFTFitter`. The penalizer is similar to scikit-learn's ``ElasticNet`` model, see their `docs `_. +There are two hyper-parameters that can be used to to achieve a better test score. These are ``penalizer`` and ``l1_ratio`` in the call to :class:`~lifelines.fitters.weibull_aft_fitter.WeibullAFTFitter`. The penalizer is similar to scikit-learn's ``ElasticNet`` model, see their `docs `_. (However, *lifelines* will also accept an array for custom penalizer per variable, see `Cox docs above `_) .. 
code:: python @@ -680,8 +701,10 @@ There are two hyper-parameters that can be used to to achieve a better test scor _intercept 0.00 1.00 0.19 0.00 1.00 0.00 -0.38 0.38 rho_ _intercept -0.00 1.00 nan nan nan nan nan nan --- - Concordance = 0.60 - Log-likelihood ratio test = -4028.65 on 7 df, -log2(p)=-0.00 + Concordance = 0.64 + AIC = 1377.91 + log-likelihood ratio test = 33.34 on 7 df + -log2(p) of ll-ratio test = 15.42 """ diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py index b103266e8..4da7d9a8b 100644 --- a/lifelines/fitters/__init__.py +++ b/lifelines/fitters/__init__.py @@ -2115,7 +2115,21 @@ def print_summary(self, decimals=2, style=None, **kwargs): ] ) - p = Printer(self, headers, justify, decimals, kwargs) + sr = self.log_likelihood_ratio_test() + footers = [] + footers.extend( + [ + ("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals)), + ("AIC", "{:.{prec}f}".format(self.AIC_, prec=decimals)), + ( + "log-likelihood ratio test", + "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), + ), + ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), + ] + ) + + p = Printer(self, headers, footers, justify, decimals, kwargs) p.print(style=style) diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index cf104b2af..71f568795 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -666,8 +666,21 @@ def print_summary(self, decimals=2, style=None, **kwargs): ] ) - p = Printer(self, headers, justify, decimals, kwargs) + sr = self.log_likelihood_ratio_test() + footers = [] + footers.extend( + [ + ("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals)), + ("Partial AIC", "{:.{prec}f}".format(self.AIC_partial_, prec=decimals)), + ( + "log-likelihood ratio test", + "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), + ), + ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), + ] + ) + p = Printer(self, headers, footers, justify, decimals, kwargs) p.print(style=style) def log_likelihood_ratio_test(self): diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 05113e99b..fdcbcd75d 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -1350,8 +1350,21 @@ def print_summary(self, decimals: int = 2, style: Optional[str] = None, **kwargs ] ) - p = Printer(self, headers, justify, decimals, kwargs) + sr = self.log_likelihood_ratio_test() + footers = [] + footers.extend( + [ + ("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals)), + ("Partial AIC", "{:.{prec}f}".format(self.AIC_partial_, prec=decimals)), + ( + "log-likelihood ratio test", + "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), + ), + ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), + ] + ) + p = Printer(self, headers, footers, justify, decimals, kwargs) p.print(style=style) def _trivial_log_likelihood(self): diff --git a/lifelines/utils/printer.py b/lifelines/utils/printer.py index 7363acb7d..5c80f7efc 100644 --- a/lifelines/utils/printer.py +++ b/lifelines/utils/printer.py @@ -19,7 +19,7 @@ def __init__( self.model = model self.decimals = decimals self.justify = justify - self.footers = footers # TODO: use this variable + self.footers = footers for tuple_ in 
header_kwargs.items(): self.add_to_headers(tuple_) @@ -85,39 +85,8 @@ def to_html(self): }, ) - footers = [] - with np.errstate(invalid="ignore", divide="ignore"): - - try: - if utils.CensoringType.is_right_censoring(self.model) and self.model._KNOWN_MODEL: - footers.append(("Concordance", "{:.{prec}f}".format(self.model.score_, prec=decimals))) - except AttributeError: - pass - - try: - footers.append(("AIC", "{:.{prec}f}".format(self.model.AIC_, prec=decimals))) - except AttributeError: - pass - - try: - footers.append(("Partial AIC", "{:.{prec}f}".format(self.model.AIC_partial_, prec=decimals))) - except AttributeError: - pass - - try: - sr = self.model.log_likelihood_ratio_test() - footers.append( - ( - "Log-likelihood ratio test", - "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), - ) - ) - footers.append(("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals))) - except AttributeError: - pass - - if footers: - footer_df = pd.DataFrame.from_records(footers).set_index(0) + if self.footers: + footer_df = pd.DataFrame.from_records(self.footers).set_index(0) footer_html = footer_df.to_html(header=False, notebook=True, index_names=False) else: footer_html = "" @@ -194,28 +163,6 @@ def ascii_print(self): with np.errstate(invalid="ignore", divide="ignore"): print("---") - try: - if utils.CensoringType.is_right_censoring(self.model) and self.model._KNOWN_MODEL: - print("Concordance = {:.{prec}f}".format(self.model.concordance_index_, prec=decimals)) - except AttributeError: - pass - try: - print("AIC = {:.{prec}f}".format(self.model.AIC_, prec=decimals)) - except AttributeError: - pass - - try: - print("Partial AIC = {:.{prec}f}".format(self.model.AIC_partial_, prec=decimals)) - except AttributeError: - pass - - try: - sr = self.model.log_likelihood_ratio_test() - print( - "Log-likelihood ratio test = {:.{prec}f} on {} df, -log2(p)={:.{prec}f}".format( - sr.test_statistic, sr.degrees_freedom, -np.log2(sr.p_value), prec=decimals - ) - ) - except AttributeError: - pass + for string, value in self.footers: + print("{} = {}".format(string, value)) print() From 9660f6c42187d75b05bb9f228b3c319fdc44bc84 Mon Sep 17 00:00:00 2001 From: CamDavidsonPilon Date: Fri, 15 May 2020 09:40:20 -0400 Subject: [PATCH 06/10] adding new functionality to find_best.. 
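
find_best_parametric_model gains left- and interval-censoring support via a
new `censoring_type` kwarg, plus pass-through `weights`, `entry`, `timeline`,
`alpha` and `ci_labels` arguments. A sketch of the intended usage, mirroring
the new tests below (argument names as introduced in this patch):

    model, score = find_best_parametric_model(T, censoring_type="left")
    model, score = find_best_parametric_model((T_lower, T_upper), censoring_type="interval")
    model, score = find_best_parametric_model(T, weights=W, entry=entry)
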
--- lifelines/tests/utils/test_utils.py | 21 ++++++++ lifelines/utils/__init__.py | 83 ++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 12 deletions(-) diff --git a/lifelines/tests/utils/test_utils.py b/lifelines/tests/utils/test_utils.py index 93c4c5724..e616ab3da 100644 --- a/lifelines/tests/utils/test_utils.py +++ b/lifelines/tests/utils/test_utils.py @@ -1065,6 +1065,27 @@ def test_find_best_parametric_model_with_BIC(): assert True +def test_find_best_parametric_model_works_for_left_censoring(): + T = np.random.exponential(2, 100) + model, score = utils.find_best_parametric_model(T, censoring_type="left", show_progress=True) + assert True + + +def test_find_best_parametric_model_works_for_interval_censoring(): + T_1 = np.random.exponential(2, 100) + T_2 = T_1 + 1 + model, score = utils.find_best_parametric_model((T_1, T_2), censoring_type="interval", show_progress=True) + assert True + + +def test_find_best_parametric_model_works_with_weights_and_entry(): + T = np.random.exponential(2, 100) + W = np.random.randint(1, 5, size=100) + entry = np.random.exponential(0.01, 100) + model, score = utils.find_best_parametric_model(T, weights=W, entry=entry, show_progress=True) + assert True + + def test_safe_exp(): from lifelines.utils.safe_exp import MAX diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 724d08aa7..f5cf5e47b 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -42,6 +42,10 @@ class CensoringType: INTERVAL = 2 RIGHT = 3 + MAP = {"right": RIGHT, "left": LEFT, "interval": INTERVAL} + + HUMAN_MAP = {LEFT: "left", RIGHT: "right", INTERVAL: "interval"} + @classmethod def right_censoring(cls, function: Callable) -> Callable: @wraps(function) @@ -84,11 +88,13 @@ def is_interval_censoring(cls, model) -> bool: @classmethod def get_human_readable_censoring_type(cls, model) -> str: if cls.is_interval_censoring(model): - return "interval" + return cls.HUMAN_MAP[cls.INTERVAL] elif cls.is_right_censoring(model): - return "right" + return cls.HUMAN_MAP[cls.RIGHT] + elif cls.is_left_censoring(model): + return cls.HUMAN_MAP[cls.LEFT] else: - return "left" + return class StatError(Exception): @@ -1675,7 +1681,19 @@ def iterdicts(self): yield DataframeSliceDict(x.to_frame().T, self.mappings) -def find_best_parametric_model(event_times, event_observed=None, scoring_method: str = "AIC", additional_models=None): +def find_best_parametric_model( + event_times, + event_observed=None, + scoring_method: str = "AIC", + additional_models=None, + censoring_type="right", + timeline=None, + alpha=None, + ci_labels=None, + entry=None, + weights=None, + show_progress=False, +): """ To quickly determine the best¹ univariate model, this function will iterate through each parametric model available in lifelines and select the one that minimizes a particular measure of fit. @@ -1685,13 +1703,27 @@ def find_best_parametric_model(event_times, event_observed=None, scoring_method: Parameters ------------- event_times: list, np.array, pd.Series - a (n,) array of observed survival times. + a (n,) array of observed survival times. If interval censoring, a tuple of (lower_bound, upper_bound). event_observed: list, np.array, pd.Series a (n,) array of censored flags, 1 if observed, 0 if not. Default None assumes all observed. scoring_method: string one of {"AIC", "BIC"} additional_models: list list of other parametric models that implement the lifelines API. 
+    censoring_type: str
+        one of {"right", "left", "interval"}
+    timeline: list, optional
+        return the model at the values in timeline (positively increasing)
+    alpha: float, optional
+        the alpha value in the confidence intervals. Overrides the initializing
+        alpha for this call to fit only.
+    ci_labels: list, optional
+        add custom column names to the generated confidence intervals as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: