From 15547cd19826b4487b6576d2cff6c12614ef1e27 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 10:29:37 -0400
Subject: [PATCH 1/4] some questionable improvements

---
 lifelines/fitters/cox_time_varying_fitter.py | 88 +++++++++++++++-----
 lifelines/fitters/coxph_fitter.py            |  2 +-
 lifelines/utils/__init__.py                  |  4 +-
 lifelines/version.py                         |  2 +-
 4 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 3c2c15d14..dfa09e913 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -2,6 +2,8 @@
 from __future__ import print_function
 from __future__ import division
 import warnings
+import time
+
 import numpy as np
 import pandas as pd
 from scipy import stats
@@ -58,6 +60,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
         self.confidence_intervals_ = self._compute_confidence_intervals()
         self._n_examples = df.shape[0]
+        self._n_unique = df.index.unique().shape[0]

         return self

@@ -110,7 +113,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df

-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
+    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-5):
         """
         Newton Rhaphson algorithm for fitting CPH model.
@@ -137,6 +140,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
         i = 0
         converging = True
         ll, previous_ll = 0, 0
+        start = time.time()

         step_sizer = StepSizer(step_size)
         step_size = step_sizer.next()
@@ -158,7 +162,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f" % (i, norm(delta), step_size, ll))
+                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:
@@ -204,16 +208,12 @@ def _get_gradients(self, df, stops_events, beta):
         _, d = df.shape
         hessian = np.zeros((d, d))
-        gradient = np.zeros((1, d))
+        gradient = np.zeros(d)
         log_lik = 0

         unique_death_times = np.unique(stops_events['stop'].loc[stops_events['event']])

         for t in unique_death_times:
-            x_death_sum = np.zeros((1, d))
-            tie_phi, risk_phi = 0, 0
-            risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
-            risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

             ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
             df_at_t = df.loc[ix]
@@ -224,9 +224,9 @@ def _get_gradients(self, df, stops_events, beta):
             phi_x_x_i = dot(df_at_t.T, phi_x_i)  # dot(df_at_t.T, phi_i * df_at_t)

             # Calculate sums of Risk set
-            risk_phi += phi_i.sum()
-            risk_phi_x += phi_x_i.sum(0).values.reshape((1, d))
-            risk_phi_x_x += phi_x_x_i
+            risk_phi = phi_i.sum()
+            risk_phi_x = phi_x_i.sum(0).values
+            risk_phi_x_x = phi_x_x_i

             # Calculate the sums of Tie set
             deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t)
@@ -234,15 +234,15 @@ def _get_gradients(self, df, stops_events, beta):
             death_counts = deaths.sum()

             xi_deaths = df_at_t.loc[deaths]

-            x_death_sum += xi_deaths.sum(0).values.reshape((1, d))
+            x_death_sum = xi_deaths.sum(0).values

             if death_counts > 1:
                 # it's faster if we can skip computing these when we don't need to.
-                tie_phi += phi_i[deaths.values].sum()
-                tie_phi_x += phi_x_i.loc[deaths].sum(0).values.reshape((1, d))
-                tie_phi_x_x += dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)
+                tie_phi = phi_i[deaths.values].sum()
+                tie_phi_x = phi_x_i.loc[deaths].sum(0).values
+                tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)

-            partial_gradient = np.zeros((1, d))
+            partial_gradient = np.zeros(d)

             for l in range(death_counts):
@@ -262,7 +262,7 @@ def _get_gradients(self, df, stops_events, beta):
                 partial_gradient += z / denom
                 # In case z and denom both are really small numbers,
                 # make sure to do division before multiplications
-                a2 = dot(z.T / denom, z / denom)
+                a2 = np.outer(z / denom, z / denom)

                 hessian -= (a1 - a2)

@@ -270,8 +270,49 @@ def _get_gradients(self, df, stops_events, beta):

             # Values outside tie sum
             gradient += x_death_sum - partial_gradient
-            log_lik += dot(x_death_sum, beta)[0][0]
-        return hessian, gradient, log_lik
+            log_lik += dot(x_death_sum, beta)[0]
+
+        return hessian, gradient.reshape(1, d), log_lik
+
+    def predict_log_partial_hazard(self, X):
+        """
+        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
+            can be in any order. If a numpy array, columns must be in the
+            same order as the training data.
+
+        This is equivalent to R's linear.predictors.
+        Returns the log of the partial hazard for the individuals, partial since the
+        baseline hazard is not included. Equal to \beta (X - \bar{X})
+
+        If X is a dataframe, the order of the columns does not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+        """
+        if isinstance(X, pd.DataFrame):
+            order = self.hazards_.columns
+            X = X[order]
+
+        index = _get_index(X)
+        X = normalize(X, self._norm_mean.values, 1)
+        return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
+
+
+    def predict_partial_hazard(self, X):
+        """
+        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
+            can be in any order. If a numpy array, columns must be in the
+            same order as the training data.
+
+
+        If X is a dataframe, the order of the columns does not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+
+        Returns the partial hazard for the individuals, partial since the
+        baseline hazard is not included.
+        Equal to \exp{\beta (X - \bar{X})}
+        """
+        return exp(self.predict_log_partial_hazard(X))
+

     def print_summary(self):
         """
@@ -283,7 +324,7 @@ def print_summary(self):
         df[''] = [significance_code(p) for p in df['p']]

         # Print information about data first
-        print('n={}, number of events={}'.format(self._n_examples,
+        print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
                                                  np.where(self.event_observed)[0].shape[0]),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
@@ -324,3 +365,12 @@ def plot(self, standardized=False, **kwargs):
         plt.yticks(yaxis_locations, tick_labels)
         plt.xlabel("standardized coef" if standardized else "coef")
         return ax
+
+    def __repr__(self):
+        classname = self.__class__.__name__
+        try:
+            s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
+                classname, self._n_examples, self._n_unique, self.event_observed.shape[0] - np.where(self.event_observed)[0].shape[0])
+        except AttributeError:
+            s = """<lifelines.%s>""" % classname
+        return s
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index c9979d591..e7e487c38 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -336,7 +336,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 continue

             # There was atleast one event and no more ties remain. Time to sum.
-            partial_gradient = np.zeros((1, d))
+            partial_gradient = np.zeros((tie_count, d))

             for l in range(tie_count):
                 c = l / tie_count
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 69035da1d..95ac64fdb 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1005,9 +1005,9 @@ def concordance_value(time_a, time_b, pred_a, pred_b):

 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
+    nonnumeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
     if len(nonnumeric_cols) > 0:
-        raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
+        raise TypeError("DataFrame contains nonnumeric columns: %s. Try converting boolean column(s) to integers, or using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)


 def check_for_overlapping_intervals(df):
diff --git a/lifelines/version.py b/lifelines/version.py
index 789b2f93c..c1fdf9229 100644
--- a/lifelines/version.py
+++ b/lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '0.14.1'
+__version__ = '0.14.2'

From 55140c2132e7e819f00cffec4f297a63fcc456db Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:01:36 -0400
Subject: [PATCH 2/4] simpler print messages

---
 lifelines/fitters/cox_time_varying_fitter.py | 4 ++--
 lifelines/fitters/coxph_fitter.py            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index dfa09e913..7bdded1ec 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -325,7 +325,7 @@ def print_summary(self):
         # Print information about data first
         print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
-                                                                    np.where(self.event_observed)[0].shape[0]),
+                                                                    self.event_observed.sum()),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
@@ -370,7 +370,7 @@ def __repr__(self):
         classname = self.__class__.__name__
         try:
             s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
-                classname, self._n_examples, self._n_unique, self.event_observed.shape[0] - np.where(self.event_observed)[0].shape[0])
+                classname, self._n_examples, self._n_unique, self.event_observed.sum())
         except AttributeError:
             s = """<lifelines.%s>""" % classname
         return s
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index e7e487c38..fc0899a4d 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -336,7 +336,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 continue

             # There was atleast one event and no more ties remain. Time to sum.
-            partial_gradient = np.zeros((tie_count, d))
+            partial_gradient = np.zeros((1, d))

             for l in range(tie_count):
                 c = l / tie_count
@@ -433,7 +433,7 @@ def print_summary(self):
         # Print information about data first
         print('n={}, number of events={}'.format(self._n_examples,
-                                                 np.where(self.event_observed)[0].shape[0]),
+                                                 self.event_observed.sum()),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation

From dc3a57762eadde90e74a531aca5a69134d402f1b Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:12:44 -0400
Subject: [PATCH 3/4] keep current limits

---
 lifelines/fitters/cox_time_varying_fitter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 7bdded1ec..56cf15a1f 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -113,7 +113,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df

-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-5):
+    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
         """
         Newton Rhaphson algorithm for fitting CPH model.
@@ -162,7 +162,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.6f, step_size = %.6f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:

From 53d616e01b4a2798acacaccf7b0e74563e6b00bd Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:52:33 -0400
Subject: [PATCH 4/4] adding docs and a bool test

---
 lifelines/fitters/cox_time_varying_fitter.py | 26 +++++++++++++++++++-
 lifelines/fitters/coxph_fitter.py            |  1 +
 lifelines/utils/__init__.py                  |  4 +--
 tests/test_estimation.py                     | 10 ++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 56cf15a1f..601c9572d 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -36,6 +36,29 @@ def __init__(self, alpha=0.95, penalizer=0.0):
         self.penalizer = penalizer

     def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
+        """
+        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
+        are handled using Efron's tie-method.
+
+        Parameters:
+          df: a Pandas dataframe in long (time varying) format, with the necessary
+             columns `id_col`, `event_col`, `start_col` and `stop_col`, plus
+             other covariates. `event_col` refers to whether
+             the 'death' event was observed: 1 if observed, 0 else (censored).
+          id_col: A subject could have multiple rows in the dataframe. This column contains
+             the unique identifier per subject.
+          event_col: the column in dataframe that contains the subjects' death
+             observation. If left as None, assume all individuals are non-censored.
+          start_col: the column that contains the start of a subject's time period.
+          stop_col: the column that contains the end of a subject's time period.
+          show_progress: since the fitter is iterative, show convergence
+             diagnostics.
+          step_size: set an initial step size for the fitting algorithm.
+
+        Returns:
+            self, with additional properties: hazards_
+
+        """
         df = df.copy()

         if not (id_col in df and event_col in df and start_col in df and stop_col in df):
@@ -50,6 +73,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr

         stop_times_events = df[["event", "stop", "start"]]
         df = df.drop(["event", "stop", "start"], axis=1)
+        df = df.astype(float)

         self._norm_mean = df.mean(0)
         self._norm_std = df.std(0)
@@ -162,7 +186,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.6f, step_size = %.6f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index fc0899a4d..3bc3cb8b4 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -83,6 +83,7 @@ def fit(self, df, duration_col, event_col=None,
              catagorical covariate does not obey the proportional hazard assumption. This
              is used similar to the `strata` expression in R.
              See http://courses.washington.edu/b515/l17.pdf.
+          step_size: set an initial step size for the fitting algorithm.

         Returns:
             self, with additional properties: hazards_
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 95ac64fdb..29bf9e8d5 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1005,9 +1005,9 @@ def concordance_value(time_a, time_b, pred_a, pred_b):

 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+    nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
     if len(nonnumeric_cols) > 0:
-        raise TypeError("DataFrame contains nonnumeric columns: %s. Try converting boolean column(s) to integers, or using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
+        raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)


 def check_for_overlapping_intervals(df):
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 7f738b2d2..85e748fa5 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1482,6 +1482,16 @@ def test_fitter_will_raise_an_error_if_overlapping_intervals(self, ctv):
         with pytest.raises(ValueError):
             ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")

+    def test_fitter_accept_boolean_columns(self, ctv):
+        df = pd.DataFrame.from_records([
+            {'id': 1, 'start': 0, 'stop': 5, 'var': -1.2, 'bool': True, 'event': 1},
+            {'id': 2, 'start': 0, 'stop': 5, 'var': 1.3, 'bool': False, 'event': 1},
+            {'id': 3, 'start': 0, 'stop': 5, 'var': -1.3, 'bool': False, 'event': 1},
+        ])
+
+        ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")
+
+
     def test_warning_is_raised_if_df_has_a_near_constant_column(self, ctv, dfcv):
         dfcv['constant'] = 1.0
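
For reviewers who want to try this series out, below is a minimal usage sketch of the CoxTimeVaryingFitter API the patches touch (fit on a long-format dataframe with a boolean covariate, the iteration diagnostics, print_summary's periods/uniques/events report, and the new predict_partial_hazard method). The toy dataframe, its column names ('var', 'on_drug'), and the values are illustrative only and are not taken from the patches; it assumes a lifelines checkout with this series applied (~0.14.2).

    # a sketch, not part of the patch series
    import pandas as pd
    from lifelines import CoxTimeVaryingFitter

    # toy long-format data: one row per (subject, interval); boolean covariates are allowed
    df = pd.DataFrame.from_records([
        {'id': 1, 'start': 0, 'stop': 4, 'var': -1.2, 'on_drug': False, 'event': 0},
        {'id': 1, 'start': 4, 'stop': 8, 'var': -1.2, 'on_drug': True,  'event': 1},
        {'id': 2, 'start': 0, 'stop': 5, 'var': 1.3,  'on_drug': False, 'event': 1},
        {'id': 3, 'start': 0, 'stop': 9, 'var': -0.5, 'on_drug': True,  'event': 0},
        {'id': 4, 'start': 0, 'stop': 3, 'var': 0.8,  'on_drug': True,  'event': 1},
    ])

    ctv = CoxTimeVaryingFitter()
    # show_progress=True prints the per-iteration norm_delta / step_size / ll lines tweaked above
    ctv.fit(df, id_col='id', event_col='event', start_col='start', stop_col='stop', show_progress=True)
    ctv.print_summary()  # reports periods, unique subjects, and number of events

    # partial hazards for the covariate columns only (id/start/stop/event are not covariates)
    covariates = df.drop(['id', 'start', 'stop', 'event'], axis=1)
    print(ctv.predict_partial_hazard(covariates))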