From 423e1d075b0ecf32b96966b2f9c2758272229849 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 22:04:23 -0400 Subject: [PATCH 1/5] removing unecessary indexing --- lifelines/estimation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lifelines/estimation.py b/lifelines/estimation.py index bc6fbce90..d6d2c9f60 100644 --- a/lifelines/estimation.py +++ b/lifelines/estimation.py @@ -493,12 +493,13 @@ def _fit_static(self, dataframe, duration_col="T", event_col="E", C = pd.Series(df[event_col].values, dtype=bool, index=ids) T = pd.Series(df[duration_col].values, index=ids) - df = df.set_index([duration_col, id_col]) + df = df.set_index(id_col) ix = T.argsort() T, C = T.iloc[ix], C.iloc[ix] del df[event_col] + del df[duration_col] n, d = df.shape columns = df.columns From c09528a0cd63dd49e320f2478e82e1b8f38d2de8 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 23:05:41 -0400 Subject: [PATCH 2/5] 0.4.2: better CoxPHFitter, new prediction methods, new lifelines.datasets methods --- CHANGELOG.md | 11 ++++++++-- docs/Intro to lifelines.rst | 3 ++- docs/Quickstart.rst | 10 +++++---- docs/survival_regression.rst | 6 +++-- lifelines/datasets.py | 17 +++++++-------- lifelines/estimation.py | 24 ++++++++++++++++---- lifelines/tests/test_suite.py | 41 ++++++++++++++++++++++++----------- lifelines/utils.py | 9 ++++++-- setup.py | 2 +- 9 files changed, 85 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 348516bc2..2e0ec8492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,15 @@ ### Changelogs + + ####0.4.2 - - - Add Breslows method to CoxPHFitter. + + - Massive speed improvements to CoxPHFitter. + - Additional prediction method: `predict_percentile` is available on CoxPHFitter and AalenAdditiveFitter. Given a percentile, p, this function returns the value t such that *S(t | x) = p*. It is a generalization of `predict_median`. + - Additional kwargs in `k_fold_cross_validation` that will accept different prediction methods (default is `predict_median`). +- Bug fix in CoxPHFitter `predict_expectation` function. +- Correct spelling mistake in newton-rhapson algorithm. +- `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. ####0.4.1.1 diff --git a/docs/Intro to lifelines.rst b/docs/Intro to lifelines.rst index 4bf546d8c..5e51f99ce 100644 --- a/docs/Intro to lifelines.rst +++ b/docs/Intro to lifelines.rst @@ -587,7 +587,8 @@ instruments could only detect the measurement was *less* than some upperbound. .. code:: python - from lifelines.datasets import lcd_dataset + from lifelines.datasets import generate_lcd_dataset + lcd_dataset = generate_lcd_dataset() T = lcd_dataset['alluvial_fan']['T'] C = lcd_dataset['alluvial_fan']['C'] #boolean array, True if observed. diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst index 3434eec26..430c2b6d2 100644 --- a/docs/Quickstart.rst +++ b/docs/Quickstart.rst @@ -23,10 +23,11 @@ Let's start by importing some data. We need the durations that individuals are o .. code:: python - from lifelines.datasets import waltons_dataset + from lifelines.datasets import generate_waltons_dataset + data = generate_waltons_dataset() - T = waltons_dataset['T'] - E = waltons_dataset['E'] + T = data['T'] + E = data['E'] ``T`` is an array of durations, ``E`` is a either boolean or binary array representing whether the "death" was observed (alternatively an individual can be censored). 
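The `predict_percentile` changelog entry above is easiest to see in code. A minimal sketch, assuming the `CoxPHFitter` API exercised by the test suite later in this series — the covariate-selection idiom and the `p` keyword are taken from the `utils.py` and `test_suite.py` hunks below, not from this hunk:

.. code:: python

    from lifelines.estimation import CoxPHFitter
    from lifelines.datasets import generate_regression_dataset

    df = generate_regression_dataset()

    cf = CoxPHFitter()
    cf.fit(df, duration_col='T', event_col='E')

    # covariates only, mirroring the column selection used in k_fold_cross_validation
    X = df[df.columns - ['T', 'E']]

    cf.predict_median(X)             # t such that S(t | x) = 0.5
    cf.predict_percentile(X, p=0.6)  # the generalization: t such that S(t | x) = 0.6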
@@ -85,7 +86,8 @@ While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they .. code:: python - from lifelines.datasets import regression_dataset + from lifelines.datasets import generate_regression_dataset + regression_dataset = generate_regression_dataset() regression_dataset.head() diff --git a/docs/survival_regression.rst b/docs/survival_regression.rst index 788f0d643..3847298e6 100644 --- a/docs/survival_regression.rst +++ b/docs/survival_regression.rst @@ -395,9 +395,10 @@ This example data is from the paper `here Date: Wed, 18 Jun 2014 23:07:59 -0400 Subject: [PATCH 3/5] changelong --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e0ec8492..a327bdd4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ - Bug fix in CoxPHFitter `predict_expectation` function. - Correct spelling mistake in newton-rhapson algorithm. - `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. - +- Bumping up the number of samples in statistical tests to prevent them from failing so often (this a stop-gap) ####0.4.1.1 From 67634ab43b1fa50fdd31a836495c12f03f982d3d Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 23:13:37 -0400 Subject: [PATCH 4/5] pep8 everything --- CHANGELOG.md | 1 + lifelines/datasets.py | 13 +++++-------- lifelines/estimation.py | 15 ++++++++------- lifelines/statistics.py | 12 ++++++------ lifelines/tests/test_suite.py | 25 ++++++++++++------------- lifelines/utils.py | 27 ++++++++++++++------------- 6 files changed, 46 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a327bdd4e..1316c69b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Correct spelling mistake in newton-rhapson algorithm. - `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. 
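As the `datasets` entry above notes, each bundled dataset is now produced by a generator function rather than imported as a module-level constant. A short sketch of the new calling convention — the column and key names are taken from the doc and test changes elsewhere in this series, not guaranteed by this hunk:

.. code:: python

    from lifelines.datasets import generate_waltons_dataset, generate_lcd_dataset

    waltons = generate_waltons_dataset()   # DataFrame with 'T', 'E', 'group' columns
    mir137_T = waltons[waltons['group'] == 'miR-137']['T']

    lcd = generate_lcd_dataset()           # keyed by site, e.g. 'alluvial_fan'
    T, C = lcd['alluvial_fan']['T'], lcd['alluvial_fan']['C']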
- Bumping up the number of samples in statistical tests to prevent them from failing so often (this a stop-gap) +- pep8 everything ####0.4.1.1 diff --git a/lifelines/datasets.py b/lifelines/datasets.py index 902c248c1..7454c587b 100644 --- a/lifelines/datasets.py +++ b/lifelines/datasets.py @@ -3,11 +3,11 @@ import numpy as np from io import StringIO -__all__ = [ 'generate_waltons_dataset', - 'generate_regression_dataset', - 'generate_dd_dataset', - 'generate_lcd_dataset', - 'generate_rossi_dataset'] +__all__ = ['generate_waltons_dataset', + 'generate_regression_dataset', + 'generate_dd_dataset', + 'generate_lcd_dataset', + 'generate_rossi_dataset'] def generate_lcd_dataset(): @@ -2548,6 +2548,3 @@ def generate_regression_dataset(): 0.031865 1.753759 0.252040 8.519852 1 1.631269 1.588621 3.709899 4.480448 1 """), header=0, delim_whitespace=True) - - - diff --git a/lifelines/estimation.py b/lifelines/estimation.py index d9f405a9b..2cc11e7a2 100644 --- a/lifelines/estimation.py +++ b/lifelines/estimation.py @@ -139,7 +139,7 @@ def smoothed_hazard_(self, bandwidth): hazard_name = "smoothed-" + cumulative_hazard_name hazard_ = self.cumulative_hazard_.diff().fillna(self.cumulative_hazard_.iloc[0]) C = (hazard_[cumulative_hazard_name] != 0.0).values - return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth), hazard_.values[C,:]), + return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth), hazard_.values[C,:]), columns=[hazard_name], index=timeline) def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None): @@ -156,7 +156,7 @@ def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None): self._cumulative_sq.iloc[0] = 0 var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0]) C = (var_hazard_.values != 0.0) # only consider the points with jumps - std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth)**2, var_hazard_.values[C])) + std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth)**2, var_hazard_.values[C])) values = { self.ci_labels[0]: hazard_ * np.exp(alpha2 * std_hazard_ / hazard_), self.ci_labels[1]: hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_) @@ -747,6 +747,7 @@ def predict_expectation(self, X): class CoxPHFitter(BaseFitter): + """ This class implements fitting Cox's proportional hazard model: @@ -802,7 +803,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False): # Iterate backwards to utilize recursive relationship for i, (ti, ei) in reversed(list(enumerate(zip(T, E)))): # Doing it like this to preserve shape - xi = X[i:i+1] + xi = X[i:i + 1] # Calculate phi values phi_i = exp(dot(xi, beta)) phi_x_i = dot(phi_i, xi) @@ -823,7 +824,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False): # Keep track of count tie_count += 1 - if i > 0 and T[i-1] == ti: + if i > 0 and T[i - 1] == ti: # There are more ties/members of the risk set continue elif tie_count == 0: @@ -999,7 +1000,7 @@ def _compute_confidence_intervals(self): def _compute_standard_errors(self): se = np.sqrt(inv(-self._hessian_).diagonal()) - return pd.DataFrame(se[None, :], + return pd.DataFrame(se[None,:], index=['se'], columns=self.hazards_.columns) def _compute_z_values(self): @@ -1207,10 +1208,10 @@ def qth_survival_times(q, survival_functions): sv_b = (1.0 * (survival_functions < 
q)).cumsum() > 0 try: v = sv_b.idxmax(0) - v[sv_b.iloc[-1, :] == 0] = np.inf + v[sv_b.iloc[-1,:] == 0] = np.inf except: v = sv_b.argmax(0) - v[sv_b[-1, :] == 0] = np.inf + v[sv_b[-1,:] == 0] = np.inf return v diff --git a/lifelines/statistics.py b/lifelines/statistics.py index d8f0d28f7..898fda132 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -53,8 +53,8 @@ def concordance_index(event_times, predicted_event_times, event_observed=None): event_observed) -def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None, - alpha=0.95, t_0=-1, suppress_print=False, **kwargs): +def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None, + alpha=0.95, t_0=-1, suppress_print=False, **kwargs): """ Measures and reports on whether two intensity processes are different. That is, given two event series, determines whether the data generating processes are statistically different. @@ -91,8 +91,8 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse event_times = np.r_[event_times_A, event_times_B] groups = np.r_[np.zeros(event_times_A.shape[0]), np.ones(event_times_B.shape[0])] event_observed = np.r_[event_observed_A, event_observed_B] - return multivariate_logrank_test(event_times, groups, event_observed, - alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs) + return multivariate_logrank_test(event_times, groups, event_observed, + alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs) def pairwise_logrank_test(event_durations, groups, event_observed=None, @@ -175,7 +175,7 @@ def pairwise_logrank_test(event_durations, groups, event_observed=None, return [pd.DataFrame(x, columns=unique_groups, index=unique_groups) for x in [S, P, T]] -def multivariate_logrank_test(event_durations, groups, event_observed=None, +def multivariate_logrank_test(event_durations, groups, event_observed=None, alpha=0.95, t_0=-1, suppress_print=False, **kwargs): """ This test is a generalization of the logrank_test: it can deal with n>2 populations (and should @@ -238,7 +238,7 @@ def multivariate_logrank_test(event_durations, groups, event_observed=None, df=n_groups - 1, **kwargs) if not suppress_print: - print(summary) + print(summary) return summary, p_value, test_result diff --git a/lifelines/tests/test_suite.py b/lifelines/tests/test_suite.py index d675f417b..c3bac402f 100644 --- a/lifelines/tests/test_suite.py +++ b/lifelines/tests/test_suite.py @@ -28,7 +28,7 @@ from ..plotting import plot_lifetimes from ..utils import * from ..datasets import generate_lcd_dataset, generate_rossi_dataset, \ - generate_waltons_dataset, generate_regression_dataset + generate_waltons_dataset, generate_regression_dataset class MiscTests(unittest.TestCase): @@ -98,15 +98,15 @@ def test_cross_validator(self): def test_cross_validator_with_predictor(self): cf = CoxPHFitter() - k_fold_cross_validation(cf, generate_regression_dataset(), + k_fold_cross_validation(cf, generate_regression_dataset(), duration_col='T', event_col='E', k=3, predictor="predict_expectation") def test_cross_validator_with_predictor_and_kwargs(self): cf = CoxPHFitter() - k_fold_cross_validation(cf, generate_regression_dataset(), + k_fold_cross_validation(cf, generate_regression_dataset(), duration_col='T', event_col='E', k=3, - predictor="predict_percentile", predictor_kwargs={'p':0.6}) + predictor="predict_percentile", predictor_kwargs={'p': 0.6}) class StatisticalTests(unittest.TestCase): @@ -241,18 +241,17 @@ def 
test_pairwise_logrank_test(self): npt.assert_array_equal(R, V) def test_multivariate_inputs(self): - T = np.array([1,2,3]) - E = np.array([1,1,0], dtype=bool) - G = np.array([1,2,1]) - multivariate_logrank_test(T,G,E) - pairwise_logrank_test(T,G,E) + T = np.array([1, 2, 3]) + E = np.array([1, 1, 0], dtype=bool) + G = np.array([1, 2, 1]) + multivariate_logrank_test(T, G, E) + pairwise_logrank_test(T, G, E) T = pd.Series(T) E = pd.Series(E) G = pd.Series(G) - multivariate_logrank_test(T,G,E) - pairwise_logrank_test(T,G,E) - + multivariate_logrank_test(T, G, E) + pairwise_logrank_test(T, G, E) def test_lists_to_KaplanMeierFitter(self): T = [2, 3, 4., 1., 6, 5.] @@ -798,7 +797,7 @@ def test_output_against_R(self): OBSERVED = np.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0]) N = len(LIFETIMES) -#walton's data +# walton's data waltons_dataset = generate_waltons_dataset() ix = waltons_dataset['group'] == 'miR-137' waltonT1 = waltons_dataset.ix[ix]['T'] diff --git a/lifelines/utils.py b/lifelines/utils.py index 0bc92dab5..ea7fb1410 100644 --- a/lifelines/utils.py +++ b/lifelines/utils.py @@ -88,7 +88,7 @@ def group_survival_table_from_events(groups, durations, event_observed, min_obse g_name = str(g) data = data.join(survival_table_from_events(T, C, B, columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name]), - how='outer') + how='outer') data = data.fillna(0) # hmmm pandas its too bad I can't do data.ix[:limit] and leave out the if. if int(limit) != -1: @@ -265,10 +265,10 @@ def AandS_approximation(p): def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', - k=5, loss_function="concordance", predictor="predict_median", - predictor_kwargs={}): + k=5, loss_function="concordance", predictor="predict_median", + predictor_kwargs={}): """ - Perform cross validation on a dataset. + Perform cross validation on a dataset. fitter: either an instance of AalenAdditiveFitter or CoxPHFitter. df: a Pandas dataframe with necessary columns `duration_col` and `event_col`, plus @@ -279,28 +279,28 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', k: the number of folds to perform. n/k data will be withheld for testing on. loss_function: "concordance" only. "concordance": concordance index (C-index) between two series of event times - predictor: a string that matches a prediction method on the fitter instances. For example, + predictor: a string that matches a prediction method on the fitter instances. For example, "predict_mean" or "predict_percentile". Default is "predict_median" predictor_kwargs: keyward args to pass into predictor. Returns: - (k,1) array of scores for each fold. + (k,1) array of scores for each fold. 
""" from .statistics import concordance_index - n,d = df.shape + n, d = df.shape scores = np.zeros((k,)) df = df.copy() - df = df.reindex(np.random.permutation(df.index)).sort(event_col) + df = df.reindex(np.random.permutation(df.index)).sort(event_col) - assignments = np.array((n//k + 1)*list(range(1,k+1))) + assignments = np.array((n // k + 1) * list(range(1, k + 1))) assignments = assignments[:n] testing_columns = df.columns - [duration_col, event_col] - for i in range(1,k+1): + for i in range(1, k + 1): - ix = assignments == i + ix = assignments == i training_data = df.ix[~ix] testing_data = df.ix[ix] @@ -308,14 +308,15 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', E_actual = testing_data[event_col].values X_testing = testing_data[testing_columns] - #fit the fitter to the training data + # fit the fitter to the training data fitter.fit(training_data, duration_col=duration_col, event_col=event_col) T_pred = getattr(fitter, predictor)(X_testing, **predictor_kwargs).values - scores[i-1] = concordance_index(T_actual, T_pred, E_actual) + scores[i - 1] = concordance_index(T_actual, T_pred, E_actual) return scores + def epanechnikov_kernel(t, T, bandwidth=1.): M = 0.75 * (1 - (t - T) / bandwidth) ** 2 M[abs((t - T)) >= bandwidth] = 0 From 1093f96f93e7eea1fbe99fdc6ace99c4f5486240 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 19 Jun 2014 07:43:48 -0400 Subject: [PATCH 5/5] no predict_mean --- lifelines/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lifelines/utils.py b/lifelines/utils.py index ea7fb1410..8c46a6a10 100644 --- a/lifelines/utils.py +++ b/lifelines/utils.py @@ -280,7 +280,7 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', loss_function: "concordance" only. "concordance": concordance index (C-index) between two series of event times predictor: a string that matches a prediction method on the fitter instances. For example, - "predict_mean" or "predict_percentile". Default is "predict_median" + "predict_expectation" or "predict_percentile". Default is "predict_median" predictor_kwargs: keyward args to pass into predictor. Returns: