From 423e1d075b0ecf32b96966b2f9c2758272229849 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 22:04:23 -0400 Subject: [PATCH 1/5] removing unecessary indexing --- lifelines/estimation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lifelines/estimation.py b/lifelines/estimation.py index bc6fbce90..d6d2c9f60 100644 --- a/lifelines/estimation.py +++ b/lifelines/estimation.py @@ -493,12 +493,13 @@ def _fit_static(self, dataframe, duration_col="T", event_col="E", C = pd.Series(df[event_col].values, dtype=bool, index=ids) T = pd.Series(df[duration_col].values, index=ids) - df = df.set_index([duration_col, id_col]) + df = df.set_index(id_col) ix = T.argsort() T, C = T.iloc[ix], C.iloc[ix] del df[event_col] + del df[duration_col] n, d = df.shape columns = df.columns From c09528a0cd63dd49e320f2478e82e1b8f38d2de8 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 23:05:41 -0400 Subject: [PATCH 2/5] 0.4.2: better CoxPHFitter, new prediction methods, new lifelines.datasets methods --- CHANGELOG.md | 11 ++++++++-- docs/Intro to lifelines.rst | 3 ++- docs/Quickstart.rst | 10 +++++---- docs/survival_regression.rst | 6 +++-- lifelines/datasets.py | 17 +++++++-------- lifelines/estimation.py | 24 ++++++++++++++++---- lifelines/tests/test_suite.py | 41 ++++++++++++++++++++++++----------- lifelines/utils.py | 9 ++++++-- setup.py | 2 +- 9 files changed, 85 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 348516bc2..2e0ec8492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,15 @@ ### Changelogs + + ####0.4.2 - - - Add Breslows method to CoxPHFitter. + + - Massive speed improvements to CoxPHFitter. + - Additional prediction method: `predict_percentile` is available on CoxPHFitter and AalenAdditiveFitter. Given a percentile, p, this function returns the value t such that *S(t | x) = p*. It is a generalization of `predict_median`. + - Additional kwargs in `k_fold_cross_validation` that will accept different prediction methods (default is `predict_median`). +- Bug fix in CoxPHFitter `predict_expectation` function. +- Correct spelling mistake in newton-rhapson algorithm. +- `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. ####0.4.1.1 diff --git a/docs/Intro to lifelines.rst b/docs/Intro to lifelines.rst index 4bf546d8c..5e51f99ce 100644 --- a/docs/Intro to lifelines.rst +++ b/docs/Intro to lifelines.rst @@ -587,7 +587,8 @@ instruments could only detect the measurement was *less* than some upperbound. .. code:: python - from lifelines.datasets import lcd_dataset + from lifelines.datasets import generate_lcd_dataset + lcd_dataset = generate_lcd_dataset() T = lcd_dataset['alluvial_fan']['T'] C = lcd_dataset['alluvial_fan']['C'] #boolean array, True if observed. diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst index 3434eec26..430c2b6d2 100644 --- a/docs/Quickstart.rst +++ b/docs/Quickstart.rst @@ -23,10 +23,11 @@ Let's start by importing some data. We need the durations that individuals are o .. code:: python - from lifelines.datasets import waltons_dataset + from lifelines.datasets import generate_waltons_dataset + data = generate_waltons_dataset() - T = waltons_dataset['T'] - E = waltons_dataset['E'] + T = data['T'] + E = data['E'] ``T`` is an array of durations, ``E`` is a either boolean or binary array representing whether the "death" was observed (alternatively an individual can be censored). 
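The `predict_percentile` changelog entry above is easiest to see in code. A minimal sketch, assuming the `CoxPHFitter` API exercised by the test suite later in this series — the covariate-selection idiom and the `p` keyword are taken from the `utils.py` and `test_suite.py` hunks below, not from this hunk:

.. code:: python

    from lifelines.estimation import CoxPHFitter
    from lifelines.datasets import generate_regression_dataset

    df = generate_regression_dataset()

    cf = CoxPHFitter()
    cf.fit(df, duration_col='T', event_col='E')

    # covariates only, mirroring the column selection used in k_fold_cross_validation
    X = df[df.columns - ['T', 'E']]

    cf.predict_median(X)             # t such that S(t | x) = 0.5
    cf.predict_percentile(X, p=0.6)  # the generalization: t such that S(t | x) = 0.6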
@@ -85,7 +86,8 @@ While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they .. code:: python - from lifelines.datasets import regression_dataset + from lifelines.datasets import generate_regression_dataset + regression_dataset = generate_regression_dataset() regression_dataset.head() diff --git a/docs/survival_regression.rst b/docs/survival_regression.rst index 788f0d643..3847298e6 100644 --- a/docs/survival_regression.rst +++ b/docs/survival_regression.rst @@ -395,9 +395,10 @@ This example data is from the paper `here Date: Wed, 18 Jun 2014 23:07:59 -0400 Subject: [PATCH 3/5] changelong --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e0ec8492..a327bdd4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ - Bug fix in CoxPHFitter `predict_expectation` function. - Correct spelling mistake in newton-rhapson algorithm. - `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. - +- Bumping up the number of samples in statistical tests to prevent them from failing so often (this a stop-gap) ####0.4.1.1 From 67634ab43b1fa50fdd31a836495c12f03f982d3d Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 18 Jun 2014 23:13:37 -0400 Subject: [PATCH 4/5] pep8 everything --- CHANGELOG.md | 1 + lifelines/datasets.py | 13 +++++-------- lifelines/estimation.py | 15 ++++++++------- lifelines/statistics.py | 12 ++++++------ lifelines/tests/test_suite.py | 25 ++++++++++++------------- lifelines/utils.py | 27 ++++++++++++++------------- 6 files changed, 46 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a327bdd4e..1316c69b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Correct spelling mistake in newton-rhapson algorithm. - `datasets` now contains functions for generating the respective datasets, ex: `generate_waltons_dataset`. 
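As the `datasets` entry above notes, each bundled dataset is now produced by a generator function rather than imported as a module-level constant. A short sketch of the new calling convention — the column and key names are taken from the doc and test changes elsewhere in this series, not guaranteed by this hunk:

.. code:: python

    from lifelines.datasets import generate_waltons_dataset, generate_lcd_dataset

    waltons = generate_waltons_dataset()   # DataFrame with 'T', 'E', 'group' columns
    mir137_T = waltons[waltons['group'] == 'miR-137']['T']

    lcd = generate_lcd_dataset()           # keyed by site, e.g. 'alluvial_fan'
    T, C = lcd['alluvial_fan']['T'], lcd['alluvial_fan']['C']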
- Bumping up the number of samples in statistical tests to prevent them from failing so often (this a stop-gap) +- pep8 everything ####0.4.1.1 diff --git a/lifelines/datasets.py b/lifelines/datasets.py index 902c248c1..7454c587b 100644 --- a/lifelines/datasets.py +++ b/lifelines/datasets.py @@ -3,11 +3,11 @@ import numpy as np from io import StringIO -__all__ = [ 'generate_waltons_dataset', - 'generate_regression_dataset', - 'generate_dd_dataset', - 'generate_lcd_dataset', - 'generate_rossi_dataset'] +__all__ = ['generate_waltons_dataset', + 'generate_regression_dataset', + 'generate_dd_dataset', + 'generate_lcd_dataset', + 'generate_rossi_dataset'] def generate_lcd_dataset(): @@ -2548,6 +2548,3 @@ def generate_regression_dataset(): 0.031865 1.753759 0.252040 8.519852 1 1.631269 1.588621 3.709899 4.480448 1 """), header=0, delim_whitespace=True) - - - diff --git a/lifelines/estimation.py b/lifelines/estimation.py index d9f405a9b..2cc11e7a2 100644 --- a/lifelines/estimation.py +++ b/lifelines/estimation.py @@ -139,7 +139,7 @@ def smoothed_hazard_(self, bandwidth): hazard_name = "smoothed-" + cumulative_hazard_name hazard_ = self.cumulative_hazard_.diff().fillna(self.cumulative_hazard_.iloc[0]) C = (hazard_[cumulative_hazard_name] != 0.0).values - return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth), hazard_.values[C,:]), + return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth), hazard_.values[C,:]), columns=[hazard_name], index=timeline) def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None): @@ -156,7 +156,7 @@ def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None): self._cumulative_sq.iloc[0] = 0 var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0]) C = (var_hazard_.values != 0.0) # only consider the points with jumps - std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth)**2, var_hazard_.values[C])) + std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth)**2, var_hazard_.values[C])) values = { self.ci_labels[0]: hazard_ * np.exp(alpha2 * std_hazard_ / hazard_), self.ci_labels[1]: hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_) @@ -747,6 +747,7 @@ def predict_expectation(self, X): class CoxPHFitter(BaseFitter): + """ This class implements fitting Cox's proportional hazard model: @@ -802,7 +803,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False): # Iterate backwards to utilize recursive relationship for i, (ti, ei) in reversed(list(enumerate(zip(T, E)))): # Doing it like this to preserve shape - xi = X[i:i+1] + xi = X[i:i + 1] # Calculate phi values phi_i = exp(dot(xi, beta)) phi_x_i = dot(phi_i, xi) @@ -823,7 +824,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False): # Keep track of count tie_count += 1 - if i > 0 and T[i-1] == ti: + if i > 0 and T[i - 1] == ti: # There are more ties/members of the risk set continue elif tie_count == 0: @@ -999,7 +1000,7 @@ def _compute_confidence_intervals(self): def _compute_standard_errors(self): se = np.sqrt(inv(-self._hessian_).diagonal()) - return pd.DataFrame(se[None, :], + return pd.DataFrame(se[None,:], index=['se'], columns=self.hazards_.columns) def _compute_z_values(self): @@ -1207,10 +1208,10 @@ def qth_survival_times(q, survival_functions): sv_b = (1.0 * (survival_functions < 
q)).cumsum() > 0 try: v = sv_b.idxmax(0) - v[sv_b.iloc[-1, :] == 0] = np.inf + v[sv_b.iloc[-1,:] == 0] = np.inf except: v = sv_b.argmax(0) - v[sv_b[-1, :] == 0] = np.inf + v[sv_b[-1,:] == 0] = np.inf return v diff --git a/lifelines/statistics.py b/lifelines/statistics.py index d8f0d28f7..898fda132 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -53,8 +53,8 @@ def concordance_index(event_times, predicted_event_times, event_observed=None): event_observed) -def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None, - alpha=0.95, t_0=-1, suppress_print=False, **kwargs): +def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None, + alpha=0.95, t_0=-1, suppress_print=False, **kwargs): """ Measures and reports on whether two intensity processes are different. That is, given two event series, determines whether the data generating processes are statistically different. @@ -91,8 +91,8 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse event_times = np.r_[event_times_A, event_times_B] groups = np.r_[np.zeros(event_times_A.shape[0]), np.ones(event_times_B.shape[0])] event_observed = np.r_[event_observed_A, event_observed_B] - return multivariate_logrank_test(event_times, groups, event_observed, - alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs) + return multivariate_logrank_test(event_times, groups, event_observed, + alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs) def pairwise_logrank_test(event_durations, groups, event_observed=None, @@ -175,7 +175,7 @@ def pairwise_logrank_test(event_durations, groups, event_observed=None, return [pd.DataFrame(x, columns=unique_groups, index=unique_groups) for x in [S, P, T]] -def multivariate_logrank_test(event_durations, groups, event_observed=None, +def multivariate_logrank_test(event_durations, groups, event_observed=None, alpha=0.95, t_0=-1, suppress_print=False, **kwargs): """ This test is a generalization of the logrank_test: it can deal with n>2 populations (and should @@ -238,7 +238,7 @@ def multivariate_logrank_test(event_durations, groups, event_observed=None, df=n_groups - 1, **kwargs) if not suppress_print: - print(summary) + print(summary) return summary, p_value, test_result diff --git a/lifelines/tests/test_suite.py b/lifelines/tests/test_suite.py index d675f417b..c3bac402f 100644 --- a/lifelines/tests/test_suite.py +++ b/lifelines/tests/test_suite.py @@ -28,7 +28,7 @@ from ..plotting import plot_lifetimes from ..utils import * from ..datasets import generate_lcd_dataset, generate_rossi_dataset, \ - generate_waltons_dataset, generate_regression_dataset + generate_waltons_dataset, generate_regression_dataset class MiscTests(unittest.TestCase): @@ -98,15 +98,15 @@ def test_cross_validator(self): def test_cross_validator_with_predictor(self): cf = CoxPHFitter() - k_fold_cross_validation(cf, generate_regression_dataset(), + k_fold_cross_validation(cf, generate_regression_dataset(), duration_col='T', event_col='E', k=3, predictor="predict_expectation") def test_cross_validator_with_predictor_and_kwargs(self): cf = CoxPHFitter() - k_fold_cross_validation(cf, generate_regression_dataset(), + k_fold_cross_validation(cf, generate_regression_dataset(), duration_col='T', event_col='E', k=3, - predictor="predict_percentile", predictor_kwargs={'p':0.6}) + predictor="predict_percentile", predictor_kwargs={'p': 0.6}) class StatisticalTests(unittest.TestCase): @@ -241,18 +241,17 @@ def 
test_pairwise_logrank_test(self): npt.assert_array_equal(R, V) def test_multivariate_inputs(self): - T = np.array([1,2,3]) - E = np.array([1,1,0], dtype=bool) - G = np.array([1,2,1]) - multivariate_logrank_test(T,G,E) - pairwise_logrank_test(T,G,E) + T = np.array([1, 2, 3]) + E = np.array([1, 1, 0], dtype=bool) + G = np.array([1, 2, 1]) + multivariate_logrank_test(T, G, E) + pairwise_logrank_test(T, G, E) T = pd.Series(T) E = pd.Series(E) G = pd.Series(G) - multivariate_logrank_test(T,G,E) - pairwise_logrank_test(T,G,E) - + multivariate_logrank_test(T, G, E) + pairwise_logrank_test(T, G, E) def test_lists_to_KaplanMeierFitter(self): T = [2, 3, 4., 1., 6, 5.] @@ -798,7 +797,7 @@ def test_output_against_R(self): OBSERVED = np.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0]) N = len(LIFETIMES) -#walton's data +# walton's data waltons_dataset = generate_waltons_dataset() ix = waltons_dataset['group'] == 'miR-137' waltonT1 = waltons_dataset.ix[ix]['T'] diff --git a/lifelines/utils.py b/lifelines/utils.py index 0bc92dab5..ea7fb1410 100644 --- a/lifelines/utils.py +++ b/lifelines/utils.py @@ -88,7 +88,7 @@ def group_survival_table_from_events(groups, durations, event_observed, min_obse g_name = str(g) data = data.join(survival_table_from_events(T, C, B, columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name]), - how='outer') + how='outer') data = data.fillna(0) # hmmm pandas its too bad I can't do data.ix[:limit] and leave out the if. if int(limit) != -1: @@ -265,10 +265,10 @@ def AandS_approximation(p): def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', - k=5, loss_function="concordance", predictor="predict_median", - predictor_kwargs={}): + k=5, loss_function="concordance", predictor="predict_median", + predictor_kwargs={}): """ - Perform cross validation on a dataset. + Perform cross validation on a dataset. fitter: either an instance of AalenAdditiveFitter or CoxPHFitter. df: a Pandas dataframe with necessary columns `duration_col` and `event_col`, plus @@ -279,28 +279,28 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', k: the number of folds to perform. n/k data will be withheld for testing on. loss_function: "concordance" only. "concordance": concordance index (C-index) between two series of event times - predictor: a string that matches a prediction method on the fitter instances. For example, + predictor: a string that matches a prediction method on the fitter instances. For example, "predict_mean" or "predict_percentile". Default is "predict_median" predictor_kwargs: keyward args to pass into predictor. Returns: - (k,1) array of scores for each fold. + (k,1) array of scores for each fold. 
""" from .statistics import concordance_index - n,d = df.shape + n, d = df.shape scores = np.zeros((k,)) df = df.copy() - df = df.reindex(np.random.permutation(df.index)).sort(event_col) + df = df.reindex(np.random.permutation(df.index)).sort(event_col) - assignments = np.array((n//k + 1)*list(range(1,k+1))) + assignments = np.array((n // k + 1) * list(range(1, k + 1))) assignments = assignments[:n] testing_columns = df.columns - [duration_col, event_col] - for i in range(1,k+1): + for i in range(1, k + 1): - ix = assignments == i + ix = assignments == i training_data = df.ix[~ix] testing_data = df.ix[ix] @@ -308,14 +308,15 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', E_actual = testing_data[event_col].values X_testing = testing_data[testing_columns] - #fit the fitter to the training data + # fit the fitter to the training data fitter.fit(training_data, duration_col=duration_col, event_col=event_col) T_pred = getattr(fitter, predictor)(X_testing, **predictor_kwargs).values - scores[i-1] = concordance_index(T_actual, T_pred, E_actual) + scores[i - 1] = concordance_index(T_actual, T_pred, E_actual) return scores + def epanechnikov_kernel(t, T, bandwidth=1.): M = 0.75 * (1 - (t - T) / bandwidth) ** 2 M[abs((t - T)) >= bandwidth] = 0 From 1093f96f93e7eea1fbe99fdc6ace99c4f5486240 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 19 Jun 2014 07:43:48 -0400 Subject: [PATCH 5/5] no predict_mean --- lifelines/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lifelines/utils.py b/lifelines/utils.py index ea7fb1410..8c46a6a10 100644 --- a/lifelines/utils.py +++ b/lifelines/utils.py @@ -280,7 +280,7 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E', loss_function: "concordance" only. "concordance": concordance index (C-index) between two series of event times predictor: a string that matches a prediction method on the fitter instances. For example, - "predict_mean" or "predict_percentile". Default is "predict_median" + "predict_expectation" or "predict_percentile". Default is "predict_median" predictor_kwargs: keyward args to pass into predictor. Returns: