From 15547cd19826b4487b6576d2cff6c12614ef1e27 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 10:29:37 -0400
Subject: [PATCH 1/4] some questionable improvements

---
 lifelines/fitters/cox_time_varying_fitter.py | 88 +++++++++++++++-----
 lifelines/fitters/coxph_fitter.py            |  2 +-
 lifelines/utils/__init__.py                  |  4 +-
 lifelines/version.py                         |  2 +-
 4 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 3c2c15d14..dfa09e913 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -2,6 +2,8 @@
 from __future__ import print_function
 from __future__ import division
 import warnings
+import time
+
 import numpy as np
 import pandas as pd
 from scipy import stats
@@ -58,6 +60,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
         self.confidence_intervals_ = self._compute_confidence_intervals()
         self._n_examples = df.shape[0]
+        self._n_unique = df.index.unique().shape[0]

         return self

@@ -110,7 +113,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df

-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
+    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-5):
         """
         Newton Rhaphson algorithm for fitting CPH model.
@@ -137,6 +140,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
         i = 0
         converging = True
         ll, previous_ll = 0, 0
+        start = time.time()

         step_sizer = StepSizer(step_size)
         step_size = step_sizer.next()
@@ -158,7 +162,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f" % (i, norm(delta), step_size, ll))
+                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:
@@ -204,16 +208,12 @@ def _get_gradients(self, df, stops_events, beta):
         _, d = df.shape
         hessian = np.zeros((d, d))
-        gradient = np.zeros((1, d))
+        gradient = np.zeros(d)
         log_lik = 0

         unique_death_times = np.unique(stops_events['stop'].loc[stops_events['event']])

         for t in unique_death_times:
-            x_death_sum = np.zeros((1, d))
-            tie_phi, risk_phi = 0, 0
-            risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
-            risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

             ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
             df_at_t = df.loc[ix]
@@ -224,9 +224,9 @@ def _get_gradients(self, df, stops_events, beta):
             phi_x_x_i = dot(df_at_t.T, phi_x_i)  # dot(df_at_t.T, phi_i * df_at_t)

             # Calculate sums of Risk set
-            risk_phi += phi_i.sum()
-            risk_phi_x += phi_x_i.sum(0).values.reshape((1, d))
-            risk_phi_x_x += phi_x_x_i
+            risk_phi = phi_i.sum()
+            risk_phi_x = phi_x_i.sum(0).values
+            risk_phi_x_x = phi_x_x_i

             # Calculate the sums of Tie set
             deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t)
@@ -234,15 +234,15 @@ def _get_gradients(self, df, stops_events, beta):
             death_counts = deaths.sum()

             xi_deaths = df_at_t.loc[deaths]

-            x_death_sum += xi_deaths.sum(0).values.reshape((1, d))
+            x_death_sum = xi_deaths.sum(0).values

             if death_counts > 1:
                 # it's faster if we can skip computing these when we don't need to.
-                tie_phi += phi_i[deaths.values].sum()
-                tie_phi_x += phi_x_i.loc[deaths].sum(0).values.reshape((1, d))
-                tie_phi_x_x += dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)
+                tie_phi = phi_i[deaths.values].sum()
+                tie_phi_x = phi_x_i.loc[deaths].sum(0).values
+                tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)

-            partial_gradient = np.zeros((1, d))
+            partial_gradient = np.zeros(d)

             for l in range(death_counts):
@@ -262,7 +262,7 @@ def _get_gradients(self, df, stops_events, beta):
                 partial_gradient += z / denom
                 # In case z and denom both are really small numbers,
                 # make sure to do division before multiplications
-                a2 = dot(z.T / denom, z / denom)
+                a2 = np.outer(z / denom, z / denom)

                 hessian -= (a1 - a2)

@@ -270,8 +270,49 @@ def _get_gradients(self, df, stops_events, beta):

             # Values outside tie sum
             gradient += x_death_sum - partial_gradient
-            log_lik += dot(x_death_sum, beta)[0][0]
-        return hessian, gradient, log_lik
+            log_lik += dot(x_death_sum, beta)[0]
+
+        return hessian, gradient.reshape(1, d), log_lik
+
+    def predict_log_partial_hazard(self, X):
+        """
+        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
+            can be in any order. If a numpy array, columns must be in the
+            same order as the training data.
+
+        This is equivalent to R's linear.predictors.
+        Returns the log of the partial hazard for the individuals, partial since the
+        baseline hazard is not included. Equal to \beta (X - \bar{X})
+
+        If X is a dataframe, the order of the columns does not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+        """
+        if isinstance(X, pd.DataFrame):
+            order = self.hazards_.columns
+            X = X[order]
+
+        index = _get_index(X)
+        X = normalize(X, self._norm_mean.values, 1)
+        return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
+
+
+    def predict_partial_hazard(self, X):
+        """
+        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
+            can be in any order. If a numpy array, columns must be in the
+            same order as the training data.
+
+
+        If X is a dataframe, the order of the columns does not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+
+        Returns the partial hazard for the individuals, partial since the
+        baseline hazard is not included.
+        Equal to \exp{\beta (X - \bar{X})}
+        """
+        return exp(self.predict_log_partial_hazard(X))
+

     def print_summary(self):
         """
@@ -283,7 +324,7 @@ def print_summary(self):
         df[''] = [significance_code(p) for p in df['p']]

         # Print information about data first
-        print('n={}, number of events={}'.format(self._n_examples,
+        print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
                                                  np.where(self.event_observed)[0].shape[0]),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
@@ -324,3 +365,12 @@ def plot(self, standardized=False, **kwargs):
         plt.yticks(yaxis_locations, tick_labels)
         plt.xlabel("standardized coef" if standardized else "coef")
         return ax
+
+    def __repr__(self):
+        classname = self.__class__.__name__
+        try:
+            s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
+                classname, self._n_examples, self._n_unique, self.event_observed.shape[0] - np.where(self.event_observed)[0].shape[0])
+        except AttributeError:
+            s = """<lifelines.%s>""" % classname
+        return s
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index c9979d591..e7e487c38 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -336,7 +336,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 continue

             # There was atleast one event and no more ties remain. Time to sum.
-            partial_gradient = np.zeros((1, d))
+            partial_gradient = np.zeros((tie_count, d))

             for l in range(tie_count):
                 c = l / tie_count
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 69035da1d..95ac64fdb 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1005,9 +1005,9 @@ def concordance_value(time_a, time_b, pred_a, pred_b):

 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
+    nonnumeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
     if len(nonnumeric_cols) > 0:
-        raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
+        raise TypeError("DataFrame contains nonnumeric columns: %s. Try converting boolean column(s) to integers, or using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)


 def check_for_overlapping_intervals(df):
diff --git a/lifelines/version.py b/lifelines/version.py
index 789b2f93c..c1fdf9229 100644
--- a/lifelines/version.py
+++ b/lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '0.14.1'
+__version__ = '0.14.2'

From 55140c2132e7e819f00cffec4f297a63fcc456db Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:01:36 -0400
Subject: [PATCH 2/4] simpler print messages

---
 lifelines/fitters/cox_time_varying_fitter.py | 4 ++--
 lifelines/fitters/coxph_fitter.py            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index dfa09e913..7bdded1ec 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -325,7 +325,7 @@ def print_summary(self):
         # Print information about data first
         print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
-                                                                    np.where(self.event_observed)[0].shape[0]),
+                                                                    self.event_observed.sum()),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
@@ -370,7 +370,7 @@ def __repr__(self):
         classname = self.__class__.__name__
         try:
             s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
-                classname, self._n_examples, self._n_unique, self.event_observed.shape[0] - np.where(self.event_observed)[0].shape[0])
+                classname, self._n_examples, self._n_unique, self.event_observed.sum())
         except AttributeError:
             s = """<lifelines.%s>""" % classname
         return s
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index e7e487c38..fc0899a4d 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -336,7 +336,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 continue

             # There was atleast one event and no more ties remain. Time to sum.
-            partial_gradient = np.zeros((tie_count, d))
+            partial_gradient = np.zeros((1, d))

             for l in range(tie_count):
                 c = l / tie_count
@@ -433,7 +433,7 @@ def print_summary(self):
         # Print information about data first
         print('n={}, number of events={}'.format(self._n_examples,
-                                                 np.where(self.event_observed)[0].shape[0]),
+                                                 self.event_observed.sum()),
               end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation

From dc3a57762eadde90e74a531aca5a69134d402f1b Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:12:44 -0400
Subject: [PATCH 3/4] keep current limits

---
 lifelines/fitters/cox_time_varying_fitter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 7bdded1ec..56cf15a1f 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -113,7 +113,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df

-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-5):
+    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
         """
         Newton Rhaphson algorithm for fitting CPH model.
@@ -162,7 +162,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.6f, step_size = %.6f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:

From 53d616e01b4a2798acacaccf7b0e74563e6b00bd Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Thu, 17 May 2018 11:52:33 -0400
Subject: [PATCH 4/4] adding docs and a bool test

---
 lifelines/fitters/cox_time_varying_fitter.py | 26 +++++++++++++++++++-
 lifelines/fitters/coxph_fitter.py            |  1 +
 lifelines/utils/__init__.py                  |  4 +--
 tests/test_estimation.py                     | 10 ++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 56cf15a1f..601c9572d 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -36,6 +36,29 @@ def __init__(self, alpha=0.95, penalizer=0.0):
         self.penalizer = penalizer

     def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
+        """
+        Fit the Cox Proportional Hazard model to a time varying dataset. Tied survival times
+        are handled using Efron's tie-method.
+
+        Parameters:
+          df: a Pandas dataframe in long (time varying) format, with the necessary
+             columns `id_col`, `event_col`, `start_col` and `stop_col`, plus
+             other covariates. `event_col` refers to whether
+             the 'death' event was observed: 1 if observed, 0 else (censored).
+          id_col: A subject could have multiple rows in the dataframe. This column contains
+             the unique identifier per subject.
+          event_col: the column in dataframe that contains the subjects' death
+             observation. If left as None, assume all individuals are non-censored.
+          start_col: the column that contains the start of a subject's time period.
+          stop_col: the column that contains the end of a subject's time period.
+          show_progress: since the fitter is iterative, show convergence
+             diagnostics.
+          step_size: set an initial step size for the fitting algorithm.
+
+        Returns:
+            self, with additional properties: hazards_
+
+        """
         df = df.copy()

         if not (id_col in df and event_col in df and start_col in df and stop_col in df):
@@ -50,6 +73,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr

         stop_times_events = df[["event", "stop", "start"]]
         df = df.drop(["event", "stop", "start"], axis=1)
+        df = df.astype(float)

         self._norm_mean = df.mean(0)
         self._norm_std = df.std(0)
@@ -162,7 +186,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             hessian, gradient = h, g

             if show_progress:
-                print("Iteration %d: norm_delta = %.6f, step_size = %.6f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

             # convergence criteria
             if norm(delta) < precision:
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index fc0899a4d..3bc3cb8b4 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -83,6 +83,7 @@ def fit(self, df, duration_col, event_col=None,
              catagorical covariate does not obey the proportional hazard assumption. This
              is used similar to the `strata` expression in R.
              See http://courses.washington.edu/b515/l17.pdf.
+          step_size: set an initial step size for the fitting algorithm.

         Returns:
             self, with additional properties: hazards_
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 95ac64fdb..29bf9e8d5 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1005,9 +1005,9 @@ def concordance_value(time_a, time_b, pred_a, pred_b):

 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
+    nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
     if len(nonnumeric_cols) > 0:
-        raise TypeError("DataFrame contains nonnumeric columns: %s. Try converting boolean column(s) to integers, or using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
+        raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)


 def check_for_overlapping_intervals(df):
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 7f738b2d2..85e748fa5 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1482,6 +1482,16 @@ def test_fitter_will_raise_an_error_if_overlapping_intervals(self, ctv):
         with pytest.raises(ValueError):
             ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")

+    def test_fitter_accept_boolean_columns(self, ctv):
+        df = pd.DataFrame.from_records([
+            {'id': 1, 'start': 0, 'stop': 5, 'var': -1.2, 'bool': True, 'event': 1},
+            {'id': 2, 'start': 0, 'stop': 5, 'var': 1.3, 'bool': False, 'event': 1},
+            {'id': 3, 'start': 0, 'stop': 5, 'var': -1.3, 'bool': False, 'event': 1},
+        ])
+
+        ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")
+
+
     def test_warning_is_raised_if_df_has_a_near_constant_column(self, ctv, dfcv):
         dfcv['constant'] = 1.0
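
For reviewers who want to try this series out, below is a minimal usage sketch of the CoxTimeVaryingFitter API the patches touch (fit on a long-format dataframe with a boolean covariate, the iteration diagnostics, print_summary's periods/uniques/events report, and the new predict_partial_hazard method). The toy dataframe, its column names ('var', 'on_drug'), and the values are illustrative only and are not taken from the patches; it assumes a lifelines checkout with this series applied (~0.14.2).

    # a sketch, not part of the patch series
    import pandas as pd
    from lifelines import CoxTimeVaryingFitter

    # toy long-format data: one row per (subject, interval); boolean covariates are allowed
    df = pd.DataFrame.from_records([
        {'id': 1, 'start': 0, 'stop': 4, 'var': -1.2, 'on_drug': False, 'event': 0},
        {'id': 1, 'start': 4, 'stop': 8, 'var': -1.2, 'on_drug': True,  'event': 1},
        {'id': 2, 'start': 0, 'stop': 5, 'var': 1.3,  'on_drug': False, 'event': 1},
        {'id': 3, 'start': 0, 'stop': 9, 'var': -0.5, 'on_drug': True,  'event': 0},
        {'id': 4, 'start': 0, 'stop': 3, 'var': 0.8,  'on_drug': True,  'event': 1},
    ])

    ctv = CoxTimeVaryingFitter()
    # show_progress=True prints the per-iteration norm_delta / step_size / ll lines tweaked above
    ctv.fit(df, id_col='id', event_col='event', start_col='start', stop_col='stop', show_progress=True)
    ctv.print_summary()  # reports periods, unique subjects, and number of events

    # partial hazards for the covariate columns only (id/start/stop/event are not covariates)
    covariates = df.drop(['id', 'start', 'stop', 'event'], axis=1)
    print(ctv.predict_partial_hazard(covariates))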