V0.16.2 (#594)
* fixes and perf improvements for 0.16.2

* bump version

* creating test for single vs batch, and adding perf script

* remove pdb

* fix test
CamDavidsonPilon authored Jan 2, 2019
1 parent ca9673e commit 0063469
Showing 9 changed files with 244 additions and 21 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
### Changelogs

#### 0.16.2
- Fixed `CoxTimeVaryingFitter` to allow more than one variable to be stratified
- Significant performance improvements for `CoxPHFitter` when the dataset has many duplicate (tied) times. See https://github.com/CamDavidsonPilon/lifelines/issues/591
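
A minimal usage sketch of the new option (the dataset and column names are illustrative, borrowed from the perf scripts in this commit; `batch_mode` is the keyword added to `CoxPHFitter.fit` below):

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    df = load_rossi()
    cph = CoxPHFitter()
    # batch_mode=True forces the new batched Efron routine; left as None (the default),
    # lifelines chooses between the batched and single-pass routines via a runtime heuristic.
    cph.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
    cph.print_summary()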

#### 0.16.1
- Fixed py2 division error in `concordance` method.

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
#
# The short X.Y version.

version = "0.16.1"
version = "0.16.2"
# The full version, including dev info
release = version

12 changes: 6 additions & 6 deletions lifelines/fitters/cox_time_varying_fitter.py
@@ -198,7 +198,7 @@ def _check_values(df, stop_times_events):

def _partition_by_strata(self, X, stop_times_events, weights):
for stratum, stratified_X in X.groupby(self.strata):
stratified_stop_times_events, stratified_W = (stop_times_events.loc[[stratum]], weights.loc[[stratum]])
stratified_stop_times_events, stratified_W = (stop_times_events.loc[stratum], weights.loc[stratum])
yield (stratified_X, stratified_stop_times_events, stratified_W), stratum

def _partition_by_strata_and_apply(self, X, stop_times_events, weights, function, *args):
@@ -385,8 +385,7 @@ def _get_gradients(self, df, stops_events, weights, beta): # pylint: disable=to
gradient: (1, d) numpy array
log_likelihood: float
"""
# import pdb
# pdb.set_trace()

_, d = df.shape
hessian = np.zeros((d, d))
gradient = np.zeros(d)
@@ -454,16 +453,17 @@ def _get_gradients(self, df, stops_events, weights, beta): # pylint: disable=to
a1 = risk_phi_x_x / denom

# Gradient
partial_gradient += weighted_average * numer / denom
partial_gradient += numer / denom
# In case numer and denom both are really small numbers,
# make sure to do division before multiplications
a2 = np.outer(numer / denom, numer / denom)
t = numer[:, None] / denom
a2 = t.dot(t.T)

hessian -= weighted_average * (a1 - a2)
log_lik -= weighted_average * np.log(denom)

# Values outside tie sum
gradient += x_death_sum - partial_gradient
gradient += x_death_sum - weighted_average * partial_gradient
log_lik += dot(x_death_sum, beta)[0]

return hessian, gradient.reshape(1, d), log_lik
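
Note on the gradient change above: `weighted_average` is constant across the inner tie loop, so its multiplication can be factored out of the accumulated sum,

    gradient_t = x_death_sum - weighted_average * sum_l(numer_l / denom_l)

which is algebraically identical to the previous form but saves one multiplication per pass through the tie loop, and matches how the Hessian and log-likelihood terms are already accumulated.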
113 changes: 110 additions & 3 deletions lifelines/fitters/coxph_fitter.py
@@ -107,6 +107,7 @@ def fit(
weights_col=None,
cluster_col=None,
robust=False,
batch_mode=None,
):
"""
Fit the Cox Proportional Hazard model to a dataset.
@@ -162,6 +163,9 @@
specifies what column has unique identifiers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
be used.
batch_mode: bool, optional
enabling batch_mode can be faster for datasets with a large number of ties. If left as None, lifelines will choose the best option.
Returns
-------
self: CoxPHFitter
@@ -214,6 +218,7 @@ def fit(
self.cluster_col = cluster_col
self.weights_col = weights_col
self._n_examples = df.shape[0]
self._batch_mode = batch_mode
self.strata = coalesce(strata, self.strata)

X, T, E, weights, original_index, self._clusters = self._preprocess_dataframe(df)
@@ -357,7 +362,7 @@ def _newton_rhaphson(
"""
self.path = []
assert precision <= 1.0, "precision must be less than or equal to 1."
_, d = X.shape
n, d = X.shape

# make sure betas are correct size.
if initial_beta is not None:
@@ -371,7 +376,12 @@

# Method of choice is just efron right now
if self.tie_method == "Efron":
get_gradients = self._get_efron_values
# https://github.com/CamDavidsonPilon/lifelines/issues/591
frac_dups = T.unique().shape[0] / n
if self._batch_mode or (0.4690 + 3.045e-05 * n + 2.374137 * frac_dups + 0.000711 * n * frac_dups < 1):
get_gradients = self._get_efron_values_batch
else:
get_gradients = self._get_efron_values_single
else:
raise NotImplementedError("Only Efron is available.")
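
As a rough standalone sketch of the runtime heuristic above (the coefficients are the ones hard-coded in the diff; the helper name is hypothetical and not part of the lifelines API):

    def prefer_batch(n_rows, n_unique_durations):
        # fraction of distinct durations, named frac_dups in the diff; smaller means more ties
        frac_dups = n_unique_durations / n_rows
        # predicted (batch runtime / single runtime); the batched routine wins when this is below 1
        predicted_ratio = 0.4690 + 3.045e-05 * n_rows + 2.374137 * frac_dups + 0.000711 * n_rows * frac_dups
        return predicted_ratio < 1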

@@ -477,7 +487,7 @@ def _newton_rhaphson(

return beta

def _get_efron_values(self, X, T, E, weights, beta):
def _get_efron_values_single(self, X, T, E, weights, beta):
"""
Calculates the first and second order vector differentials, with respect to beta.
Note that X, T, E are assumed to be sorted on T!
@@ -608,6 +618,103 @@ def _get_efron_values(self, X, T, E, weights, beta):
tie_phi_x_x = np.zeros((d, d))
return hessian, gradient, log_lik

def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=too-many-locals
"""
Calculates the first and second order vector differentials, with respect to beta.
Returns
-------
hessian: (d, d) numpy array,
gradient: (1, d) numpy array
log_likelihood: float
"""

_, d = X.shape
hessian = np.zeros((d, d))
gradient = np.zeros(d)
log_lik = 0

unique_death_times = np.unique(T[E])

for t in unique_death_times:

ix = T >= t # everyone in the risk set

X_at_t = X[ix]
weights_at_t = weights[ix]
events_at_t = E[ix]

phi_i = weights_at_t[:, None] * exp(dot(X_at_t, beta))
phi_x_i = phi_i * X_at_t
phi_x_x_i = dot(X_at_t.T, phi_x_i)

# Calculate sums of Risk set
risk_phi = phi_i.sum()
risk_phi_x = phi_x_i.sum(0)
risk_phi_x_x = phi_x_x_i

# Calculate the sums of Tie set
deaths = (T[ix] == t) & events_at_t

ties_counts = deaths.sum() # should always be 1 or more
assert ties_counts >= 1

xi_deaths = X_at_t[deaths]
weights_deaths = weights_at_t[deaths]

x_death_sum = (weights_deaths[:, None] * xi_deaths).sum(0)

if ties_counts > 1:
# it's faster if we can skip computing these when we don't need to.
tie_phi = phi_i[deaths].sum()
tie_phi_x = phi_x_i[deaths].sum(0)
tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths] * xi_deaths)

partial_gradient = np.zeros(d)
partial_ll = 0
partial_hessian = np.zeros((d, d))

weight_count = weights_deaths.sum()
weighted_average = weight_count / ties_counts

for l in range(ties_counts):

if ties_counts > 1:

# A good explanation of how Efron handles ties: consider three of five subjects who fail at the same time.
# Since it is not known a priori which of the three fails first, one-third of
# (φ1 + φ2 + φ3) is subtracted from sum_j^{5} φj after the first failure. Similarly, two-thirds
# of (φ1 + φ2 + φ3) is subtracted after the first two failures, etc.
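# Hypothetical numeric illustration (added here for clarity, not part of the committed source): with
# tied-death φ values (2, 3, 5), so φ1 + φ2 + φ3 = 10, and a full risk-set sum of 20, the
# denominators over the l = 0, 1, 2 passes are 20, 20 - (1/3)*10 ≈ 16.67, and 20 - (2/3)*10 ≈ 13.33.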

increasing_proportion = l / ties_counts
denom = risk_phi - increasing_proportion * tie_phi
numer = risk_phi_x - increasing_proportion * tie_phi_x
# Hessian
a1 = (risk_phi_x_x - increasing_proportion * tie_phi_x_x) / denom
else:
denom = risk_phi
numer = risk_phi_x
# Hessian
a1 = risk_phi_x_x / denom

# Gradient
partial_gradient += numer / denom
# In case numer and denom both are really small numbers,
# make sure to do division before multiplications
t = numer[:, None] / denom
# this is faster than an outer product
a2 = t.dot(t.T)

partial_hessian -= a1 - a2
partial_ll -= np.log(denom)

# Values outside tie sum
gradient += x_death_sum - weighted_average * partial_gradient
log_lik += dot(x_death_sum, beta)[0] + weighted_average * partial_ll
hessian += weighted_average * partial_hessian

return hessian, gradient.reshape(1, d), log_lik

def _partition_by_strata(self, X, T, E, weights, as_dataframes=False):
for stratum, stratified_X in X.groupby(self.strata):
stratified_E, stratified_T, stratified_W = (E.loc[[stratum]], T.loc[[stratum]], weights.loc[[stratum]])
2 changes: 1 addition & 1 deletion lifelines/version.py
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

__version__ = "0.16.1"
__version__ = "0.16.2"
59 changes: 59 additions & 0 deletions perf_tests/batch_vs_single.py
@@ -0,0 +1,59 @@
from time import time
import pandas as pd
import numpy as np
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter
import statsmodels.api as sm

# This compares the batch algorithm (in CTV) vs the single-iteration algorithm (original in CPH)
# over N and the fraction of unique durations (unique(T) / N); a lower fraction means more ties.


ROSSI_ROWS = 432
results = {}


for n_copies in [1, 2, 4, 6, 8, 10, 12, 14]:

# lower fractions mean more ties.
# the original rossi dataset has 0.113
for fraction in np.linspace(0.01, 0.30, 10):
print(n_copies, fraction)

df = pd.concat([load_rossi()] * n_copies)
n_unique_durations = int(df.shape[0] * fraction) + 1
unique_durations = np.round(np.random.exponential(10, size=n_unique_durations), 5)

df["week"] = np.tile(unique_durations, int(np.ceil(1 / fraction)))[: df.shape[0]]

cph_batch = CoxPHFitter()
start_time = time()
cph_batch.fit(df, "week", "arrest", batch_mode=True)
batch_time = time() - start_time

cph_single = CoxPHFitter()
start_time = time()
cph_single.fit(df, "week", "arrest", batch_mode=False)
single_time = time() - start_time

print({"batch": batch_time, "single": single_time})
results[(n_copies * ROSSI_ROWS, fraction)] = {"batch": batch_time, "single": single_time}

results = pd.DataFrame(results).T.sort_index()
results["ratio"] = results["batch"] / results["single"]
print(results)

results = results.reset_index()
results = results.rename(columns={"level_0": "N", "level_1": "frac"})


results["N * frac"] = results["N"] * results["frac"]

X = results[["N", "frac", "N * frac"]]
X = sm.add_constant(X)

Y = results["ratio"]


model = sm.OLS(Y, X).fit()
model.summary()
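
The coefficients reported by `model.summary()` are presumably the source of the constants hard-coded in `_newton_rhaphson` above, i.e. predicted ratio ≈ 0.4690 + 3.045e-05·N + 2.374137·frac + 0.000711·N·frac, with the batched routine chosen when the predicted batch/single runtime ratio is below 1.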
7 changes: 5 additions & 2 deletions perf_tests/cp_perf_test.py
@@ -5,13 +5,16 @@
if __name__ == "__main__":
import pandas as pd
import time
import numpy as np

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

df = load_rossi()
df = pd.concat([df] * 20)
df = pd.concat([df] * 40)
# df['week'] = np.random.exponential(1, size=df.shape[0])
cp = CoxPHFitter()
start_time = time.time()
cp.fit(df, duration_col="week", event_col="arrest")
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=False)
print("--- %s seconds ---" % (time.time() - start_time))
cp.print_summary()
11 changes: 7 additions & 4 deletions perf_tests/ctv_perf_test.py
@@ -3,13 +3,16 @@
import time
import pandas as pd
from lifelines import CoxTimeVaryingFitter
from lifelines.datasets import load_stanford_heart_transplants
from lifelines.datasets import load_rossi
from lifelines.utils import to_long_format

dfcv = load_stanford_heart_transplants()
dfcv = pd.concat([dfcv] * 50)
df = load_rossi()
df = pd.concat([df] * 20)
df = df.reset_index()
df = to_long_format(df, duration_col="week")
ctv = CoxTimeVaryingFitter()
start_time = time.time()
ctv.fit(dfcv, id_col="id", event_col="event", start_col="start", stop_col="stop")
ctv.fit(df, id_col="index", event_col="arrest", start_col="start", stop_col="stop", strata=["wexp", "prio"])
time_took = time.time() - start_time
print("--- %s seconds ---" % time_took)
ctv.print_summary()