V0.16.2 (#594)
* fixes and perf improvements for 0.16.2

* bump version

* creating test for single vs batch, and adding perf script

* remove pdb

* fix test
CamDavidsonPilon authored Jan 2, 2019
1 parent ca9673e commit 0063469
Showing 9 changed files with 244 additions and 21 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
### Changelogs

#### 0.16.2
- Fixed `CoxTimeVaryingFitter` to allow more than one variable to be stratified
- Significant performance improvements for `CoxPHFitter` when the dataset has many duplicate (tied) times. See https://github.com/CamDavidsonPilon/lifelines/issues/591
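
A minimal usage sketch of the new option (the dataset and column names are illustrative, borrowed from the perf scripts in this commit; `batch_mode` is the keyword added to `CoxPHFitter.fit` below):

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    df = load_rossi()
    cph = CoxPHFitter()
    # batch_mode=True forces the new batched Efron routine; left as None (the default),
    # lifelines chooses between the batched and single-pass routines via a runtime heuristic.
    cph.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
    cph.print_summary()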

#### 0.16.1
- Fixed py2 division error in `concordance` method.

2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
#
# The short X.Y version.

version = "0.16.1"
version = "0.16.2"
# The full version, including dev info
release = version

12 changes: 6 additions & 6 deletions lifelines/fitters/cox_time_varying_fitter.py
@@ -198,7 +198,7 @@ def _check_values(df, stop_times_events):

def _partition_by_strata(self, X, stop_times_events, weights):
for stratum, stratified_X in X.groupby(self.strata):
stratified_stop_times_events, stratified_W = (stop_times_events.loc[[stratum]], weights.loc[[stratum]])
stratified_stop_times_events, stratified_W = (stop_times_events.loc[stratum], weights.loc[stratum])
yield (stratified_X, stratified_stop_times_events, stratified_W), stratum

def _partition_by_strata_and_apply(self, X, stop_times_events, weights, function, *args):
@@ -385,8 +385,7 @@ def _get_gradients(self, df, stops_events, weights, beta): # pylint: disable=to
gradient: (1, d) numpy array
log_likelihood: float
"""
# import pdb
# pdb.set_trace()

_, d = df.shape
hessian = np.zeros((d, d))
gradient = np.zeros(d)
@@ -454,16 +453,17 @@ def _get_gradients(self, df, stops_events, weights, beta): # pylint: disable=to
a1 = risk_phi_x_x / denom

# Gradient
partial_gradient += weighted_average * numer / denom
partial_gradient += numer / denom
# In case numer and denom both are really small numbers,
# make sure to do division before multiplications
a2 = np.outer(numer / denom, numer / denom)
t = numer[:, None] / denom
a2 = t.dot(t.T)

hessian -= weighted_average * (a1 - a2)
log_lik -= weighted_average * np.log(denom)

# Values outside tie sum
gradient += x_death_sum - partial_gradient
gradient += x_death_sum - weighted_average * partial_gradient
log_lik += dot(x_death_sum, beta)[0]

return hessian, gradient.reshape(1, d), log_lik
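
Note on the gradient change above: `weighted_average` is constant across the inner tie loop, so its multiplication can be factored out of the accumulated sum,

    gradient_t = x_death_sum - weighted_average * sum_l(numer_l / denom_l)

which is algebraically identical to the previous form but saves one multiplication per pass through the tie loop, and matches how the Hessian and log-likelihood terms are already accumulated.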
113 changes: 110 additions & 3 deletions lifelines/fitters/coxph_fitter.py
@@ -107,6 +107,7 @@ def fit(
weights_col=None,
cluster_col=None,
robust=False,
batch_mode=None,
):
"""
Fit the Cox Proportional Hazard model to a dataset.
@@ -162,6 +163,9 @@
specifies what column has unique identifiers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
be used.
batch_mode: bool, optional
enabling batch_mode can be faster for datasets with a large number of ties. If left as None, lifelines will choose the best option.
Returns
-------
self: CoxPHFitter
@@ -214,6 +218,7 @@ def fit(
self.cluster_col = cluster_col
self.weights_col = weights_col
self._n_examples = df.shape[0]
self._batch_mode = batch_mode
self.strata = coalesce(strata, self.strata)

X, T, E, weights, original_index, self._clusters = self._preprocess_dataframe(df)
@@ -357,7 +362,7 @@ def _newton_rhaphson(
"""
self.path = []
assert precision <= 1.0, "precision must be less than or equal to 1."
_, d = X.shape
n, d = X.shape

# make sure betas are correct size.
if initial_beta is not None:
@@ -371,7 +376,12 @@

# Method of choice is just efron right now
if self.tie_method == "Efron":
get_gradients = self._get_efron_values
# https://github.com/CamDavidsonPilon/lifelines/issues/591
frac_dups = T.unique().shape[0] / n
if self._batch_mode or (0.4690 + 3.045e-05 * n + 2.374137 * frac_dups + 0.000711 * n * frac_dups < 1):
get_gradients = self._get_efron_values_batch
else:
get_gradients = self._get_efron_values_single
else:
raise NotImplementedError("Only Efron is available.")
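
As a rough standalone sketch of the runtime heuristic above (the coefficients are the ones hard-coded in the diff; the helper name is hypothetical and not part of the lifelines API):

    def prefer_batch(n_rows, n_unique_durations):
        # fraction of distinct durations, named frac_dups in the diff; smaller means more ties
        frac_dups = n_unique_durations / n_rows
        # predicted (batch runtime / single runtime); the batched routine wins when this is below 1
        predicted_ratio = 0.4690 + 3.045e-05 * n_rows + 2.374137 * frac_dups + 0.000711 * n_rows * frac_dups
        return predicted_ratio < 1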

@@ -477,7 +487,7 @@ def _newton_rhaphson(

return beta

def _get_efron_values(self, X, T, E, weights, beta):
def _get_efron_values_single(self, X, T, E, weights, beta):
"""
Calculates the first and second order vector differentials, with respect to beta.
Note that X, T, E are assumed to be sorted on T!
@@ -608,6 +618,103 @@ def _get_efron_values(self, X, T, E, weights, beta):
tie_phi_x_x = np.zeros((d, d))
return hessian, gradient, log_lik

def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=too-many-locals
"""
Calculates the first and second order vector differentials, with respect to beta.
Returns
-------
hessian: (d, d) numpy array,
gradient: (1, d) numpy array
log_likelihood: float
"""

_, d = X.shape
hessian = np.zeros((d, d))
gradient = np.zeros(d)
log_lik = 0

unique_death_times = np.unique(T[E])

for t in unique_death_times:

ix = T >= t # everyone in the risk set

X_at_t = X[ix]
weights_at_t = weights[ix]
events_at_t = E[ix]

phi_i = weights_at_t[:, None] * exp(dot(X_at_t, beta))
phi_x_i = phi_i * X_at_t
phi_x_x_i = dot(X_at_t.T, phi_x_i)

# Calculate sums of Risk set
risk_phi = phi_i.sum()
risk_phi_x = phi_x_i.sum(0)
risk_phi_x_x = phi_x_x_i

# Calculate the sums of Tie set
deaths = (T[ix] == t) & events_at_t

ties_counts = deaths.sum() # should always be 1 or more
assert ties_counts >= 1

xi_deaths = X_at_t[deaths]
weights_deaths = weights_at_t[deaths]

x_death_sum = (weights_deaths[:, None] * xi_deaths).sum(0)

if ties_counts > 1:
# it's faster if we can skip computing these when we don't need to.
tie_phi = phi_i[deaths].sum()
tie_phi_x = phi_x_i[deaths].sum(0)
tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths] * xi_deaths)

partial_gradient = np.zeros(d)
partial_ll = 0
partial_hessian = np.zeros((d, d))

weight_count = weights_deaths.sum()
weighted_average = weight_count / ties_counts

for l in range(ties_counts):

if ties_counts > 1:

# A good explanation of how Efron handles ties: consider three of five subjects who fail at the same time.
# Since it is not known a priori which of the three fails first, one-third of
# (φ1 + φ2 + φ3) is subtracted from sum_j^{5} φj after the first failure. Similarly, two-thirds
# of (φ1 + φ2 + φ3) is subtracted after the first two failures, etc.
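# Hypothetical numeric illustration (added here for clarity, not part of the committed source): with
# tied-death φ values (2, 3, 5), so φ1 + φ2 + φ3 = 10, and a full risk-set sum of 20, the
# denominators over the l = 0, 1, 2 passes are 20, 20 - (1/3)*10 ≈ 16.67, and 20 - (2/3)*10 ≈ 13.33.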

increasing_proportion = l / ties_counts
denom = risk_phi - increasing_proportion * tie_phi
numer = risk_phi_x - increasing_proportion * tie_phi_x
# Hessian
a1 = (risk_phi_x_x - increasing_proportion * tie_phi_x_x) / denom
else:
denom = risk_phi
numer = risk_phi_x
# Hessian
a1 = risk_phi_x_x / denom

# Gradient
partial_gradient += numer / denom
# In case numer and denom both are really small numbers,
# make sure to do division before multiplications
t = numer[:, None] / denom
# this is faster than an outer product
a2 = t.dot(t.T)

partial_hessian -= a1 - a2
partial_ll -= np.log(denom)

# Values outside tie sum
gradient += x_death_sum - weighted_average * partial_gradient
log_lik += dot(x_death_sum, beta)[0] + weighted_average * partial_ll
hessian += weighted_average * partial_hessian

return hessian, gradient.reshape(1, d), log_lik

def _partition_by_strata(self, X, T, E, weights, as_dataframes=False):
for stratum, stratified_X in X.groupby(self.strata):
stratified_E, stratified_T, stratified_W = (E.loc[[stratum]], T.loc[[stratum]], weights.loc[[stratum]])
2 changes: 1 addition & 1 deletion lifelines/version.py
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

__version__ = "0.16.1"
__version__ = "0.16.2"
59 changes: 59 additions & 0 deletions perf_tests/batch_vs_single.py
@@ -0,0 +1,59 @@
from time import time
import pandas as pd
import numpy as np
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter
import statsmodels.api as sm

# This compares the batch algorithm (in CTV) vs the single-iteration algorithm (original in CPH)
# over N and the fraction of unique durations (unique(T) / N); a lower fraction means more ties.


ROSSI_ROWS = 432
results = {}


for n_copies in [1, 2, 4, 6, 8, 10, 12, 14]:

# lower fractions mean more ties.
# the original rossi dataset has 0.113
for fraction in np.linspace(0.01, 0.30, 10):
print(n_copies, fraction)

df = pd.concat([load_rossi()] * n_copies)
n_unique_durations = int(df.shape[0] * fraction) + 1
unique_durations = np.round(np.random.exponential(10, size=n_unique_durations), 5)

df["week"] = np.tile(unique_durations, int(np.ceil(1 / fraction)))[: df.shape[0]]

cph_batch = CoxPHFitter()
start_time = time()
cph_batch.fit(df, "week", "arrest", batch_mode=True)
batch_time = time() - start_time

cph_single = CoxPHFitter()
start_time = time()
cph_single.fit(df, "week", "arrest", batch_mode=False)
single_time = time() - start_time

print({"batch": batch_time, "single": single_time})
results[(n_copies * ROSSI_ROWS, fraction)] = {"batch": batch_time, "single": single_time}

results = pd.DataFrame(results).T.sort_index()
results["ratio"] = results["batch"] / results["single"]
print(results)

results = results.reset_index()
results = results.rename(columns={"level_0": "N", "level_1": "frac"})


results["N * frac"] = results["N"] * results["frac"]

X = results[["N", "frac", "N * frac"]]
X = sm.add_constant(X)

Y = results["ratio"]


model = sm.OLS(Y, X).fit()
model.summary()
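
The coefficients reported by `model.summary()` are presumably the source of the constants hard-coded in `_newton_rhaphson` above, i.e. predicted ratio ≈ 0.4690 + 3.045e-05·N + 2.374137·frac + 0.000711·N·frac, with the batched routine chosen when the predicted batch/single runtime ratio is below 1.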
7 changes: 5 additions & 2 deletions perf_tests/cp_perf_test.py
@@ -5,13 +5,16 @@
if __name__ == "__main__":
import pandas as pd
import time
import numpy as np

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

df = load_rossi()
df = pd.concat([df] * 20)
df = pd.concat([df] * 40)
# df['week'] = np.random.exponential(1, size=df.shape[0])
cp = CoxPHFitter()
start_time = time.time()
cp.fit(df, duration_col="week", event_col="arrest")
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=False)
print("--- %s seconds ---" % (time.time() - start_time))
cp.print_summary()
11 changes: 7 additions & 4 deletions perf_tests/ctv_perf_test.py
@@ -3,13 +3,16 @@
import time
import pandas as pd
from lifelines import CoxTimeVaryingFitter
from lifelines.datasets import load_stanford_heart_transplants
from lifelines.datasets import load_rossi
from lifelines.utils import to_long_format

dfcv = load_stanford_heart_transplants()
dfcv = pd.concat([dfcv] * 50)
df = load_rossi()
df = pd.concat([df] * 20)
df = df.reset_index()
df = to_long_format(df, duration_col="week")
ctv = CoxTimeVaryingFitter()
start_time = time.time()
ctv.fit(dfcv, id_col="id", event_col="event", start_col="start", stop_col="stop")
ctv.fit(df, id_col="index", event_col="arrest", start_col="start", stop_col="stop", strata=["wexp", "prio"])
time_took = time.time() - start_time
print("--- %s seconds ---" % time_took)
ctv.print_summary()