Skip to content

Commit

Permalink
Merge pull request #461 from CamDavidsonPilon/some-questionable-perf-…
Browse files Browse the repository at this point in the history
…improvements

v0.14.2
  • Loading branch information
CamDavidsonPilon authored May 19, 2018
2 parents e4a3155 + 53d616e commit cd7bb5c
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 22 deletions.
112 changes: 93 additions & 19 deletions lifelines/fitters/cox_time_varying_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from __future__ import print_function
from __future__ import division
import warnings
import time

import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -34,6 +36,29 @@ def __init__(self, alpha=0.95, penalizer=0.0):
self.penalizer = penalizer

def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
"""
Fit the Cox Propertional Hazard model to a time varying dataset. Tied survival times
are handled using Efron's tie-method.
Parameters:
df: a Pandas dataframe with necessary columns `duration_col` and
`event_col`, plus other covariates. `duration_col` refers to
the lifetimes of the subjects. `event_col` refers to whether
the 'death' events was observed: 1 if observed, 0 else (censored).
id_col: A subject could have multiple rows in the dataframe. This column contains
the unique identifer per subject.
event_col: the column in dataframe that contains the subjects' death
observation. If left as None, assume all individuals are non-censored.
start_col: the column that contains the start of a subject's time period.
end_col: the column that contains the end of a subject's time period.
show_progress: since the fitter is iterative, show convergence
diagnostics.
step_size: set an initial step size for the fitting algorithm.
Returns:
self, with additional properties: hazards_
"""

df = df.copy()
if not (id_col in df and event_col in df and start_col in df and stop_col in df):
Expand All @@ -48,6 +73,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr

stop_times_events = df[["event", "stop", "start"]]
df = df.drop(["event", "stop", "start"], axis=1)
df = df.astype(float)

self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
Expand All @@ -58,6 +84,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
self.confidence_intervals_ = self._compute_confidence_intervals()
self._n_examples = df.shape[0]
self._n_unique = df.index.unique().shape[0]

return self

Expand Down Expand Up @@ -137,6 +164,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
i = 0
converging = True
ll, previous_ll = 0, 0
start = time.time()

step_sizer = StepSizer(step_size)
step_size = step_sizer.next()
Expand All @@ -158,7 +186,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
hessian, gradient = h, g

if show_progress:
print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f" % (i, norm(delta), step_size, ll))
print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))

# convergence criteria
if norm(delta) < precision:
Expand Down Expand Up @@ -204,16 +232,12 @@ def _get_gradients(self, df, stops_events, beta):

_, d = df.shape
hessian = np.zeros((d, d))
gradient = np.zeros((1, d))
gradient = np.zeros(d)
log_lik = 0

unique_death_times = np.unique(stops_events['stop'].loc[stops_events['event']])

for t in unique_death_times:
x_death_sum = np.zeros((1, d))
tie_phi, risk_phi = 0, 0
risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
df_at_t = df.loc[ix]
Expand All @@ -224,25 +248,25 @@ def _get_gradients(self, df, stops_events, beta):
phi_x_x_i = dot(df_at_t.T, phi_x_i) # dot(df_at_t.T, phi_i * df_at_t)

# Calculate sums of Risk set
risk_phi += phi_i.sum()
risk_phi_x += phi_x_i.sum(0).values.reshape((1, d))
risk_phi_x_x += phi_x_x_i
risk_phi = phi_i.sum()
risk_phi_x = phi_x_i.sum(0).values
risk_phi_x_x = phi_x_x_i

# Calculate the sums of Tie set
deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t)
death_counts = deaths.sum() # should always be atleast 1.

xi_deaths = df_at_t.loc[deaths]

x_death_sum += xi_deaths.sum(0).values.reshape((1, d))
x_death_sum = xi_deaths.sum(0).values

if death_counts > 1:
# it's faster if we can skip computing these when we don't need to.
tie_phi += phi_i[deaths.values].sum()
tie_phi_x += phi_x_i.loc[deaths].sum(0).values.reshape((1, d))
tie_phi_x_x += dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)
tie_phi = phi_i[deaths.values].sum()
tie_phi_x = phi_x_i.loc[deaths].sum(0).values
tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)

partial_gradient = np.zeros((1, d))
partial_gradient = np.zeros(d)

for l in range(death_counts):

Expand All @@ -262,16 +286,57 @@ def _get_gradients(self, df, stops_events, beta):
partial_gradient += z / denom
# In case z and denom both are really small numbers,
# make sure to do division before multiplications
a2 = dot(z.T / denom, z / denom)
a2 = np.outer(z / denom, z / denom)

hessian -= (a1 - a2)

log_lik -= np.log(denom)

# Values outside tie sum
gradient += x_death_sum - partial_gradient
log_lik += dot(x_death_sum, beta)[0][0]
return hessian, gradient, log_lik
log_lik += dot(x_death_sum, beta)[0]

return hessian, gradient.reshape(1, d), log_lik

def predict_log_partial_hazard(self, X):
"""
X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
can be in any order. If a numpy array, columns must be in the
same order as the training data.
This is equivalent to R's linear.predictors.
Returns the log of the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \beta (X - \bar{X})
If X is a dataframe, the order of the columns do not matter. But
if X is an array, then the column ordering is assumed to be the
same as the training dataset.
"""
if isinstance(X, pd.DataFrame):
order = self.hazards_.columns
X = X[order]

index = _get_index(X)
X = normalize(X, self._norm_mean.values, 1)
return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)


def predict_partial_hazard(self, X):
"""
X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
can be in any order. If a numpy array, columns must be in the
same order as the training data.
If X is a dataframe, the order of the columns do not matter. But
if X is an array, then the column ordering is assumed to be the
same as the training dataset.
Returns the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \exp{\beta X}
"""
return exp(self.predict_log_partial_hazard(X))


def print_summary(self):
"""
Expand All @@ -283,8 +348,8 @@ def print_summary(self):
df[''] = [significance_code(p) for p in df['p']]

# Print information about data first
print('n={}, number of events={}'.format(self._n_examples,
np.where(self.event_observed)[0].shape[0]),
print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
self.event_observed.sum()),
end='\n\n')
print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
# Significance code explanation
Expand Down Expand Up @@ -324,3 +389,12 @@ def plot(self, standardized=False, **kwargs):
plt.yticks(yaxis_locations, tick_labels)
plt.xlabel("standardized coef" if standardized else "coef")
return ax

def __repr__(self):
classname = self.__class__.__name__
try:
s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
classname, self._n_examples, self._n_unique, self.event_observed.sum())
except AttributeError:
s = """<lifelines.%s>""" % classname
return s
3 changes: 2 additions & 1 deletion lifelines/fitters/coxph_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def fit(self, df, duration_col, event_col=None,
catagorical covariate does not obey the proportional hazard assumption. This
is used similar to the `strata` expression in R.
See http://courses.washington.edu/b515/l17.pdf.
step_size: set an initial step size for the fitting algorithm.
Returns:
self, with additional properties: hazards_
Expand Down Expand Up @@ -433,7 +434,7 @@ def print_summary(self):

# Print information about data first
print('n={}, number of events={}'.format(self._n_examples,
np.where(self.event_observed)[0].shape[0]),
self.event_observed.sum()),
end='\n\n')
print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
# Significance code explanation
Expand Down
2 changes: 1 addition & 1 deletion lifelines/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ def concordance_value(time_a, time_b, pred_a, pred_b):
def pass_for_numeric_dtypes_or_raise(df):
nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
if len(nonnumeric_cols) > 0:
raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)


def check_for_overlapping_intervals(df):
Expand Down
2 changes: 1 addition & 1 deletion lifelines/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from __future__ import unicode_literals

__version__ = '0.14.1'
__version__ = '0.14.2'
10 changes: 10 additions & 0 deletions tests/test_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1482,6 +1482,16 @@ def test_fitter_will_raise_an_error_if_overlapping_intervals(self, ctv):
with pytest.raises(ValueError):
ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")

def test_fitter_accept_boolean_columns(self, ctv):
df = pd.DataFrame.from_records([
{'id': 1, 'start': 0, 'stop': 5, 'var': -1.2, 'bool': True, 'event': 1},
{'id': 2, 'start': 0, 'stop': 5, 'var': 1.3, 'bool': False, 'event': 1},
{'id': 3, 'start': 0, 'stop': 5, 'var': -1.3, 'bool': False, 'event': 1},
])

ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")


def test_warning_is_raised_if_df_has_a_near_constant_column(self, ctv, dfcv):
dfcv['constant'] = 1.0

Expand Down

0 comments on commit cd7bb5c

Please sign in to comment.