Commit
Merge pull request #71 from CamDavidsonPilon/four-two
Four two
CamDavidsonPilon committed Jun 19, 2014
2 parents 4aac1ec + 1093f96 commit e05299a
Showing 10 changed files with 120 additions and 73 deletions.
12 changes: 10 additions & 2 deletions CHANGELOG.md
@@ -1,9 +1,17 @@
### Changelogs



#### 0.4.2

- Add Breslow's method to CoxPHFitter.
- Massive speed improvements to CoxPHFitter.
- Additional prediction method: `predict_percentile` is available on CoxPHFitter and AalenAdditiveFitter. Given a percentile, p, this function returns the value t such that *S(t | x) = p*. It is a generalization of `predict_median`; see the sketch after this list.
- Additional kwargs in `k_fold_cross_validation` that will accept different prediction methods (default is `predict_median`).
- Bug fix in CoxPHFitter's `predict_expectation` function.
- Correct spelling mistake in the Newton-Raphson algorithm.
- `datasets` now contains functions for generating the respective datasets, e.g. `generate_waltons_dataset`.
- Bumping up the number of samples in statistical tests to prevent them from failing so often (this is a stop-gap).
- PEP8 everything.
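
A minimal sketch of how the new percentile API might be called (illustrative only, not part of the committed files; dropping the duration and event columns to form the covariate matrix is an assumption about what the predict methods expect):

.. code:: python

    from lifelines import CoxPHFitter
    from lifelines.datasets import generate_regression_dataset

    regression_dataset = generate_regression_dataset()

    cf = CoxPHFitter()
    cf.fit(regression_dataset, duration_col='T', event_col='E')

    # (n, d) covariate matrix: everything except the duration and event columns
    X = regression_dataset.drop(['T', 'E'], axis=1)

    # time t at which S(t | x) reaches the 75th percentile, for each individual;
    # predict_median is the special case p=0.5
    t_75 = cf.predict_percentile(X, p=0.75)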

#### 0.4.1.1

3 changes: 2 additions & 1 deletion docs/Intro to lifelines.rst
@@ -587,7 +587,8 @@ instruments could only detect the measurement was *less* than some upperbound.

.. code:: python
from lifelines.datasets import lcd_dataset
from lifelines.datasets import generate_lcd_dataset
lcd_dataset = generate_lcd_dataset()
T = lcd_dataset['alluvial_fan']['T']
C = lcd_dataset['alluvial_fan']['C'] #boolean array, True if observed.
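A sketch of how these arrays might then be handed to an estimator (illustrative; the ``left_censorship`` flag is an assumption about the fitter's API and is not shown in this diff):

.. code:: python

    from lifelines import KaplanMeierFitter

    kmf = KaplanMeierFitter()
    # T holds detection-limited measurements; C is True where the value was actually observed
    kmf.fit(T, C, left_censorship=True)  # left_censorship keyword assumed, see note above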
10 changes: 6 additions & 4 deletions docs/Quickstart.rst
@@ -23,10 +23,11 @@ Let's start by importing some data. We need the durations that individuals are o

.. code:: python
from lifelines.datasets import waltons_dataset
from lifelines.datasets import generate_waltons_dataset
data = generate_waltons_dataset()
T = waltons_dataset['T']
E = waltons_dataset['E']
T = data['T']
E = data['E']
``T`` is an array of durations, ``E`` is either a boolean or binary array representing whether the "death" was observed (alternatively, an individual can be censored).
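
With ``T`` and ``E`` in hand, a basic fit might look like the sketch below (illustrative; the event array is passed positionally because the keyword name is not shown in this diff, and plotting assumes matplotlib is installed):

.. code:: python

    from lifelines import KaplanMeierFitter

    kmf = KaplanMeierFitter()
    kmf.fit(T, E)  # durations and the event-observed indicator

    kmf.survival_function_.plot()  # estimated survival curve S(t)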

@@ -85,7 +86,8 @@ While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they

.. code:: python
from lifelines.datasets import regression_dataset
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()
regression_dataset.head()
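
A sketch of fitting a regression model to this frame (illustrative; the column names 'T' and 'E' are taken from the k_fold_cross_validation example in docs/survival_regression.rst below):

.. code:: python

    from lifelines import CoxPHFitter

    cf = CoxPHFitter()
    cf.fit(regression_dataset, duration_col='T', event_col='E')

    print(cf.hazards_)  # fitted regression coefficients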
6 changes: 4 additions & 2 deletions docs/survival_regression.rst
@@ -395,9 +395,10 @@ This example data is from the paper `here <http://cran.r-project.org/doc/contrib

.. code:: python
from lifelines.datasets import rossi_dataset
from lifelines.datasets import generate_rossi_dataset
from lifelines import CoxPHFitter
rossi_dataset = generate_rossi_dataset()
cf = CoxPHFitter(alpha=0.95, tie_method='Efron')
cf.fit(rossi_dataset, duration_col='week', event_col='arrest')
@@ -439,9 +440,10 @@ into a training set and a testing set, fits itself on the training set, and eval
.. code:: python
from lifelines import CoxPHFitter
from lifelines.datasets import regression_dataset
from lifelines.datasets import generate_regression_dataset
from lifelines.utils import k_fold_cross_validation
regression_dataset = generate_regression_dataset()
cf = CoxPHFitter()
scores = k_fold_cross_validation(cf, regression_dataset, duration_col='T', event_col='E', k=3)
print scores
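
The changelog notes that `k_fold_cross_validation` now accepts kwargs for choosing the prediction method; a hedged sketch of that usage follows (the keyword name ``predictor`` and its value are assumptions, since this diff does not show the new signature):

.. code:: python

    # 'predictor' keyword assumed -- check the k_fold_cross_validation signature
    scores = k_fold_cross_validation(cf, regression_dataset,
                                     duration_col='T', event_col='E', k=3,
                                     predictor='predict_percentile')
    print scores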
18 changes: 7 additions & 11 deletions lifelines/datasets.py
@@ -3,11 +3,14 @@
import numpy as np
from io import StringIO

__all__ = ['waltons_dataset', 'regression_dataset',
'lcd_dataset', 'dd_dataset', 'rossi_dataset']
__all__ = ['generate_waltons_dataset',
'generate_regression_dataset',
'generate_dd_dataset',
'generate_lcd_dataset',
'generate_rossi_dataset']


def generate_left_censored_data():
def generate_lcd_dataset():
return {
'alluvial_fan': {
'T': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
@@ -27,7 +30,7 @@ def generate_left_censored_data():
}


def generate_waltons_data():
def generate_waltons_dataset():
waltonG = np.array(['miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
@@ -2545,10 +2548,3 @@ def generate_regression_dataset():
0.031865 1.753759 0.252040 8.519852 1
1.631269 1.588621 3.709899 4.480448 1
"""), header=0, delim_whitespace=True)


waltons_dataset = generate_waltons_data()
regression_dataset = generate_regression_dataset()
dd_dataset = generate_dd_dataset()
lcd_dataset = generate_left_censored_data()
rossi_dataset = generate_rossi_dataset()
42 changes: 30 additions & 12 deletions lifelines/estimation.py
@@ -139,7 +139,7 @@ def smoothed_hazard_(self, bandwidth):
hazard_name = "smoothed-" + cumulative_hazard_name
hazard_ = self.cumulative_hazard_.diff().fillna(self.cumulative_hazard_.iloc[0])
C = (hazard_[cumulative_hazard_name] != 0.0).values
return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth), hazard_.values[C,:]),
return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth), hazard_.values[C,:]),
columns=[hazard_name], index=timeline)

def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
@@ -156,7 +156,7 @@ def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
self._cumulative_sq.iloc[0] = 0
var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
C = (var_hazard_.values != 0.0) # only consider the points with jumps
std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth)**2, var_hazard_.values[C]))
std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth)**2, var_hazard_.values[C]))
values = {
self.ci_labels[0]: hazard_ * np.exp(alpha2 * std_hazard_ / hazard_),
self.ci_labels[1]: hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_)
@@ -493,12 +493,13 @@ def _fit_static(self, dataframe, duration_col="T", event_col="E",
C = pd.Series(df[event_col].values, dtype=bool, index=ids)
T = pd.Series(df[duration_col].values, index=ids)

df = df.set_index([duration_col, id_col])
df = df.set_index(id_col)

ix = T.argsort()
T, C = T.iloc[ix], C.iloc[ix]

del df[event_col]
del df[duration_col]
n, d = df.shape
columns = df.columns

@@ -722,12 +723,20 @@ def predict_survival_function(self, X):
"""
return np.exp(-self.predict_cumulative_hazard(X))

def predict_percentile(self, X, p=0.5):
"""
X: a (n,d) covariate matrix
Returns the p-th percentile lifetime for each individual, i.e. the time t at which S(t | x) crosses p (the median when p=0.5).
http://stats.stackexchange.com/questions/102986/percentile-loss-functions
"""
return qth_survival_times(p, self.predict_survival_function(X))

def predict_median(self, X):
"""
X: a (n,d) covariate matrix
Returns the median lifetimes for the individuals
"""
return median_survival_times(self.predict_survival_function(X))
return self.predict_percentile(X, 0.5)

def predict_expectation(self, X):
"""
@@ -738,6 +747,7 @@ def predict_expectation(self, X):


class CoxPHFitter(BaseFitter):

"""
This class implements fitting Cox's proportional hazard model:
@@ -793,7 +803,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
# Iterate backwards to utilize recursive relationship
for i, (ti, ei) in reversed(list(enumerate(zip(T, E)))):
# Doing it like this to preserve shape
xi = X[i:i+1]
xi = X[i:i + 1]
# Calculate phi values
phi_i = exp(dot(xi, beta))
phi_x_i = dot(phi_i, xi)
@@ -814,7 +824,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
# Keep track of count
tie_count += 1

if i > 0 and T[i-1] == ti:
if i > 0 and T[i - 1] == ti:
# There are more ties/members of the risk set
continue
elif tie_count == 0:
@@ -990,7 +1000,7 @@ def _compute_confidence_intervals(self):

def _compute_standard_errors(self):
se = np.sqrt(inv(-self._hessian_).diagonal())
return pd.DataFrame(se[None, :],
return pd.DataFrame(se[None,:],
index=['se'], columns=self.hazards_.columns)

def _compute_z_values(self):
@@ -1031,19 +1041,27 @@ def predict_survival_function(self, X):
"""
return exp(-self.predict_hazard(X).cumsum(0))

def predict_percentile(self, X, p=0.5):
"""
X: a (n,d) covariate matrix
Returns the p-th percentile lifetime for each individual, i.e. the time t at which S(t | x) crosses p (the median when p=0.5).
http://stats.stackexchange.com/questions/102986/percentile-loss-functions
"""
return qth_survival_times(p, self.predict_survival_function(X))

def predict_median(self, X):
"""
X: a (n,d) covariate matrix
Returns the median lifetimes for the individuals
"""
return median_survival_times(self.predict_survival_function(X))
return self.predict_percentile(X, 0.5)

def predict_expectation(self, X):
"""
Compute the expected lifetime, E[T], using covariates X.
"""
t = self.cumulative_hazards_.index
return trapz(self.predict_survival_function(X).values.T, t)
v = self.predict_survival_function(X)
return trapz(v.values.T, v.index)
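# Editorial note: for a non-negative lifetime T, E[T] equals the integral of S(t) over t >= 0,
# so applying the trapezoidal rule to the predicted survival curve over its own time index
# approximates each individual's expected lifetime.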

def _compute_baseline_hazard(self):
# http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf
@@ -1190,10 +1208,10 @@ def qth_survival_times(q, survival_functions):
sv_b = (1.0 * (survival_functions < q)).cumsum() > 0
try:
v = sv_b.idxmax(0)
v[sv_b.iloc[-1, :] == 0] = np.inf
v[sv_b.iloc[-1,:] == 0] = np.inf
except:
v = sv_b.argmax(0)
v[sv_b[-1, :] == 0] = np.inf
v[sv_b[-1,:] == 0] = np.inf
return v
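# Editorial note: for each survival curve this returns the first time point at which the
# curve drops below q, and np.inf when it never does; the try/except above falls back to
# argmax for inputs that do not support idxmax.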


12 changes: 6 additions & 6 deletions lifelines/statistics.py
@@ -53,8 +53,8 @@ def concordance_index(event_times, predicted_event_times, event_observed=None):
event_observed)


def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
"""
Measures and reports on whether two intensity processes are different. That is, given two
event series, determines whether the data generating processes are statistically different.
@@ -91,8 +91,8 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse
event_times = np.r_[event_times_A, event_times_B]
groups = np.r_[np.zeros(event_times_A.shape[0]), np.ones(event_times_B.shape[0])]
event_observed = np.r_[event_observed_A, event_observed_B]
return multivariate_logrank_test(event_times, groups, event_observed,
alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs)
return multivariate_logrank_test(event_times, groups, event_observed,
alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs)
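# Illustrative usage (editor's sketch, not part of this diff); the tuple ordering follows
# the `return summary, p_value, test_result` at the end of multivariate_logrank_test, and
# T_a/T_b/E_a/E_b stand in for the caller's duration and event arrays:
#
#   summary, p_value, test_result = logrank_test(T_a, T_b,
#                                                 event_observed_A=E_a,
#                                                 event_observed_B=E_b,
#                                                 alpha=0.95)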


def pairwise_logrank_test(event_durations, groups, event_observed=None,
@@ -175,7 +175,7 @@ def pairwise_logrank_test(event_durations, groups, event_observed=None,
return [pd.DataFrame(x, columns=unique_groups, index=unique_groups) for x in [S, P, T]]


def multivariate_logrank_test(event_durations, groups, event_observed=None,
def multivariate_logrank_test(event_durations, groups, event_observed=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
"""
This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
@@ -238,7 +238,7 @@ def multivariate_logrank_test(event_durations, groups, event_observed=None,
df=n_groups - 1, **kwargs)

if not suppress_print:
print(summary)
print(summary)
return summary, p_value, test_result


