Commit
Merge pull request #71 from CamDavidsonPilon/four-two
Four two
CamDavidsonPilon committed Jun 19, 2014
2 parents 4aac1ec + 1093f96 commit e05299a
Showing 10 changed files with 120 additions and 73 deletions.
12 changes: 10 additions & 2 deletions CHANGELOG.md
@@ -1,9 +1,17 @@
### Changelogs



#### 0.4.2

- Add Breslow's method to CoxPHFitter.
- Massive speed improvements to CoxPHFitter.
- Additional prediction method: `predict_percentile` is available on CoxPHFitter and AalenAdditiveFitter. Given a percentile, p, this function returns the value t such that *S(t | x) = p*. It is a generalization of `predict_median`; see the sketch after this list.
- Additional kwargs in `k_fold_cross_validation` that will accept different prediction methods (default is `predict_median`).
- Bug fix in CoxPHFitter's `predict_expectation` function.
- Correct spelling mistake in the Newton-Raphson algorithm.
- `datasets` now contains functions for generating the respective datasets, e.g. `generate_waltons_dataset`.
- Bumping up the number of samples in statistical tests to prevent them from failing so often (this is a stop-gap).
- PEP8 everything.
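
A minimal sketch of how the new percentile API might be called (illustrative only, not part of the committed files; dropping the duration and event columns to form the covariate matrix is an assumption about what the predict methods expect):

.. code:: python

    from lifelines import CoxPHFitter
    from lifelines.datasets import generate_regression_dataset

    regression_dataset = generate_regression_dataset()

    cf = CoxPHFitter()
    cf.fit(regression_dataset, duration_col='T', event_col='E')

    # (n, d) covariate matrix: everything except the duration and event columns
    X = regression_dataset.drop(['T', 'E'], axis=1)

    # time t at which S(t | x) reaches the 75th percentile, for each individual;
    # predict_median is the special case p=0.5
    t_75 = cf.predict_percentile(X, p=0.75)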

#### 0.4.1.1

3 changes: 2 additions & 1 deletion docs/Intro to lifelines.rst
@@ -587,7 +587,8 @@ instruments could only detect the measurement was *less* than some upperbound.

.. code:: python
from lifelines.datasets import lcd_dataset
from lifelines.datasets import generate_lcd_dataset
lcd_dataset = generate_lcd_dataset()
T = lcd_dataset['alluvial_fan']['T']
C = lcd_dataset['alluvial_fan']['C'] #boolean array, True if observed.
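A sketch of how these arrays might then be handed to an estimator (illustrative; the ``left_censorship`` flag is an assumption about the fitter's API and is not shown in this diff):

.. code:: python

    from lifelines import KaplanMeierFitter

    kmf = KaplanMeierFitter()
    # T holds detection-limited measurements; C is True where the value was actually observed
    kmf.fit(T, C, left_censorship=True)  # left_censorship keyword assumed, see note above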
10 changes: 6 additions & 4 deletions docs/Quickstart.rst
@@ -23,10 +23,11 @@ Let's start by importing some data. We need the durations that individuals are o

.. code:: python
from lifelines.datasets import waltons_dataset
from lifelines.datasets import generate_waltons_dataset
data = generate_waltons_dataset()
T = waltons_dataset['T']
E = waltons_dataset['E']
T = data['T']
E = data['E']
``T`` is an array of durations, ``E`` is either a boolean or binary array representing whether the "death" was observed (alternatively, an individual can be censored).
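
With ``T`` and ``E`` in hand, a basic fit might look like the sketch below (illustrative; the event array is passed positionally because the keyword name is not shown in this diff, and plotting assumes matplotlib is installed):

.. code:: python

    from lifelines import KaplanMeierFitter

    kmf = KaplanMeierFitter()
    kmf.fit(T, E)  # durations and the event-observed indicator

    kmf.survival_function_.plot()  # estimated survival curve S(t)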

@@ -85,7 +86,8 @@ While the above ``KaplanMeierFitter`` and ``NelsonAalenFitter`` are useful, they

.. code:: python
from lifelines.datasets import regression_dataset
from lifelines.datasets import generate_regression_dataset
regression_dataset = generate_regression_dataset()
regression_dataset.head()
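
A sketch of fitting a regression model to this frame (illustrative; the column names 'T' and 'E' are taken from the k_fold_cross_validation example in docs/survival_regression.rst below):

.. code:: python

    from lifelines import CoxPHFitter

    cf = CoxPHFitter()
    cf.fit(regression_dataset, duration_col='T', event_col='E')

    print(cf.hazards_)  # fitted regression coefficients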
6 changes: 4 additions & 2 deletions docs/survival_regression.rst
@@ -395,9 +395,10 @@ This example data is from the paper `here <http://cran.r-project.org/doc/contrib

.. code:: python
from lifelines.datasets import rossi_dataset
from lifelines.datasets import generate_rossi_dataset
from lifelines import CoxPHFitter
rossi_dataset = generate_rossi_dataset()
cf = CoxPHFitter(alpha=0.95, tie_method='Efron')
cf.fit(rossi_dataset, duration_col='week', event_col='arrest')
@@ -439,9 +440,10 @@ into a training set and a testing set, fits itself on the training set, and eval
.. code:: python
from lifelines import CoxPHFitter
from lifelines.datasets import regression_dataset
from lifelines.datasets import generate_regression_dataset
from lifelines.utils import k_fold_cross_validation
regression_dataset = generate_regression_dataset()
cf = CoxPHFitter()
scores = k_fold_cross_validation(cf, regression_dataset, duration_col='T', event_col='E', k=3)
print scores
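
The changelog notes that `k_fold_cross_validation` now accepts kwargs for choosing the prediction method; a hedged sketch of that usage follows (the keyword name ``predictor`` and its value are assumptions, since this diff does not show the new signature):

.. code:: python

    # 'predictor' keyword assumed -- check the k_fold_cross_validation signature
    scores = k_fold_cross_validation(cf, regression_dataset,
                                     duration_col='T', event_col='E', k=3,
                                     predictor='predict_percentile')
    print scores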
18 changes: 7 additions & 11 deletions lifelines/datasets.py
@@ -3,11 +3,14 @@
import numpy as np
from io import StringIO

__all__ = ['waltons_dataset', 'regression_dataset',
'lcd_dataset', 'dd_dataset', 'rossi_dataset']
__all__ = ['generate_waltons_dataset',
'generate_regression_dataset',
'generate_dd_dataset',
'generate_lcd_dataset',
'generate_rossi_dataset']


def generate_left_censored_data():
def generate_lcd_dataset():
return {
'alluvial_fan': {
'T': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
@@ -27,7 +30,7 @@ def generate_left_censored_data():
}


def generate_waltons_data():
def generate_waltons_dataset():
waltonG = np.array(['miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137', 'miR-137',
@@ -2545,10 +2548,3 @@ def generate_regression_dataset():
0.031865 1.753759 0.252040 8.519852 1
1.631269 1.588621 3.709899 4.480448 1
"""), header=0, delim_whitespace=True)


waltons_dataset = generate_waltons_data()
regression_dataset = generate_regression_dataset()
dd_dataset = generate_dd_dataset()
lcd_dataset = generate_left_censored_data()
rossi_dataset = generate_rossi_dataset()
42 changes: 30 additions & 12 deletions lifelines/estimation.py
@@ -139,7 +139,7 @@ def smoothed_hazard_(self, bandwidth):
hazard_name = "smoothed-" + cumulative_hazard_name
hazard_ = self.cumulative_hazard_.diff().fillna(self.cumulative_hazard_.iloc[0])
C = (hazard_[cumulative_hazard_name] != 0.0).values
return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth), hazard_.values[C,:]),
return pd.DataFrame( 1./(2*bandwidth)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth), hazard_.values[C,:]),
columns=[hazard_name], index=timeline)

def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
@@ -156,7 +156,7 @@ def smoothed_hazard_confidence_intervals_(self, bandwidth, hazard_=None):
self._cumulative_sq.iloc[0] = 0
var_hazard_ = self._cumulative_sq.diff().fillna(self._cumulative_sq.iloc[0])
C = (var_hazard_.values != 0.0) # only consider the points with jumps
std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None, :], bandwidth)**2, var_hazard_.values[C]))
std_hazard_ = np.sqrt(1./(2*bandwidth**2)*np.dot(epanechnikov_kernel(timeline[:, None], timeline[C][None,:], bandwidth)**2, var_hazard_.values[C]))
values = {
self.ci_labels[0]: hazard_ * np.exp(alpha2 * std_hazard_ / hazard_),
self.ci_labels[1]: hazard_ * np.exp(-alpha2 * std_hazard_ / hazard_)
@@ -493,12 +493,13 @@ def _fit_static(self, dataframe, duration_col="T", event_col="E",
C = pd.Series(df[event_col].values, dtype=bool, index=ids)
T = pd.Series(df[duration_col].values, index=ids)

df = df.set_index([duration_col, id_col])
df = df.set_index(id_col)

ix = T.argsort()
T, C = T.iloc[ix], C.iloc[ix]

del df[event_col]
del df[duration_col]
n, d = df.shape
columns = df.columns

@@ -722,12 +723,20 @@ def predict_survival_function(self, X):
"""
return np.exp(-self.predict_cumulative_hazard(X))

def predict_percentile(self, X, p=0.5):
"""
X: a (n,d) covariate matrix
Returns the p-th percentile lifetime for each individual, i.e. the time t at which S(t | x) crosses p (the median when p=0.5).
http://stats.stackexchange.com/questions/102986/percentile-loss-functions
"""
return qth_survival_times(p, self.predict_survival_function(X))

def predict_median(self, X):
"""
X: a (n,d) covariate matrix
Returns the median lifetimes for the individuals
"""
return median_survival_times(self.predict_survival_function(X))
return self.predict_percentile(X, 0.5)

def predict_expectation(self, X):
"""
@@ -738,6 +747,7 @@ def predict_expectation(self, X):


class CoxPHFitter(BaseFitter):

"""
This class implements fitting Cox's proportional hazard model:
@@ -793,7 +803,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
# Iterate backwards to utilize recursive relationship
for i, (ti, ei) in reversed(list(enumerate(zip(T, E)))):
# Doing it like this to preserve shape
xi = X[i:i+1]
xi = X[i:i + 1]
# Calculate phi values
phi_i = exp(dot(xi, beta))
phi_x_i = dot(phi_i, xi)
@@ -814,7 +824,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
# Keep track of count
tie_count += 1

if i > 0 and T[i-1] == ti:
if i > 0 and T[i - 1] == ti:
# There are more ties/members of the risk set
continue
elif tie_count == 0:
@@ -990,7 +1000,7 @@ def _compute_confidence_intervals(self):

def _compute_standard_errors(self):
se = np.sqrt(inv(-self._hessian_).diagonal())
return pd.DataFrame(se[None, :],
return pd.DataFrame(se[None,:],
index=['se'], columns=self.hazards_.columns)

def _compute_z_values(self):
@@ -1031,19 +1041,27 @@ def predict_survival_function(self, X):
"""
return exp(-self.predict_hazard(X).cumsum(0))

def predict_percentile(self, X, p=0.5):
"""
X: a (n,d) covariate matrix
Returns the p-th percentile lifetime for each individual, i.e. the time t at which S(t | x) crosses p (the median when p=0.5).
http://stats.stackexchange.com/questions/102986/percentile-loss-functions
"""
return qth_survival_times(p, self.predict_survival_function(X))

def predict_median(self, X):
"""
X: a (n,d) covariate matrix
Returns the median lifetimes for the individuals
"""
return median_survival_times(self.predict_survival_function(X))
return self.predict_percentile(X, 0.5)

def predict_expectation(self, X):
"""
Compute the expected lifetime, E[T], using covariates X.
"""
t = self.cumulative_hazards_.index
return trapz(self.predict_survival_function(X).values.T, t)
v = self.predict_survival_function(X)
return trapz(v.values.T, v.index)
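# Editorial note: for a non-negative lifetime T, E[T] equals the integral of S(t) over t >= 0,
# so applying the trapezoidal rule to the predicted survival curve over its own time index
# approximates each individual's expected lifetime.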

def _compute_baseline_hazard(self):
# http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf
@@ -1190,10 +1208,10 @@ def qth_survival_times(q, survival_functions):
sv_b = (1.0 * (survival_functions < q)).cumsum() > 0
try:
v = sv_b.idxmax(0)
v[sv_b.iloc[-1, :] == 0] = np.inf
v[sv_b.iloc[-1,:] == 0] = np.inf
except:
v = sv_b.argmax(0)
v[sv_b[-1, :] == 0] = np.inf
v[sv_b[-1,:] == 0] = np.inf
return v
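# Editorial note: for each survival curve this returns the first time point at which the
# curve drops below q, and np.inf when it never does; the try/except above falls back to
# argmax for inputs that do not support idxmax.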


12 changes: 6 additions & 6 deletions lifelines/statistics.py
@@ -53,8 +53,8 @@ def concordance_index(event_times, predicted_event_times, event_observed=None):
event_observed)


def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
"""
Measures and reports on whether two intensity processes are different. That is, given two
event series, determines whether the data generating processes are statistically different.
@@ -91,8 +91,8 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse
event_times = np.r_[event_times_A, event_times_B]
groups = np.r_[np.zeros(event_times_A.shape[0]), np.ones(event_times_B.shape[0])]
event_observed = np.r_[event_observed_A, event_observed_B]
return multivariate_logrank_test(event_times, groups, event_observed,
alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs)
return multivariate_logrank_test(event_times, groups, event_observed,
alpha=alpha, t_0=t_0, suppress_print=suppress_print, **kwargs)
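# Illustrative usage (editor's sketch, not part of this diff); the tuple ordering follows
# the `return summary, p_value, test_result` at the end of multivariate_logrank_test, and
# T_a/T_b/E_a/E_b stand in for the caller's duration and event arrays:
#
#   summary, p_value, test_result = logrank_test(T_a, T_b,
#                                                 event_observed_A=E_a,
#                                                 event_observed_B=E_b,
#                                                 alpha=0.95)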


def pairwise_logrank_test(event_durations, groups, event_observed=None,
@@ -175,7 +175,7 @@ def pairwise_logrank_test(event_durations, groups, event_observed=None,
return [pd.DataFrame(x, columns=unique_groups, index=unique_groups) for x in [S, P, T]]


def multivariate_logrank_test(event_durations, groups, event_observed=None,
def multivariate_logrank_test(event_durations, groups, event_observed=None,
alpha=0.95, t_0=-1, suppress_print=False, **kwargs):
"""
This test is a generalization of the logrank_test: it can deal with n>2 populations (and should
@@ -238,7 +238,7 @@ def multivariate_logrank_test(event_durations, groups, event_observed=None,
df=n_groups - 1, **kwargs)

if not suppress_print:
print(summary)
print(summary)
return summary, p_value, test_result


