diff --git a/CHANGELOG.md b/CHANGELOG.md index 599a70999..e9a399050 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ ## Changelog +#### 0.24.10 + +##### New features + - New improvements when using splines model in CoxPHFitter - it should offer much better prediction and baseline-hazard estimation, including extrapolation and interpolation. + +##### API Changes + - Related to above: the fitted spline parameters are now available in the `.summary` and `.print_summary` methods. + +##### Bug fixes +- fixed a bug in initialization of some interval-censoring models -> better convergence. + #### 0.24.9 - 2020-06-05 diff --git a/docs/Changelog.rst b/docs/Changelog.rst index df861f95a..6aad1a2ef 100644 --- a/docs/Changelog.rst +++ b/docs/Changelog.rst @@ -1,9 +1,35 @@ Changelog --------- +0.24.10 +^^^^^^^ + +New features +'''''''''''' + +- New improvements when using splines model in CoxPHFitter - it should + offer much better prediction and baseline-hazard estimation, + including extrapolation and interpolation. + +API Changes +''''''''''' + +- Related to above: the fitted spline parameters are now available in + the ``.summary`` and ``.print_summary`` methods. + +Bug fixes +''''''''' + +- fixed a bug in initialization of some interval-censoring models -> + better convergence. + +.. _section-1: + 0.24.9 - 2020-06-05 ^^^^^^^^^^^^^^^^^^^ +.. _new-features-1: + New features '''''''''''' @@ -12,18 +38,20 @@ New features ``tarone-ware``, ``peto``, ``fleming-harrington``. Thanks @sean-reed - new interval censored dataset: ``lifelines.datasets.load_mice`` +.. _bug-fixes-1: + Bug fixes ''''''''' - Cleared up some mislabeling in ``plot_loglogs``. Thanks @sean-reed! - tuples are now able to be used as input in univariate models. -.. _section-1: +.. _section-2: 0.24.8 - 2020-05-17 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-1: +.. _new-features-2: New features '''''''''''' @@ -32,12 +60,12 @@ New features Not all edge cases are fully checked, and some features are missing. Try it under ``KaplanMeierFitter.fit_interval_censoring`` -.. _section-2: +.. _section-3: 0.24.7 - 2020-05-17 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-2: +.. _new-features-3: New features '''''''''''' @@ -53,12 +81,12 @@ New features - some convergence tweaks which should help recent performance regressions. -.. _section-3: +.. _section-4: 0.24.6 - 2020-05-05 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-3: +.. _new-features-4: New features '''''''''''' @@ -68,7 +96,7 @@ New features - New ``lifelines.plotting.plot_interval_censored_lifetimes`` for plotting interval censored data - thanks @sean-reed! -.. _bug-fixes-1: +.. _bug-fixes-2: Bug fixes ''''''''' @@ -76,19 +104,19 @@ Bug fixes - fixed bug where ``cdf_plot`` and ``qq_plot`` were not factoring in the weights correctly. -.. _section-4: +.. _section-5: 0.24.5 - 2020-05-01 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-4: +.. _new-features-5: New features '''''''''''' - ``plot_lifetimes`` accepts pandas Series. -.. _bug-fixes-2: +.. _bug-fixes-3: Bug fixes ''''''''' @@ -98,12 +126,12 @@ Bug fixes - Improved ``at_risk_counts`` for subplots. - More data validation checks for ``CoxTimeVaryingFitter`` -.. _section-5: +.. _section-6: 0.24.4 - 2020-04-13 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-3: +.. _bug-fixes-4: Bug fixes ''''''''' @@ -112,12 +140,12 @@ Bug fixes - setting a dataframe in ``ancillary_df`` works for interval censoring - ``.score`` works for interval censored models -.. _section-6: +.. _section-7: 0.24.3 - 2020-03-25 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-5: +.. _new-features-6: New features '''''''''''' @@ -127,7 +155,7 @@ New features the hazard ratio would be at previous times. This is useful because the final hazard ratio is some weighted average of these. -.. _bug-fixes-4: +.. _bug-fixes-5: Bug fixes ''''''''' @@ -135,12 +163,12 @@ Bug fixes - Fixed error in HTML printer that was hiding concordance index information. -.. _section-7: +.. _section-8: 0.24.2 - 2020-03-15 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-5: +.. _bug-fixes-6: Bug fixes ''''''''' @@ -152,12 +180,12 @@ Bug fixes - Fixed a keyword bug in ``plot_covariate_groups`` for parametric models. -.. _section-8: +.. _section-9: 0.24.1 - 2020-03-05 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-6: +.. _new-features-7: New features '''''''''''' @@ -165,14 +193,14 @@ New features - Stability improvements for GeneralizedGammaRegressionFitter and CoxPHFitter with spline estimation. -.. _bug-fixes-6: +.. _bug-fixes-7: Bug fixes ''''''''' - Fixed bug with plotting hazards in NelsonAalenFitter. -.. _section-9: +.. _section-10: 0.24.0 - 2020-02-20 ^^^^^^^^^^^^^^^^^^^ @@ -181,7 +209,7 @@ This version and future versions of lifelines no longer support py35. Pandas 1.0 is fully supported, along with previous versions. Minimum Scipy has been bumped to 1.2.0. -.. _new-features-7: +.. _new-features-8: New features '''''''''''' @@ -205,6 +233,8 @@ New features - new ``lifelines.fitters.mixins.ProportionalHazardMixin`` that implements proportional hazard checks. +.. _api-changes-1: + API Changes ''''''''''' @@ -231,7 +261,7 @@ API Changes to ``scoring_method``. - removed ``_score_`` and ``path`` from Cox model. -.. _bug-fixes-7: +.. _bug-fixes-8: Bug fixes ''''''''' @@ -244,12 +274,12 @@ Bug fixes - Cox models now incorporate any penalizers in their ``log_likelihood_`` -.. _section-10: +.. _section-11: 0.23.9 - 2020-01-28 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-8: +.. _bug-fixes-9: Bug fixes ''''''''' @@ -260,12 +290,12 @@ Bug fixes of ``GeneralizedGammaRegressionFitter`` and any custom regression models should update their code as soon as possible. -.. _section-11: +.. _section-12: 0.23.8 - 2020-01-21 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-9: +.. _bug-fixes-10: Bug fixes ''''''''' @@ -276,19 +306,19 @@ Bug fixes ``GeneralizedGammaRegressionFitter`` and any custom regression models should update their code as soon as possible. -.. _section-12: +.. _section-13: 0.23.7 - 2020-01-14 ^^^^^^^^^^^^^^^^^^^ Bug fixes for py3.5. -.. _section-13: +.. _section-14: 0.23.6 - 2020-01-07 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-8: +.. _new-features-9: New features '''''''''''' @@ -302,12 +332,12 @@ New features - custom parametric regression models can now do left and interval censoring. -.. _section-14: +.. _section-15: 0.23.5 - 2020-01-05 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-9: +.. _new-features-10: New features '''''''''''' @@ -316,7 +346,7 @@ New features - New lymph node cancer dataset, originally from *H.F. for the German Breast Cancer Study Group (GBSG) (1994)* -.. _bug-fixes-10: +.. _bug-fixes-11: Bug fixes ''''''''' @@ -326,26 +356,26 @@ Bug fixes - fixed bug where large exponential numbers in ``print_summary`` were not being suppressed correctly. -.. _section-15: +.. _section-16: 0.23.4 - 2019-12-15 ^^^^^^^^^^^^^^^^^^^ - Bug fix for PyPI -.. _section-16: +.. _section-17: 0.23.3 - 2019-12-11 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-10: +.. _new-features-11: New features '''''''''''' - ``StatisticalResult.print_summary`` supports html output. -.. _bug-fixes-11: +.. _bug-fixes-12: Bug fixes ''''''''' @@ -353,12 +383,12 @@ Bug fixes - fix import in ``printer.py`` - fix html printing with Univariate models. -.. _section-17: +.. _section-18: 0.23.2 - 2019-12-07 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-11: +.. _new-features-12: New features '''''''''''' @@ -370,7 +400,7 @@ New features - performance improvements on regression models’ preprocessing. Should make datasets with high number of columns more performant. -.. _bug-fixes-12: +.. _bug-fixes-13: Bug fixes ''''''''' @@ -379,12 +409,12 @@ Bug fixes - fixed repr for ``sklearn_adapter`` classes. - fixed ``conditional_after`` in Cox model with strata was used. -.. _section-18: +.. _section-19: 0.23.1 - 2019-11-27 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-12: +.. _new-features-13: New features '''''''''''' @@ -394,7 +424,7 @@ New features - performance improvements for ``CoxPHFitter`` - up to 30% performance improvements for some datasets. -.. _bug-fixes-13: +.. _bug-fixes-14: Bug fixes ''''''''' @@ -406,12 +436,12 @@ Bug fixes - fixed bug when using ``print_summary`` with left censored models. - lots of minor bug fixes. -.. _section-19: +.. _section-20: 0.23.0 - 2019-11-17 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-13: +.. _new-features-14: New features '''''''''''' @@ -420,7 +450,7 @@ New features Jupyter notebooks! - silenced some warnings. -.. _bug-fixes-14: +.. _bug-fixes-15: Bug fixes ''''''''' @@ -430,7 +460,7 @@ Bug fixes now fixed. - fixed a NaN error in confidence intervals for KaplanMeierFitter -.. _api-changes-1: +.. _api-changes-2: API Changes ''''''''''' @@ -442,7 +472,7 @@ API Changes - ``left_censorship`` in ``fit`` has been removed in favour of ``fit_left_censoring``. -.. _section-20: +.. _section-21: 0.22.10 - 2019-11-08 ^^^^^^^^^^^^^^^^^^^^ @@ -450,7 +480,7 @@ API Changes The tests were re-factored to be shipped with the package. Let me know if this causes problems. -.. _bug-fixes-15: +.. _bug-fixes-16: Bug fixes ''''''''' @@ -460,12 +490,12 @@ Bug fixes - fixed bug in plot_covariate_groups for AFT models when >1d arrays were used for values arg. -.. _section-21: +.. _section-22: 0.22.9 - 2019-10-30 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-16: +.. _bug-fixes-17: Bug fixes ''''''''' @@ -477,12 +507,12 @@ Bug fixes - ``CoxPHFitter`` now displays correct columns values when changing alpha param. -.. _section-22: +.. _section-23: 0.22.8 - 2019-10-06 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-14: +.. _new-features-15: New features '''''''''''' @@ -492,19 +522,19 @@ New features - ``conditional_after`` now available in ``CoxPHFitter.predict_median`` - Suppressed some unimportant warnings. -.. _bug-fixes-17: +.. _bug-fixes-18: Bug fixes ''''''''' - fixed initial_point being ignored in AFT models. -.. _section-23: +.. _section-24: 0.22.7 - 2019-09-29 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-15: +.. _new-features-16: New features '''''''''''' @@ -512,7 +542,7 @@ New features - new ``ApproximationWarning`` to tell you if the package is making an potentially mislead approximation. -.. _bug-fixes-18: +.. _bug-fixes-19: Bug fixes ''''''''' @@ -521,7 +551,7 @@ Bug fixes - realigned values in ``print_summary``. - fixed bug in ``survival_difference_at_fixed_point_in_time_test`` -.. _api-changes-2: +.. _api-changes-3: API Changes ''''''''''' @@ -531,24 +561,24 @@ API Changes - Some previous ``StatisticalWarnings`` have been replaced by ``ApproximationWarning`` -.. _section-24: +.. _section-25: 0.22.6 - 2019-09-25 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-16: +.. _new-features-17: New features '''''''''''' - ``conditional_after`` works for ``CoxPHFitter`` prediction models 😅 -.. _bug-fixes-19: +.. _bug-fixes-20: Bug fixes ''''''''' -.. _api-changes-3: +.. _api-changes-4: API Changes ''''''''''' @@ -559,12 +589,12 @@ API Changes - ``utils.dataframe_interpolate_at_times`` renamed to ``utils.interpolate_at_times_and_return_pandas``. -.. _section-25: +.. _section-26: 0.22.5 - 2019-09-20 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-17: +.. _new-features-18: New features '''''''''''' @@ -573,7 +603,7 @@ New features weights. - Better support for predicting on Pandas Series -.. _bug-fixes-20: +.. _bug-fixes-21: Bug fixes ''''''''' @@ -582,7 +612,7 @@ Bug fixes - Fixed an issue with ``AalenJohansenFitter`` failing to plot confidence intervals. -.. _api-changes-4: +.. _api-changes-5: API Changes ''''''''''' @@ -590,12 +620,12 @@ API Changes - ``_get_initial_value`` in parametric univariate models is renamed ``_create_initial_point`` -.. _section-26: +.. _section-27: 0.22.4 - 2019-09-04 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-18: +.. _new-features-19: New features '''''''''''' @@ -606,7 +636,7 @@ New features - new ``utils.restricted_mean_survival_time`` that approximates the RMST using numerical integration against survival functions. -.. _api-changes-5: +.. _api-changes-6: API changes ''''''''''' @@ -614,7 +644,7 @@ API changes - ``KaplanMeierFitter.survival_function_``\ ‘s’ index is no longer given the name “timeline”. -.. _bug-fixes-21: +.. _bug-fixes-22: Bug fixes ''''''''' @@ -622,12 +652,12 @@ Bug fixes - Fixed issue where ``concordance_index`` would never exit if NaNs in dataset. -.. _section-27: +.. _section-28: 0.22.3 - 2019-08-08 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-19: +.. _new-features-20: New features '''''''''''' @@ -640,7 +670,7 @@ New features - smarter initial conditions for parametric regression models. - New regression model: ``GeneralizedGammaRegressionFitter`` -.. _api-changes-6: +.. _api-changes-7: API changes ''''''''''' @@ -651,7 +681,7 @@ API changes gains only in Cox models, and only a small fraction of the API was being used. -.. _bug-fixes-22: +.. _bug-fixes-23: Bug fixes ''''''''' @@ -663,19 +693,19 @@ Bug fixes - Fixed an error in the ``predict_percentile`` of ``LogLogisticAFTFitter``. New tests have been added around this. -.. _section-28: +.. _section-29: 0.22.2 - 2019-07-25 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-20: +.. _new-features-21: New features '''''''''''' - lifelines is now compatible with scipy>=1.3.0 -.. _bug-fixes-23: +.. _bug-fixes-24: Bug fixes ''''''''' @@ -686,12 +716,12 @@ Bug fixes errors when using the library. The correctly numpy has been pinned (to 1.14.0+) -.. _section-29: +.. _section-30: 0.22.1 - 2019-07-14 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-21: +.. _new-features-22: New features '''''''''''' @@ -706,7 +736,7 @@ New features right censoring) - improvements to ``lifelines.utils.gamma`` -.. _api-changes-7: +.. _api-changes-8: API changes ''''''''''' @@ -719,7 +749,7 @@ API changes ``.print_summary`` includes confidence intervals for the exponential of the value. -.. _bug-fixes-24: +.. _bug-fixes-25: Bug fixes ''''''''' @@ -729,12 +759,12 @@ Bug fixes - fixed an overflow bug in ``KaplanMeierFitter`` confidence intervals - improvements in data validation for ``CoxTimeVaryingFitter`` -.. _section-30: +.. _section-31: 0.22.0 - 2019-07-03 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-22: +.. _new-features-23: New features '''''''''''' @@ -746,7 +776,7 @@ New features - for parametric univariate models, the ``conditional_time_to_event_`` is now exact instead of an approximation. -.. _api-changes-8: +.. _api-changes-9: API changes ''''''''''' @@ -768,7 +798,7 @@ API changes could set ``fit_intercept`` to False and not have to set ``ancillary_df`` - now one must specify a DataFrame. -.. _bug-fixes-25: +.. _bug-fixes-26: Bug fixes ''''''''' @@ -777,21 +807,21 @@ Bug fixes is now exact instead of an approximation. - fixed a name error bug in ``CoxTimeVaryingFitter.plot`` -.. _section-31: +.. _section-32: 0.21.5 - 2019-06-22 ^^^^^^^^^^^^^^^^^^^ I’m skipping 0.21.4 version because of deployment issues. -.. _new-features-23: +.. _new-features-24: New features '''''''''''' - ``scoring_method`` now a kwarg on ``sklearn_adapter`` -.. _bug-fixes-26: +.. _bug-fixes-27: Bug fixes ''''''''' @@ -801,12 +831,12 @@ Bug fixes - fixed visual bug that misaligned x-axis ticks and at-risk counts. Thanks @christopherahern! -.. _section-32: +.. _section-33: 0.21.3 - 2019-06-04 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-24: +.. _new-features-25: New features '''''''''''' @@ -820,19 +850,19 @@ New features - ``CoxPHFitter.check_assumptions`` now accepts a ``columns`` parameter to specify only checking a subset of columns. -.. _bug-fixes-27: +.. _bug-fixes-28: Bug fixes ''''''''' - ``covariates_from_event_matrix`` handle nulls better -.. _section-33: +.. _section-34: 0.21.2 - 2019-05-16 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-25: +.. _new-features-26: New features '''''''''''' @@ -844,7 +874,7 @@ New features that computes, you guessed it, the log-likelihood ratio test. Previously this was an internal API that is being exposed. -.. _api-changes-9: +.. _api-changes-10: API changes ''''''''''' @@ -856,17 +886,17 @@ API changes - removing ``_compute_likelihood_ratio_test`` on regression models. Use ``log_likelihood_ratio_test`` now. -.. _bug-fixes-28: +.. _bug-fixes-29: Bug fixes ''''''''' -.. _section-34: +.. _section-35: 0.21.1 - 2019-04-26 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-26: +.. _new-features-27: New features '''''''''''' @@ -875,7 +905,7 @@ New features ``add_covariate_to_timeline`` - PiecewiseExponentialFitter now allows numpy arrays as breakpoints -.. _api-changes-10: +.. _api-changes-11: API changes ''''''''''' @@ -883,19 +913,19 @@ API changes - output of ``survival_table_from_events`` when collapsing rows to intervals now removes the “aggregate” column multi-index. -.. _bug-fixes-29: +.. _bug-fixes-30: Bug fixes ''''''''' - fixed bug in CoxTimeVaryingFitter when ax is provided, thanks @j-i-l! -.. _section-35: +.. _section-36: 0.21.0 - 2019-04-12 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-27: +.. _new-features-28: New features '''''''''''' @@ -909,7 +939,7 @@ New features - a new interval censored dataset is available under ``lifelines.datasets.load_diabetes`` -.. _api-changes-11: +.. _api-changes-12: API changes ''''''''''' @@ -920,7 +950,7 @@ API changes - ``entries`` property in multivariate parametric models has a new Series name: ``entry`` -.. _bug-fixes-30: +.. _bug-fixes-31: Bug fixes ''''''''' @@ -930,19 +960,19 @@ Bug fixes - Fixed an error that didn’t let users use Numpy arrays in prediction for AFT models -.. _section-36: +.. _section-37: 0.20.5 - 2019-04-08 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-28: +.. _new-features-29: New features '''''''''''' - performance improvements for ``print_summary``. -.. _api-changes-12: +.. _api-changes-13: API changes ''''''''''' @@ -952,7 +982,7 @@ API changes - in ``AalenJohansenFitter``, the ``variance`` parameter is renamed to ``variance_`` to align with the usual lifelines convention. -.. _bug-fixes-31: +.. _bug-fixes-32: Bug fixes ''''''''' @@ -961,12 +991,12 @@ Bug fixes test when using strata. - Fixed some plotting bugs with ``AalenJohansenFitter`` -.. _section-37: +.. _section-38: 0.20.4 - 2019-03-27 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-29: +.. _new-features-30: New features '''''''''''' @@ -977,7 +1007,7 @@ New features generating piecewise exp. data - Faster ``print_summary`` for AFT models. -.. _api-changes-13: +.. _api-changes-14: API changes ''''''''''' @@ -985,7 +1015,7 @@ API changes - Pandas is now correctly pinned to >= 0.23.0. This was always the case, but not specified in setup.py correctly. -.. _bug-fixes-32: +.. _bug-fixes-33: Bug fixes ''''''''' @@ -994,12 +1024,12 @@ Bug fixes - ``PiecewiseExponentialFitter`` is available with ``from lifelines import *``. -.. _section-38: +.. _section-39: 0.20.3 - 2019-03-23 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-30: +.. _new-features-31: New features '''''''''''' @@ -1012,12 +1042,12 @@ New features ``plot_survival_function`` and ``confidence_interval_survival_function_``. -.. _section-39: +.. _section-40: 0.20.2 - 2019-03-21 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-31: +.. _new-features-32: New features '''''''''''' @@ -1032,7 +1062,7 @@ New features - add a ``lifelines.plotting.qq_plot`` for univariate parametric models that handles censored data. -.. _api-changes-14: +.. _api-changes-15: API changes ''''''''''' @@ -1041,7 +1071,7 @@ API changes @vpolimenov! - The ``C`` column in ``load_lcd`` dataset is renamed to ``E``. -.. _bug-fixes-33: +.. _bug-fixes-34: Bug fixes ''''''''' @@ -1057,7 +1087,7 @@ Bug fixes the q parameter was below the truncation limit. This should have been ``-np.inf`` -.. _section-40: +.. _section-41: 0.20.1 - 2019-03-16 ^^^^^^^^^^^^^^^^^^^ @@ -1071,7 +1101,7 @@ Bug fixes decades of development. - suppressed unimportant warnings -.. _api-changes-15: +.. _api-changes-16: API changes ''''''''''' @@ -1081,7 +1111,7 @@ API changes This is no longer the case. A 0 will still be added if there is a duration (observed or not) at 0 occurs however. -.. _section-41: +.. _section-42: 0.20.0 - 2019-03-05 ^^^^^^^^^^^^^^^^^^^ @@ -1090,7 +1120,7 @@ API changes recent installs where Py3. - Updated minimum dependencies, specifically Matplotlib and Pandas. -.. _new-features-32: +.. _new-features-33: New features '''''''''''' @@ -1098,7 +1128,7 @@ New features - smarter initialization for AFT models which should improve convergence. -.. _api-changes-16: +.. _api-changes-17: API changes ''''''''''' @@ -1110,19 +1140,19 @@ API changes transposed now (previous parameters where columns, now parameters are rows). -.. _bug-fixes-34: +.. _bug-fixes-35: Bug fixes ''''''''' - Fixed a bug with plotting and ``check_assumptions``. -.. _section-42: +.. _section-43: 0.19.5 - 2019-02-26 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-33: +.. _new-features-34: New features '''''''''''' @@ -1132,24 +1162,24 @@ New features features or categorical variables. - Convergence improvements for AFT models. -.. _section-43: +.. _section-44: 0.19.4 - 2019-02-25 ^^^^^^^^^^^^^^^^^^^ -.. _bug-fixes-35: +.. _bug-fixes-36: Bug fixes ''''''''' - remove some bad print statements in ``CoxPHFitter``. -.. _section-44: +.. _section-45: 0.19.3 - 2019-02-25 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-34: +.. _new-features-35: New features '''''''''''' @@ -1161,12 +1191,12 @@ New features - Performance increase to ``print_summary`` in the ``CoxPHFitter`` and ``CoxTimeVaryingFitter`` model. -.. _section-45: +.. _section-46: 0.19.2 - 2019-02-22 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-35: +.. _new-features-36: New features '''''''''''' @@ -1174,7 +1204,7 @@ New features - ``ParametricUnivariateFitters``, like ``WeibullFitter``, have smoothed plots when plotting (vs stepped plots) -.. _bug-fixes-36: +.. _bug-fixes-37: Bug fixes ''''''''' @@ -1184,12 +1214,12 @@ Bug fixes - Univariate fitters are more flexiable and can allow 2-d and DataFrames as inputs. -.. _section-46: +.. _section-47: 0.19.1 - 2019-02-21 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-36: +.. _new-features-37: New features '''''''''''' @@ -1197,7 +1227,7 @@ New features - improved stability of ``LogNormalFitter`` - Matplotlib for Python3 users are not longer forced to use 2.x. -.. _api-changes-17: +.. _api-changes-18: API changes ''''''''''' @@ -1206,12 +1236,12 @@ API changes ``PiecewiseExponential`` to the same as ``ExponentialFitter`` (from ``\lambda * t`` to ``t / \lambda``). -.. _section-47: +.. _section-48: 0.19.0 - 2019-02-20 ^^^^^^^^^^^^^^^^^^^ -.. _new-features-37: +.. _new-features-38: New features '''''''''''' @@ -1224,7 +1254,7 @@ New features - ``CoxPHFitter`` performance improvements (about 10%) - ``CoxTimeVaryingFitter`` performance improvements (about 10%) -.. _api-changes-18: +.. _api-changes-19: API changes ''''''''''' @@ -1250,7 +1280,7 @@ API changes means that the *default* for alpha is set to 0.05 in the latest lifelines, instead of 0.95 in previous versions. -.. _bug-fixes-37: +.. _bug-fixes-38: Bug Fixes ''''''''' @@ -1267,7 +1297,7 @@ Bug Fixes models. Thanks @airanmehr! - Fixed some Pandas <0.24 bugs. -.. _section-48: +.. _section-49: 0.18.6 - 2019-02-13 ^^^^^^^^^^^^^^^^^^^ @@ -1277,7 +1307,7 @@ Bug Fixes ``rank`` and ``km`` p-values now. - some performance improvements to ``qth_survival_time``. -.. _section-49: +.. _section-50: 0.18.5 - 2019-02-11 ^^^^^^^^^^^^^^^^^^^ @@ -1298,7 +1328,7 @@ Bug Fixes that can be used to turn off variance calculations since this can take a long time for large datasets. Thanks @pzivich! -.. _section-50: +.. _section-51: 0.18.4 - 2019-02-10 ^^^^^^^^^^^^^^^^^^^ @@ -1308,7 +1338,7 @@ Bug Fixes - adding left-truncation support to parametric univarite models with the ``entry`` kwarg in ``.fit`` -.. _section-51: +.. _section-52: 0.18.3 - 2019-02-07 ^^^^^^^^^^^^^^^^^^^ @@ -1318,7 +1348,7 @@ Bug Fixes warnings are more noticeable. - Improved some warning and error messages. -.. _section-52: +.. _section-53: 0.18.2 - 2019-02-05 ^^^^^^^^^^^^^^^^^^^ @@ -1334,7 +1364,7 @@ Bug Fixes Moved them all (most) to use ``autograd``. - ``LogNormalFitter`` no longer models ``log_sigma``. -.. _section-53: +.. _section-54: 0.18.1 - 2019-02-02 ^^^^^^^^^^^^^^^^^^^ @@ -1345,7 +1375,7 @@ Bug Fixes - use the ``autograd`` lib to help with gradients. - New ``LogLogisticFitter`` univariate fitter available. -.. _section-54: +.. _section-55: 0.18.0 - 2019-01-31 ^^^^^^^^^^^^^^^^^^^ @@ -1382,7 +1412,7 @@ Bug Fixes ``LinAlgError: Matrix is singular.`` and report back to the user advice. -.. _section-55: +.. _section-56: 0.17.5 - 2019-01-25 ^^^^^^^^^^^^^^^^^^^ @@ -1390,7 +1420,7 @@ Bug Fixes - more bugs in ``plot_covariate_groups`` fixed when using non-numeric strata. -.. _section-56: +.. _section-57: 0.17.4 -2019-01-25 ^^^^^^^^^^^^^^^^^^ @@ -1402,7 +1432,7 @@ Bug Fixes - ``groups`` is now called ``values`` in ``CoxPHFitter.plot_covariate_groups`` -.. _section-57: +.. _section-58: 0.17.3 - 2019-01-24 ^^^^^^^^^^^^^^^^^^^ @@ -1410,7 +1440,7 @@ Bug Fixes - Fix in ``compute_residuals`` when using ``schoenfeld`` and the minumum duration has only censored subjects. -.. _section-58: +.. _section-59: 0.17.2 2019-01-22 ^^^^^^^^^^^^^^^^^ @@ -1421,7 +1451,7 @@ Bug Fixes ``for`` loop. The downside is the code is more esoteric now. I’ve added comments as necessary though 🤞 -.. _section-59: +.. _section-60: 0.17.1 - 2019-01-20 ^^^^^^^^^^^^^^^^^^^ @@ -1438,7 +1468,7 @@ Bug Fixes - Fixes a Pandas performance warning in ``CoxTimeVaryingFitter``. - Performances improvements to ``CoxTimeVaryingFitter``. -.. _section-60: +.. _section-61: 0.17.0 - 2019-01-11 ^^^^^^^^^^^^^^^^^^^ @@ -1459,7 +1489,7 @@ Bug Fixes - some plotting improvemnts to ``plotting.plot_lifetimes`` -.. _section-61: +.. _section-62: 0.16.3 - 2019-01-03 ^^^^^^^^^^^^^^^^^^^ @@ -1467,7 +1497,7 @@ Bug Fixes - More ``CoxPHFitter`` performance improvements. Up to a 40% reduction vs 0.16.2 for some datasets. -.. _section-62: +.. _section-63: 0.16.2 - 2019-01-02 ^^^^^^^^^^^^^^^^^^^ @@ -1478,14 +1508,14 @@ Bug Fixes has lots of duplicate times. See https://github.com/CamDavidsonPilon/lifelines/issues/591 -.. _section-63: +.. _section-64: 0.16.1 - 2019-01-01 ^^^^^^^^^^^^^^^^^^^ - Fixed py2 division error in ``concordance`` method. -.. _section-64: +.. _section-65: 0.16.0 - 2019-01-01 ^^^^^^^^^^^^^^^^^^^ @@ -1521,7 +1551,7 @@ Bug Fixes ``lifelines.utils.to_episodic_format``. - ``CoxTimeVaryingFitter`` now accepts ``strata``. -.. _section-65: +.. _section-66: 0.15.4 ^^^^^^ @@ -1529,14 +1559,14 @@ Bug Fixes - bug fix for the Cox model likelihood ratio test when using non-trivial weights. -.. _section-66: +.. _section-67: 0.15.3 - 2018-12-18 ^^^^^^^^^^^^^^^^^^^ - Only allow matplotlib less than 3.0. -.. _section-67: +.. _section-68: 0.15.2 - 2018-11-23 ^^^^^^^^^^^^^^^^^^^ @@ -1547,7 +1577,7 @@ Bug Fixes - removed ``entry`` from ``ExponentialFitter`` and ``WeibullFitter`` as it was doing nothing. -.. _section-68: +.. _section-69: 0.15.1 - 2018-11-23 ^^^^^^^^^^^^^^^^^^^ @@ -1556,7 +1586,7 @@ Bug Fixes - Raise NotImplementedError if the ``robust`` flag is used in ``CoxTimeVaryingFitter`` - that’s not ready yet. -.. _section-69: +.. _section-70: 0.15.0 - 2018-11-22 ^^^^^^^^^^^^^^^^^^^ @@ -1627,7 +1657,7 @@ Bug Fixes When Estimating Risks in Pharmacoepidemiology” for a nice overview of the model. -.. _section-70: +.. _section-71: 0.14.6 - 2018-07-02 ^^^^^^^^^^^^^^^^^^^ @@ -1635,7 +1665,7 @@ Bug Fixes - fix for n > 2 groups in ``multivariate_logrank_test`` (again). - fix bug for when ``event_observed`` column was not boolean. -.. _section-71: +.. _section-72: 0.14.5 - 2018-06-29 ^^^^^^^^^^^^^^^^^^^ @@ -1643,7 +1673,7 @@ Bug Fixes - fix for n > 2 groups in ``multivariate_logrank_test`` - fix weights in KaplanMeierFitter when using a pandas Series. -.. _section-72: +.. _section-73: 0.14.4 - 2018-06-14 ^^^^^^^^^^^^^^^^^^^ @@ -1660,7 +1690,7 @@ Bug Fixes - New ``delay`` parameter in ``add_covariate_to_timeline`` - removed ``two_sided_z_test`` from ``statistics`` -.. _section-73: +.. _section-74: 0.14.3 - 2018-05-24 ^^^^^^^^^^^^^^^^^^^ @@ -1672,7 +1702,7 @@ Bug Fixes - adds a ``column`` argument to ``CoxTimeVaryingFitter`` and ``CoxPHFitter`` ``plot`` method to plot only a subset of columns. -.. _section-74: +.. _section-75: 0.14.2 - 2018-05-18 ^^^^^^^^^^^^^^^^^^^ @@ -1680,7 +1710,7 @@ Bug Fixes - some quality of life improvements for working with ``CoxTimeVaryingFitter`` including new ``predict_`` methods. -.. _section-75: +.. _section-76: 0.14.1 - 2018-04-01 ^^^^^^^^^^^^^^^^^^^ @@ -1698,7 +1728,7 @@ Bug Fixes faster completion of ``fit`` for large dataframes, and up to 10% faster for small dataframes. -.. _section-76: +.. _section-77: 0.14.0 - 2018-03-03 ^^^^^^^^^^^^^^^^^^^ @@ -1720,7 +1750,7 @@ Bug Fixes of a ``RuntimeWarning`` - New checks for complete separation in the dataset for regressions. -.. _section-77: +.. _section-78: 0.13.0 - 2017-12-22 ^^^^^^^^^^^^^^^^^^^ @@ -1749,7 +1779,7 @@ Bug Fixes group the same subjects together and give that observation a weight equal to the count. Altogether, this means a much faster regression. -.. _section-78: +.. _section-79: 0.12.0 ^^^^^^ @@ -1766,7 +1796,7 @@ Bug Fixes - Additional functionality to ``utils.survival_table_from_events`` to bin the index to make the resulting table more readable. -.. _section-79: +.. _section-80: 0.11.3 ^^^^^^ @@ -1778,7 +1808,7 @@ Bug Fixes observation or censorship. - More accurate prediction methods parametrics univariate models. -.. _section-80: +.. _section-81: 0.11.2 ^^^^^^ @@ -1786,14 +1816,14 @@ Bug Fixes - Changing liscense to valilla MIT. - Speed up ``NelsonAalenFitter.fit`` considerably. -.. _section-81: +.. _section-82: 0.11.1 - 2017-06-22 ^^^^^^^^^^^^^^^^^^^ - Python3 fix for ``CoxPHFitter.plot``. -.. _section-82: +.. _section-83: 0.11.0 - 2017-06-21 ^^^^^^^^^^^^^^^^^^^ @@ -1807,14 +1837,14 @@ Bug Fixes of a new ``loc`` kwarg. This is to align with Pandas deprecating ``ix`` -.. _section-83: +.. _section-84: 0.10.1 - 2017-06-05 ^^^^^^^^^^^^^^^^^^^ - fix in internal normalization for ``CoxPHFitter`` predict methods. -.. _section-84: +.. _section-85: 0.10.0 ^^^^^^ @@ -1829,7 +1859,7 @@ Bug Fixes mimic R’s ``basehaz`` API. - new ``predict_log_partial_hazards`` to ``CoxPHFitter`` -.. _section-85: +.. _section-86: 0.9.4 ^^^^^ @@ -1852,7 +1882,7 @@ Bug Fixes - performance improvements in ``CoxPHFitter`` - should see at least a 10% speed improvement in ``fit``. -.. _section-86: +.. _section-87: 0.9.2 ^^^^^ @@ -1861,7 +1891,7 @@ Bug Fixes - throw an error if no admissable pairs in the c-index calculation. Previously a NaN was returned. -.. _section-87: +.. _section-88: 0.9.1 ^^^^^ @@ -1869,7 +1899,7 @@ Bug Fixes - add two summary functions to Weibull and Exponential fitter, solves #224 -.. _section-88: +.. _section-89: 0.9.0 ^^^^^ @@ -1885,7 +1915,7 @@ Bug Fixes - Default predict method in ``k_fold_cross_validation`` is now ``predict_expectation`` -.. _section-89: +.. _section-90: 0.8.1 - 2015-08-01 ^^^^^^^^^^^^^^^^^^ @@ -1902,7 +1932,7 @@ Bug Fixes - scaling of smooth hazards in NelsonAalenFitter was off by a factor of 0.5. -.. _section-90: +.. _section-91: 0.8.0 ^^^^^ @@ -1921,7 +1951,7 @@ Bug Fixes ``lifelines.statistics. power_under_cph``. - fixed a bug when using KaplanMeierFitter for left-censored data. -.. _section-91: +.. _section-92: 0.7.1 ^^^^^ @@ -1940,7 +1970,7 @@ Bug Fixes - refactor each fitter into it’s own submodule. For now, the tests are still in the same file. This will also *not* break the API. -.. _section-92: +.. _section-93: 0.7.0 - 2015-03-01 ^^^^^^^^^^^^^^^^^^ @@ -1959,7 +1989,7 @@ Bug Fixes duration remaining until the death event, given survival up until time t. -.. _section-93: +.. _section-94: 0.6.1 ^^^^^ @@ -1971,7 +2001,7 @@ Bug Fixes your work is to sum up the survival function (for expected values or something similar), it’s more difficult to make a mistake. -.. _section-94: +.. _section-95: 0.6.0 - 2015-02-04 ^^^^^^^^^^^^^^^^^^ @@ -1994,7 +2024,7 @@ Bug Fixes - In ``KaplanMeierFitter``, ``epsilon`` has been renamed to ``precision``. -.. _section-95: +.. _section-96: 0.5.1 - 2014-12-24 ^^^^^^^^^^^^^^^^^^ @@ -2015,7 +2045,7 @@ Bug Fixes ``lifelines.plotting.add_at_risk_counts``. - Fix bug Epanechnikov kernel. -.. _section-96: +.. _section-97: 0.5.0 - 2014-12-07 ^^^^^^^^^^^^^^^^^^ @@ -2028,7 +2058,7 @@ Bug Fixes - add test for summary() - Alternate metrics can be used for ``k_fold_cross_validation``. -.. _section-97: +.. _section-98: 0.4.4 - 2014-11-27 ^^^^^^^^^^^^^^^^^^ @@ -2040,7 +2070,7 @@ Bug Fixes - Fixes bug in 1-d input not returning in CoxPHFitter - Lots of new tests. -.. _section-98: +.. _section-99: 0.4.3 - 2014-07-23 ^^^^^^^^^^^^^^^^^^ @@ -2061,7 +2091,7 @@ Bug Fixes - Adds option ``include_likelihood`` to CoxPHFitter fit method to save the final log-likelihood value. -.. _section-99: +.. _section-100: 0.4.2 - 2014-06-19 ^^^^^^^^^^^^^^^^^^ @@ -2081,7 +2111,7 @@ Bug Fixes from failing so often (this a stop-gap) - pep8 everything -.. _section-100: +.. _section-101: 0.4.1.1 ^^^^^^^ @@ -2094,7 +2124,7 @@ Bug Fixes - Adding more robust cross validation scheme based on issue #67. - fixing ``regression_dataset`` in ``datasets``. -.. _section-101: +.. _section-102: 0.4.1 - 2014-06-11 ^^^^^^^^^^^^^^^^^^ @@ -2113,7 +2143,7 @@ Bug Fixes - Adding a Changelog. - more sanitizing for the statistical tests =) -.. _section-102: +.. _section-103: 0.4.0 - 2014-06-08 ^^^^^^^^^^^^^^^^^^ diff --git a/docs/Compatibility with scikit-learn.rst b/docs/Compatibility with scikit-learn.rst index 0fa96ce15..ff483e25b 100644 --- a/docs/Compatibility with scikit-learn.rst +++ b/docs/Compatibility with scikit-learn.rst @@ -88,3 +88,21 @@ The wrapped classes can even be used in more complex scikit-learn functions (ex: .. note:: The :func:`lifelines.utils.sklearn_adapter` is currently only designed to work with right-censored data. + +Serialization +--------------------- + +A note on saving these models: saving can be done with any serialization library, but to load them in a different script / program, you may need to recreate the class (this is a consequence of the implementation). Ex: + + + +.. code:: python + + # needed to reload + from lifelines.utils.sklearn_adapter import sklearn_adapter + from lifelines import CoxPHFitter + sklearn_adapter(CoxPHFitter, event_col='arrest') + + from joblib import load + + model = load(...) diff --git a/docs/index.rst b/docs/index.rst index 027c56cf3..ed25fdd16 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -55,7 +55,7 @@ Contents: :caption: Additional documentation Examples - References + references .. toctree:: :maxdepth: 1 diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py index 3383afaa7..b1ddb5100 100644 --- a/lifelines/fitters/__init__.py +++ b/lifelines/fitters/__init__.py @@ -1170,6 +1170,7 @@ class KnownModelParametricUnivariateFitter(ParametricUnivariateFitter): class RegressionFitter(BaseFitter): _KNOWN_MODEL = False + _FAST_MEDIAN_PREDICT = False _ALLOWED_RESIDUALS = {"schoenfeld", "score", "delta_beta", "deviance", "martingale", "scaled_schoenfeld"} def __init__(self, *args, **kwargs): @@ -1195,39 +1196,6 @@ def compute_residuals(self, training_dataframe: pd.DataFrame, kind: str) -> pd.D resids = getattr(self, "_compute_%s" % kind)(X, Ts, E, weights, index=shuffled_original_index) return resids - def _preprocess_dataframe(self, training_dataframe): - raise NotImplementedError() - - def _compute_schoenfeld(self, X: pd.DataFrame, T: pd.Series, E: pd.Series, weights: pd.Series, index: pd.Index): - raise NotImplementedError() - - def _compute_score(self, X, T, E, weights, index=None): - raise NotImplementedError() - - def _compute_delta_beta(self, X, T, E, weights, index=None): - raise NotImplementedError() - - def _compute_deviance(self, X, T, E, weights, index=None): - raise NotImplementedError() - - def _compute_martingale(self, X, T, E, weights, index=None): - raise NotImplementedError() - - def _compute_scale_schoenfeld(self, X, T, E, weights, index=None): - raise NotImplementedError() - - def score(self, df: pd.DataFrame, scoring_method: str = "log_likelihood") -> float: - raise NotImplementedError() - - def predict_percentile(self, df, *, p=0.5, conditional_after=None) -> pd.Series: - raise NotImplementedError() - - def predict_median(self, df, conditional_after=None) -> pd.Series: - raise NotImplementedError() - - def predict_expectation(self, df, conditional_after=None) -> pd.Series: - raise NotImplementedError() - class SemiParametricRegressionFittter(RegressionFitter): @property @@ -1749,7 +1717,8 @@ def _fit( Ts, E.values, weights.values, entries.values, self._create_Xs_dict(df) ) self.confidence_intervals_ = self._compute_confidence_intervals() - self._predicted_median = self.predict_median(df) + if self._FAST_MEDIAN_PREDICT: + self._predicted_median = self.predict_median(df) return self def _create_initial_point(self, Ts, E, entries, weights, Xs) -> Union[List[Dict], Dict]: @@ -2133,7 +2102,7 @@ def print_summary(self, decimals=2, style=None, **kwargs): sr = self.log_likelihood_ratio_test() footers = [] - if utils.CensoringType.is_right_censoring(self): + if utils.CensoringType.is_right_censoring(self) and self._FAST_MEDIAN_PREDICT: footers.append(("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals))) footers.extend( @@ -2143,7 +2112,7 @@ def print_summary(self, decimals=2, style=None, **kwargs): "log-likelihood ratio test", "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), ), - ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), + ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-utils.safe_log2(sr.p_value), prec=decimals)), ] ) @@ -2251,6 +2220,8 @@ def predict_cumulative_hazard(self, df, *, times=None, conditional_after=None): df = df.to_frame().T df = self._filter_dataframe_to_covariates(df).copy().astype(float) + + # TODO: where does self.timeline come from? times = utils.coalesce(times, self.timeline) times = np.atleast_1d(times).astype(float) @@ -2536,6 +2507,7 @@ def AIC_(self) -> float: class ParametericAFTRegressionFitter(ParametricRegressionFitter): _KNOWN_MODEL = True + _FAST_MEDIAN_PREDICT = True _primary_parameter_name: str _ancillary_parameter_name: str diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index 3f5fbf57c..cd57b151e 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -41,6 +41,7 @@ string_justify, coalesce, ) +from lifelines import utils __all__ = ["CoxTimeVaryingFitter"] @@ -297,7 +298,7 @@ def summary(self): df["exp(coef) upper %g%%" % ci] = self.hazard_ratios_ * np.exp(z * self.standard_errors_) df["z"] = self._compute_z_values() df["p"] = self._compute_p_values() - df["-log2(p)"] = -np.log2(df["p"]) + df["-log2(p)"] = -utils.safe_log2(df["p"]) return df def _newton_rhaphson( @@ -675,7 +676,7 @@ def print_summary(self, decimals=2, style=None, **kwargs): "log-likelihood ratio test", "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), ), - ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), + ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-utils.safe_log2(sr.p_value), prec=decimals)), ] ) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 5c56ae25e..17f75b6f0 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -22,117 +22,406 @@ from lifelines.utils.printer import Printer from lifelines.utils.safe_exp import safe_exp from lifelines.utils.concordance import _concordance_summary_statistics, _concordance_ratio, concordance_index -from lifelines.utils import ( - _get_index, - _to_list, - _to_tuple, - _to_1d_array, - inv_normal_cdf, - normalize, - qth_survival_times, - coalesce, - check_for_numeric_dtypes_or_raise, - check_low_var, - check_complete_separation, - check_nans_or_infs, - StatError, - ConvergenceWarning, - StatisticalWarning, - StepSizer, - ConvergenceError, - string_justify, - interpolate_at_times_and_return_pandas, - CensoringType, - interpolate_at_times, - format_p_value, -) +from lifelines import utils __all__ = ["CoxPHFitter"] -CONVERGENCE_DOCS = "Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model" +class CoxPHFitter(RegressionFitter, ProportionalHazardMixin): + r""" + This class implements fitting Cox's proportional hazard model using Efron's method for ties. -class _PHSplineFitter(ParametricRegressionFitter, SplineFitterMixin, ProportionalHazardMixin): - """ - Proportional Hazard model with cubic splines. + .. math:: h(t|x) = h_0(t) \exp((x - \overline{x})' \beta) - References - ------------ - Royston, P., & Parmar, M. K. B. (2002). Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects. Statistics in Medicine, 21(15), 2175–2197. doi:10.1002/sim.1203  + The baseline hazard, :math:`h_0(t)` can be modeled non-parametrically (using Breslow's method) or parametrically (using cubic splines). + This is specified using the ``baseline_estimation_method`` parameter. + + Parameters + ---------- + + alpha: float, optional (default=0.05) + the level in the confidence intervals. + + baseline_estimation_method: string, optional + specify how the fitter should estimate the baseline. ``"breslow"`` or ``"spline"`` + + penalizer: float or array, optional (default=0.0) + Attach a penalty to the size of the coefficients during regression. This improves + stability of the estimates and controls for high correlation between covariates. + For example, this shrinks the magnitude value of :math:`\beta_i`. See ``l1_ratio`` below. + The penalty term is :math:`\frac{1}{2} \text{penalizer} \left( (1-\text{l1_ratio}) ||\beta||_2^2 + \text{l1_ratio}||\beta||_1\right)`. + + Alternatively, penalizer is an array equal in size to the number of parameters, with penalty coefficients for specific variables. For + example, `penalizer=0.01 * np.ones(p)` is the same as `penalizer=0.01` + + l1_ratio: float, optional (default=0.0) + Specify what ratio to assign to a L1 vs L2 penalty. Same as scikit-learn. See ``penalizer`` above. + + strata: list, optional + specify a list of columns to use in stratification. This is useful if a + categorical covariate does not obey the proportional hazard assumption. This + is used similar to the `strata` expression in R. + See http://courses.washington.edu/b515/l17.pdf. + + n_baseline_knots: int + Used when ``baseline_estimation_method`` is "spline". Set the number of interior knots in the baseline hazard. + + Examples + -------- + .. code:: python + + from lifelines.datasets import load_rossi + from lifelines import CoxPHFitter + rossi = load_rossi() + cph = CoxPHFitter() + cph.fit(rossi, 'week', 'arrest') + cph.print_summary() + + Attributes + ---------- + params_ : Series + The estimated coefficients. Changed in version 0.22.0: use to be ``.hazards_`` + hazard_ratios_ : Series + The exp(coefficients) + confidence_intervals_ : DataFrame + The lower and upper confidence intervals for the hazard coefficients + durations: Series + The durations provided + event_observed: Series + The event_observed variable provided + weights: Series + The event_observed variable provided + variance_matrix_ : numpy array + The variance matrix of the coefficients + strata: list + the strata provided + standard_errors_: Series + the standard errors of the estimates + baseline_hazard_: DataFrame + baseline_cumulative_hazard_: DataFrame + baseline_survival_: DataFrame """ _KNOWN_MODEL = True - _scipy_fit_method = "SLSQP" - _scipy_fit_options = {"maxiter": 1000, "iprint": 100} - def __init__(self, n_baseline_knots=1, *args, **kwargs): + def __init__( + self, + baseline_estimation_method: str = "breslow", + penalizer: Union[float, np.ndarray] = 0.0, + strata: Optional[Union[List[str], str]] = None, + l1_ratio: float = 0.0, + n_baseline_knots: Optional[int] = None, + **kwargs, + ) -> None: + + super(CoxPHFitter, self).__init__(**kwargs) + + if l1_ratio < 0 or l1_ratio > 1: + raise ValueError("l1_ratio parameter must in [0, 1].") + + self.penalizer = penalizer + self._strata = strata + self.l1_ratio = l1_ratio + self.baseline_estimation_method = baseline_estimation_method self.n_baseline_knots = n_baseline_knots - self._fitted_parameter_names = ["beta_"] + ["phi%d_" % i for i in range(1, self.n_baseline_knots + 2)] - super(_PHSplineFitter, self).__init__(*args, **kwargs) - def set_knots(self, T, E): - self.knots = np.percentile(T[E.astype(bool).values], np.linspace(5, 95, self.n_baseline_knots + 2)) - return + @utils.CensoringType.right_censoring + def fit( + self, + df: pd.DataFrame, + duration_col: Optional[str] = None, + event_col: Optional[str] = None, + show_progress: bool = False, + initial_point: Optional[ndarray] = None, + strata: Optional[Union[str, List[str]]] = None, + step_size: Optional[float] = None, + weights_col: Optional[str] = None, + cluster_col: Optional[str] = None, + robust: bool = False, + batch_mode: Optional[bool] = None, + ) -> "CoxPHFitter": + """ + Fit the Cox proportional hazard model to a dataset. - def _pre_fit_model(self, Ts, E, df): - self.set_knots(Ts[0], E) + Parameters + ---------- + df: DataFrame + a Pandas DataFrame with necessary columns `duration_col` and + `event_col` (see below), covariates columns, and special columns (weights, strata). + `duration_col` refers to + the lifetimes of the subjects. `event_col` refers to whether + the 'death' events was observed: 1 if observed, 0 else (censored). - def _create_initial_point(self, Ts, E, entries, weights, Xs): - return [ - { - **{"beta_": np.zeros(len(Xs.mappings["beta_"])), "phi1_": np.array([0.05]), "phi2_": np.array([-0.05])}, - **{"phi%d_" % i: np.array([0.0]) for i in range(3, self.n_baseline_knots + 2)}, - } - ] + duration_col: string + the name of the column in DataFrame that contains the subjects' + lifetimes. - def _cumulative_hazard(self, params, T, Xs): - lT = anp.log(T) - H = safe_exp(anp.dot(Xs["beta_"], params["beta_"]) + params["phi1_"] * lT) + event_col: string, optional + the name of the column in DataFrame that contains the subjects' death + observation. If left as None, assume all individuals are uncensored. - for i in range(2, self.n_baseline_knots + 2): - H *= safe_exp( - params["phi%d_" % i] * self.basis(lT, anp.log(self.knots[i - 1]), anp.log(self.knots[0]), anp.log(self.knots[-1])) - ) - return H + weights_col: string, optional + an optional column in the DataFrame, df, that denotes the weight per subject. + This column is expelled and not used as a covariate, but as a weight in the + final regression. Default weight is 1. + This can be used for case-weights. For example, a weight of 2 means there were two subjects with + identical observations. + This can be used for sampling weights. In that case, use ``robust=True`` to get more accurate standard errors. + show_progress: bool, optional (default=False) + since the fitter is iterative, show convergence + diagnostics. Useful if convergence is failing. -class _BatchVsSingle: + initial_point: (d,) numpy array, optional + initialize the starting point of the iterative + algorithm. Default is the zero vector. - BATCH = "batch" - SINGLE = "single" + strata: list or string, optional + specify a column or list of columns n to use in stratification. This is useful if a + categorical covariate does not obey the proportional hazard assumption. This + is used similar to the ``strata`` expression in R. + See http://courses.washington.edu/b515/l17.pdf. - def decide(self, batch_mode: Optional[bool], n_unique: int, n_total: int, n_vars: int) -> str: - frac_dups = n_unique / n_total - if batch_mode or ( - # https://github.com/CamDavidsonPilon/lifelines/issues/591 for original issue. - # new values from from perf/batch_vs_single script. - (batch_mode is None) - and ( + step_size: float, optional + set an initial step size for the fitting algorithm. Setting to 1.0 may improve performance, but could also hurt convergence. + + robust: bool, optional (default=False) + Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle + ties, so if there are high number of ties, results may significantly differ. See + "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078 + + cluster_col: string, optional + specifies what column has unique identifiers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to + be used. + + batch_mode: bool, optional + enabling batch_mode can be faster for datasets with a large number of ties. If left as None, lifelines will choose the best option. + + Returns + ------- + self: CoxPHFitter + self with additional new properties: ``print_summary``, ``hazards_``, ``confidence_intervals_``, ``baseline_survival_``, etc. + + + Note + ---- + Tied survival times are handled using Efron's tie-method. + + + Examples + -------- + .. code:: python + + from lifelines import CoxPHFitter + + df = pd.DataFrame({ + 'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7], + 'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0], + 'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2], + 'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7], + }) + + cph = CoxPHFitter() + cph.fit(df, 'T', 'E') + cph.print_summary() + cph.predict_median(df) + + .. code:: python + + from lifelines import CoxPHFitter + + df = pd.DataFrame({ + 'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7], + 'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0], + 'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2], + 'weights': [1.1, 0.5, 2.0, 1.6, 1.2, 4.3, 1.4, 4.5, 3.0, 3.2, 0.4, 6.2], + 'month': [10, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7], + 'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7], + }) + + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', strata=['month', 'age'], robust=True, weights_col='weights') + cph.print_summary() + cph.predict_median(df) + + """ + self._model = self._fit_model( + df, + duration_col, + event_col=event_col, + show_progress=show_progress, + initial_point=initial_point, + strata=utils.coalesce(strata, self._strata), + step_size=step_size, + weights_col=weights_col, + cluster_col=cluster_col, + robust=robust, + batch_mode=batch_mode, + ) + return self + + def __getattr__(self, attr): + if attr == "_model": + raise AttributeError("Must call `fit` first.") + + if hasattr(self._model, attr): + return getattr(self._model, attr) + + raise AttributeError("%s has no attribute '%s'" % (self._class_name, attr)) + + def __dir__(self): + # pretty hacky - probably a better way + return self._model.__dir__() + ["print_summary", "baseline_estimation_method"] + + def _fit_model(self, *args, **kwargs): + if self.baseline_estimation_method == "breslow": + return self._fit_model_breslow(*args, **kwargs) + elif self.baseline_estimation_method == "spline": + return self._fit_model_spline(*args, **kwargs) + else: + raise ValueError("Invalid baseline estimate supplied.") + + def _fit_model_breslow(self, *args, **kwargs): + model = SemiParametricPHFitter(penalizer=self.penalizer, l1_ratio=self.l1_ratio, strata=self._strata) + model.fit(*args, **kwargs) + return model + + def _fit_model_spline(self, *args, **kwargs): + # handle strata and cluster_col + if kwargs["strata"] is not None: + raise ValueError("strata is not available for this baseline estimation method") + if kwargs["cluster_col"] is not None: + raise ValueError("cluster_col is not available for this baseline estimation method") + + kwargs.pop("strata") + kwargs.pop("cluster_col") + kwargs.pop("step_size") + kwargs.pop("batch_mode") + df = args[0].copy() + df["_intercept"] = 1 + + regressors = { + **{"beta_": df.columns.difference(["T", "E", "weights"]), "phi1_": ["_intercept"]}, + **{"phi%d_" % i: ["_intercept"] for i in range(2, self.n_baseline_knots + 2)}, + } + + model = ParametricSplinePHFitter(penalizer=self.penalizer, l1_ratio=self.l1_ratio, n_baseline_knots=self.n_baseline_knots) + model.fit(df, *args[1:], regressors=regressors, **kwargs) + return model + + def print_summary(self, decimals: int = 2, style: Optional[str] = None, **kwargs) -> None: + """ + Print summary statistics describing the fit, the coefficients, and the error bounds. + + Parameters + ----------- + decimals: int, optional (default=2) + specify the number of decimal places to show + style: string + {html, ascii, latex} + kwargs: + print additional metadata in the output (useful to provide model names, dataset names, etc.) when comparing + multiple outputs. + + """ + + # Print information about data first + justify = utils.string_justify(25) + + headers: List[Tuple[str, Any]] = [] + headers.append(("duration col", "'%s'" % self.duration_col)) + + if self.event_col: + headers.append(("event col", "'%s'" % self.event_col)) + if self.weights_col: + headers.append(("weights col", "'%s'" % self.weights_col)) + if self.cluster_col: + headers.append(("cluster col", "'%s'" % self.cluster_col)) + if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: + headers.append(("penalizer", self.penalizer)) + headers.append(("l1 ratio", self.l1_ratio)) + if self.robust or self.cluster_col: + headers.append(("robust variance", True)) + if self.strata: + headers.append(("strata", self.strata)) + if self.baseline_estimation_method == "spline": + headers.append(("number of baseline knots", self.n_baseline_knots)) + + headers.extend( + [ + ("baseline estimation", self.baseline_estimation_method), + ("number of observations", "{:g}".format(self.weights.sum())), + ("number of events observed", "{:g}".format(self.weights[self.event_observed > 0].sum())), ( - 6.153_952e-01 - + -3.927_241e-06 * n_total - + 2.544_118e-11 * n_total ** 2 - + 2.071_377e00 * frac_dups - + -9.724_922e-01 * frac_dups ** 2 - + 9.138_711e-06 * n_total * frac_dups - + -5.617_844e-03 * n_vars - + -4.402_736e-08 * n_vars * n_total - ) - < 1 + "partial log-likelihood" if self.baseline_estimation_method == "breslow" else "log-likelihood", + "{:.{prec}f}".format(self.log_likelihood_, prec=decimals), + ), + ("time fit was run", self._time_fit_was_called), + ] + ) + + footers = [] + sr = self.log_likelihood_ratio_test() + + if self.baseline_estimation_method == "breslow": + footers.extend( + [ + ("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals)), + ("Partial AIC", "{:.{prec}f}".format(self.AIC_partial_, prec=decimals)), + ] ) - ): - return self.BATCH - return self.SINGLE + elif self.baseline_estimation_method == "spline": + footers.append(("AIC", "{:.{prec}f}".format(self.AIC_, prec=decimals))) + + footers.append( + ("log-likelihood ratio test", "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals)) + ) + footers.append(("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-utils.safe_log2(sr.p_value), prec=decimals))) + + p = Printer(self, headers, footers, justify, decimals, kwargs) + p.print(style=style) + + def compute_followup_hazard_ratios(self, training_df: DataFrame, followup_times: Iterable) -> DataFrame: + """ + Recompute the hazard ratio at different follow-up times (lifelines handles accounting for updated censoring and updated durations). + This is useful because we need to remember that the hazard ratio is actually a weighted-average of period-specific hazard ratios. + + Parameters + ---------- + + training_df: pd.DataFrame + The same dataframe used to train the model + followup_times: Iterable + a list/array of follow-up times to recompute the hazard ratio at. + + + """ + results = {} + for t in sorted(followup_times): + assert t <= training_df[self.duration_col].max(), "all follow-up times must be less than max observed duration" + df = training_df.copy() + # if we "rollback" the df to time t, who is dead and who is censored + df[self.event_col] = (df[self.duration_col] <= t) & df[self.event_col] + df[self.duration_col] = np.minimum(df[self.duration_col], t) + + model = self.__class__( + penalizer=self.penalizer, + l1_ratio=self.l1_ratio, + strata=self._strata, + baseline_estimation_method=self.baseline_estimation_method, + n_baseline_knots=self.n_baseline_knots, + ).fit(df, self.duration_col, self.event_col, weights_col=self.weights_col, cluster_col=self.cluster_col) + results[t] = model.hazard_ratios_ + return DataFrame(results).T -class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): +class SemiParametricPHFitter(ProportionalHazardMixin, SemiParametricRegressionFittter): r""" This class implements fitting Cox's proportional hazard model using Efron's method for ties. .. math:: h(t|x) = h_0(t) \exp((x - \overline{x})' \beta) - The baseline hazard, :math:`h_0(t)` can be modeled non-parametrically (using Breslow's method) or parametrically (using cubic splines). - This is specified using the ``baseline_estimation_method`` parameter. + The baseline hazard, :math:`h_0(t)` is modeled non-parametrically (using Breslow's method). Parameters ---------- @@ -140,9 +429,6 @@ class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): alpha: float, optional (default=0.05) the level in the confidence intervals. - baseline_estimation_method: string, optional - specify how the fitter should estimate the baseline. ``"breslow"`` or ``"spline"`` - penalizer: float or array, optional (default=0.0) Attach a penalty to the size of the coefficients during regression. This improves stability of the estimates and controls for high correlation between covariates. @@ -161,9 +447,6 @@ class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): is used similar to the `strata` expression in R. See http://courses.washington.edu/b515/l17.pdf. - n_baseline_knots: int - Used when ``baseline_estimation_method`` is "spline". Set the number of interior knots in the baseline hazard. - Examples -------- .. code:: python @@ -199,20 +482,17 @@ class CoxPHFitter(SemiParametricRegressionFittter, ProportionalHazardMixin): baseline_cumulative_hazard_: DataFrame baseline_survival_: DataFrame """ - _KNOWN_MODEL = True def __init__( self, - baseline_estimation_method: str = "breslow", penalizer: Union[float, np.ndarray] = 0.0, strata: Optional[Union[List[str], str]] = None, l1_ratio: float = 0.0, - n_baseline_knots: int = 1, **kwargs, ) -> None: - super(CoxPHFitter, self).__init__(**kwargs) + super(SemiParametricPHFitter, self).__init__(**kwargs) if l1_ratio < 0 or l1_ratio > 1: raise ValueError("l1_ratio parameter must in [0, 1].") @@ -220,10 +500,8 @@ def __init__( self.penalizer = penalizer self.strata = strata self.l1_ratio = l1_ratio - self.baseline_estimation_method = baseline_estimation_method - self.n_baseline_knots = n_baseline_knots - @CensoringType.right_censoring + @utils.CensoringType.right_censoring def fit( self, df: pd.DataFrame, @@ -237,7 +515,7 @@ def fit( cluster_col: Optional[str] = None, robust: bool = False, batch_mode: Optional[bool] = None, - ) -> "CoxPHFitter": + ) -> "SemiParametricPHFitter": """ Fit the Cox proportional hazard model to a dataset. @@ -354,7 +632,7 @@ def fit( self.weights_col = weights_col self._n_examples = df.shape[0] self._batch_mode = batch_mode - self.strata = coalesce(strata, self.strata) + self.strata = utils.coalesce(strata, self.strata) X, T, E, weights, original_index, self._clusters = self._preprocess_dataframe(df) @@ -372,14 +650,15 @@ def fit( # this is surprisingly faster to do... X_norm = pd.DataFrame( - normalize(X.values, self._norm_mean.values, self._norm_std.values), index=X.index, columns=X.columns + utils.normalize(X.values, self._norm_mean.values, self._norm_std.values), index=X.index, columns=X.columns ) - params_, ll_, variance_matrix_, baseline_hazard_, baseline_cumulative_hazard_ = self._fit_model( + params_, ll_, variance_matrix_, baseline_hazard_, baseline_cumulative_hazard_, model = self._fit_model( X_norm, T, E, weights=weights, initial_point=initial_point, show_progress=show_progress, step_size=step_size ) self.log_likelihood_ = ll_ + self.model = model self.variance_matrix_ = variance_matrix_ self.params_ = pd.Series(params_, index=X.columns, name="coef") self.baseline_hazard_ = baseline_hazard_ @@ -409,7 +688,9 @@ def _preprocess_dataframe(self, df: DataFrame) -> Tuple[DataFrame, Series, Serie df = df.copy() if self.strata is not None: - sort_by = _to_list(self.strata) + ([self.duration_col, self.event_col] if self.event_col else [self.duration_col]) + sort_by = utils._to_list(self.strata) + ( + [self.duration_col, self.event_col] if self.event_col else [self.duration_col] + ) df = df.sort_values(by=sort_by) original_index = df.index.copy() df = df.set_index(self.strata) @@ -437,7 +718,7 @@ def _preprocess_dataframe(self, df: DataFrame) -> Tuple[DataFrame, Series, Serie T = T.astype(float) # we check nans here because converting to bools maps NaNs to True.. - check_nans_or_infs(E) + utils.check_nans_or_infs(E) E = E.astype(bool) self._check_values_pre_fitting(X, T, E, W) @@ -448,13 +729,13 @@ def _check_values_post_fitting(self, X, T, E, W): """ Functions here check why a fit may have non-obviously failed """ - check_complete_separation(X, E, T, self.event_col) + utils.check_complete_separation(X, E, T, self.event_col) def _check_values_pre_fitting(self, X, T, E, W): - check_low_var(X) - check_for_numeric_dtypes_or_raise(X) - check_nans_or_infs(T) - check_nans_or_infs(X) + utils.check_low_var(X) + utils.check_for_numeric_dtypes_or_raise(X) + utils.check_nans_or_infs(T) + utils.check_nans_or_infs(X) # check to make sure their weights are okay if self.weights_col: if (W.astype(int) != W).any() and not self.robust: @@ -463,20 +744,12 @@ def _check_values_pre_fitting(self, X, T, E, W): It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """, - StatisticalWarning, + utils.StatisticalWarning, ) if (W <= 0).any(): raise ValueError("values in weight column %s must be positive." % self.weights_col) - def _fit_model(self, *args, **kwargs): - if self.baseline_estimation_method == "breslow": - return self._fit_model_breslow(*args, **kwargs) - elif self.baseline_estimation_method == "spline": - return self._fit_model_spline(*args, **kwargs) - else: - raise ValueError("Invalid baseline estimate supplied.") - - def _fit_model_breslow( + def _fit_model( self, X: DataFrame, T: Series, @@ -506,7 +779,7 @@ def _fit_model_breslow( else: variance_matrix_ = pd.DataFrame(index=X.columns, columns=X.columns) - return params_, ll_, variance_matrix_, baseline_hazard_, baseline_cumulative_hazard_ + return params_, ll_, variance_matrix_, baseline_hazard_, baseline_cumulative_hazard_, None def _newton_rhapson_for_efron_model( self, @@ -550,6 +823,8 @@ def _newton_rhapson_for_efron_model( ------- beta: (1,d) numpy array. """ + CONVERGENCE_DOCS = "Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model" + n, d = X.shape # soft penalizer functions, from https://www.cs.ubc.ca/cgi-bin/tr/2009/TR-2009-19.pdf @@ -576,7 +851,7 @@ def _newton_rhapson_for_efron_model( else: beta = zeros((d,)) - step_sizer = StepSizer(step_size) + step_sizer = utils.StepSizer(step_size) step_size = step_sizer.next() delta = np.zeros_like(beta) @@ -620,12 +895,12 @@ def _newton_rhapson_for_efron_model( except (ValueError, LinAlgError) as e: self._check_values_post_fitting(X, T, E, weights) if "infs or NaNs" in str(e): - raise ConvergenceError( + raise utils.ConvergenceError( """Hessian or gradient contains nan or inf value(s). Convergence halted. {0}""".format(CONVERGENCE_DOCS), e, ) elif isinstance(e, LinAlgError): - raise ConvergenceError( + raise utils.ConvergenceError( """Convergence halted due to matrix inversion problems. Suspicion is high collinearity. {0}""".format( CONVERGENCE_DOCS ), @@ -639,7 +914,7 @@ def _newton_rhapson_for_efron_model( if np.any(np.isnan(delta)): self._check_values_post_fitting(X, T, E, weights) - raise ConvergenceError("""delta contains nan value(s). Convergence halted. {0}""".format(CONVERGENCE_DOCS)) + raise utils.ConvergenceError("""delta contains nan value(s). Convergence halted. {0}""".format(CONVERGENCE_DOCS)) # Save these as pending result hessian, gradient = h, g @@ -676,7 +951,7 @@ def _newton_rhapson_for_efron_model( warnings.warn( "The log-likelihood is getting suspiciously close to 0 and the delta is still large. There may be complete separation in the dataset. This may result in incorrect inference of coefficients. \ See https://stats.stackexchange.com/q/11109/11867 for more.\n", - ConvergenceWarning, + utils.ConvergenceWarning, ) converging, success = False, False @@ -694,53 +969,14 @@ def _newton_rhapson_for_efron_model( warnings.warn( "Newton-Rhaphson convergence completed successfully but norm(delta) is still high, %.3f. This may imply non-unique solutions to the maximum likelihood. Perhaps there is collinearity or complete separation in the dataset?\n" % norm_delta, - ConvergenceWarning, + utils.ConvergenceWarning, ) elif not success: self._check_values_post_fitting(X, T, E, weights) - warnings.warn("Newton-Rhaphson failed to converge sufficiently in %d steps.\n" % max_steps, ConvergenceWarning) + warnings.warn("Newton-Rhaphson failed to converge sufficiently in %d steps.\n" % max_steps, utils.ConvergenceWarning) return beta, ll_, hessian - def _fit_model_spline( - self, - X: DataFrame, - T: Series, - E: Series, - weights: Series, - initial_point: Optional[ndarray] = None, - show_progress: bool = True, - **kwargs, - ): - if self.strata is not None: - raise NotImplementedError("Spline estimation cannot be used with strata yet.") - - df = X.copy() - df["T"] = T - df["E"] = E - df["weights"] = weights - df["constant"] = 1.0 - - regressors = { - **{"beta_": df.columns.difference(["T", "E", "weights"]), "phi1_": ["constant"]}, - **{"phi%d_" % i: ["constant"] for i in range(2, self.n_baseline_knots + 2)}, - } - - cph = _PHSplineFitter(n_baseline_knots=self.n_baseline_knots, penalizer=self.penalizer, l1_ratio=self.l1_ratio) - cph.fit(df, "T", "E", weights_col="weights", show_progress=show_progress, robust=self.robust, regressors=regressors) - self._ll_null_ = cph._ll_null - baseline_hazard_ = cph.predict_hazard(df.mean()).rename(columns={0: "baseline hazard"}) - baseline_cumulative_hazard_ = cph.predict_cumulative_hazard(df.mean()).rename(columns={0: "baseline cumulative hazard"}) - - return ( - cph.params_.loc["beta_"].drop("constant") / self._norm_std, - cph.log_likelihood_, - cph.variance_matrix_.loc["beta_", "beta_"].drop("constant").drop("constant", axis=1) - / np.outer(self._norm_std, self._norm_std), - baseline_hazard_, - baseline_cumulative_hazard_, - ) - def _get_efron_values_single( self, X: ndarray, T: ndarray, E: ndarray, weights: ndarray, beta: ndarray ) -> Tuple[ndarray, ndarray, float]: @@ -1239,7 +1475,7 @@ def _compute_score_within_strata(self, X: ndarray, _T: Union[ndarray, Series], E def _compute_confidence_intervals(self) -> pd.DataFrame: ci = 100 * (1 - self.alpha) - z = inv_normal_cdf(1 - self.alpha / 2) + z = utils.inv_normal_cdf(1 - self.alpha / 2) se = self.standard_errors_ hazards = self.params_.values return pd.DataFrame( @@ -1284,7 +1520,7 @@ def summary(self) -> pd.DataFrame: df : DataFrame """ ci = 100 * (1 - self.alpha) - z = inv_normal_cdf(1 - self.alpha / 2) + z = utils.inv_normal_cdf(1 - self.alpha / 2) with np.errstate(invalid="ignore", divide="ignore", over="ignore", under="ignore"): df = pd.DataFrame(index=self.params_.index) df["coef"] = self.params_ @@ -1296,77 +1532,9 @@ def summary(self) -> pd.DataFrame: df["exp(coef) upper %g%%" % ci] = self.hazard_ratios_ * exp(z * self.standard_errors_) df["z"] = self._compute_z_values() df["p"] = self._compute_p_values() - df["-log2(p)"] = -np.log2(df["p"]) + df["-log2(p)"] = -utils.safe_log2(df["p"]) return df - def print_summary(self, decimals: int = 2, style: Optional[str] = None, **kwargs) -> None: - """ - Print summary statistics describing the fit, the coefficients, and the error bounds. - - Parameters - ----------- - decimals: int, optional (default=2) - specify the number of decimal places to show - style: string - {html, ascii, latex} - kwargs: - print additional metadata in the output (useful to provide model names, dataset names, etc.) when comparing - multiple outputs. - - """ - - # Print information about data first - justify = string_justify(25) - - headers: List[Tuple[str, Any]] = [] - headers.append(("duration col", "'%s'" % self.duration_col)) - - if self.event_col: - headers.append(("event col", "'%s'" % self.event_col)) - if self.weights_col: - headers.append(("weights col", "'%s'" % self.weights_col)) - if self.cluster_col: - headers.append(("cluster col", "'%s'" % self.cluster_col)) - if isinstance(self.penalizer, np.ndarray) or self.penalizer > 0: - headers.append(("penalizer", self.penalizer)) - headers.append(("l1 ratio", self.l1_ratio)) - if self.robust or self.cluster_col: - headers.append(("robust variance", True)) - if self.strata: - headers.append(("strata", self.strata)) - if self.baseline_estimation_method == "spline": - headers.append(("number of baseline knots", self.n_baseline_knots)) - - headers.extend( - [ - ("baseline estimation", self.baseline_estimation_method), - ("number of observations", "{:g}".format(self.weights.sum())), - ("number of events observed", "{:g}".format(self.weights[self.event_observed > 0].sum())), - ( - "partial log-likelihood" if self.baseline_estimation_method == "breslow" else "log-likelihood", - "{:.{prec}f}".format(self.log_likelihood_, prec=decimals), - ), - ("time fit was run", self._time_fit_was_called), - ] - ) - - sr = self.log_likelihood_ratio_test() - footers = [] - footers.extend( - [ - ("Concordance", "{:.{prec}f}".format(self.concordance_index_, prec=decimals)), - ("Partial AIC", "{:.{prec}f}".format(self.AIC_partial_, prec=decimals)), - ( - "log-likelihood ratio test", - "{:.{prec}f} on {} df".format(sr.test_statistic, sr.degrees_freedom, prec=decimals), - ), - ("-log2(p) of ll-ratio test", "{:.{prec}f}".format(-np.log2(sr.p_value), prec=decimals)), - ] - ) - - p = Printer(self, headers, footers, justify, decimals, kwargs) - p.print(style=style) - def _trivial_log_likelihood(self): df = pd.DataFrame({"T": self.durations, "E": self.event_observed, "W": self.weights}) trivial_model = self.__class__().fit(df, "E", weights_col="W", batch_mode=self.batch_mode) @@ -1444,7 +1612,7 @@ def predict_log_partial_hazard(self, X: Union[ndarray, DataFrame]) -> pd.Series: X = X.to_frame().T return self.predict_log_partial_hazard(X) - index = _get_index(X) + index = utils._get_index(X) if isinstance(X, pd.DataFrame): order = hazard_names @@ -1454,7 +1622,7 @@ def predict_log_partial_hazard(self, X: Union[ndarray, DataFrame]) -> pd.Series: X = X.astype(float) - X = normalize(X, self._norm_mean.values, 1) + X = utils.normalize(X, self._norm_mean.values, 1) return pd.Series(dot(X, self.params_), index=index) def predict_cumulative_hazard( @@ -1477,7 +1645,7 @@ def predict_cumulative_hazard( points in time are not in the index. conditional_after: iterable, optional Must be equal is size to X.shape[0] (denoted ``n`` above). An iterable (array, list, series) of possibly non-zero values that represent how long the - subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents :math:`s` in :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. The new timeline is the remaining duration of the subject, i.e. reset back to starting at 0. @@ -1490,7 +1658,7 @@ def predict_cumulative_hazard( if times is not None: times = np.atleast_1d(times).astype(float) if conditional_after is not None: - conditional_after = _to_1d_array(conditional_after).reshape(n, 1) + conditional_after = utils._to_1d_array(conditional_after).reshape(n, 1) if self.strata: X = X.copy() @@ -1502,28 +1670,28 @@ def predict_cumulative_hazard( try: strata_c_0 = self.baseline_cumulative_hazard_[[stratum]] except KeyError: - raise StatError( + raise utils.StatError( dedent( """The stratum %s was not found in the original training data. For example, try the following on the original dataset, df: `df.groupby(%s).size()`. Expected is that %s is not present in the output.""" % (stratum, self.strata, stratum) ) ) - col = _get_index(stratified_X) + col = utils._get_index(stratified_X) v = self.predict_partial_hazard(stratified_X) - times_ = coalesce(times, self.baseline_cumulative_hazard_.index) + times_ = utils.coalesce(times, self.baseline_cumulative_hazard_.index) n_ = stratified_X.shape[0] if conditional_after is not None: conditional_after_ = stratified_X.pop("_conditional_after")[:, None] times_to_evaluate_at = np.tile(times_, (n_, 1)) + conditional_after_ - c_0_ = interpolate_at_times(strata_c_0, times_to_evaluate_at) - c_0_conditional_after = interpolate_at_times(strata_c_0, conditional_after_) + c_0_ = utils.interpolate_at_times(strata_c_0, times_to_evaluate_at) + c_0_conditional_after = utils.interpolate_at_times(strata_c_0, conditional_after_) c_0_ = np.clip((c_0_ - c_0_conditional_after).T, 0, np.inf) else: times_to_evaluate_at = np.tile(times_, (n_, 1)) - c_0_ = interpolate_at_times(strata_c_0, times_to_evaluate_at).T + c_0_ = utils.interpolate_at_times(strata_c_0, times_to_evaluate_at).T cumulative_hazard_ = cumulative_hazard_.merge( pd.DataFrame(c_0_ * v.values, columns=col, index=times_), how="outer", right_index=True, left_index=True @@ -1531,19 +1699,19 @@ def predict_cumulative_hazard( else: v = self.predict_partial_hazard(X) - col = _get_index(v) - times_ = coalesce(times, self.baseline_cumulative_hazard_.index) + col = utils._get_index(v) + times_ = utils.coalesce(times, self.baseline_cumulative_hazard_.index) if conditional_after is not None: times_to_evaluate_at = np.tile(times_, (n, 1)) + conditional_after - c_0 = interpolate_at_times(self.baseline_cumulative_hazard_, times_to_evaluate_at) - c_0_conditional_after = interpolate_at_times(self.baseline_cumulative_hazard_, conditional_after) + c_0 = utils.interpolate_at_times(self.baseline_cumulative_hazard_, times_to_evaluate_at) + c_0_conditional_after = utils.interpolate_at_times(self.baseline_cumulative_hazard_, conditional_after) c_0 = np.clip((c_0 - c_0_conditional_after).T, 0, np.inf) else: times_to_evaluate_at = np.tile(times_, (n, 1)) - c_0 = interpolate_at_times(self.baseline_cumulative_hazard_, times_to_evaluate_at).T + c_0 = utils.interpolate_at_times(self.baseline_cumulative_hazard_, times_to_evaluate_at).T cumulative_hazard_ = pd.DataFrame(c_0 * v.values, columns=col, index=times_) @@ -1572,7 +1740,7 @@ def predict_survival_function( points in time are not in the index. conditional_after: iterable, optional Must be equal is size to X.shape[0] (denoted ``n`` above). An iterable (array, list, series) of possibly non-zero values that represent how long the - subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents :math:`s` in :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. The new timeline is the remaining duration of the subject, i.e. normalized back to starting at 0. @@ -1595,7 +1763,7 @@ def predict_percentile(self, X: DataFrame, p: float = 0.5, conditional_after: Op the percentile, must be between 0 and 1. conditional_after: iterable, optional Must be equal is size to X.shape[0] (denoted ``n`` above). An iterable (array, list, series) of possibly non-zero values that represent how long the - subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents :math:`s` in :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. The new timeline is the remaining duration of the subject, i.e. normalized back to starting at 0. @@ -1604,8 +1772,10 @@ def predict_percentile(self, X: DataFrame, p: float = 0.5, conditional_after: Op predict_median """ - subjects = _get_index(X) - return qth_survival_times(p, self.predict_survival_function(X, conditional_after=conditional_after)[subjects]).T.squeeze() + subjects = utils._get_index(X) + return utils.qth_survival_times( + p, self.predict_survival_function(X, conditional_after=conditional_after)[subjects] + ).T.squeeze() def predict_median(self, X: DataFrame, conditional_after: Optional[ndarray] = None) -> pd.Series: """ @@ -1620,7 +1790,7 @@ def predict_median(self, X: DataFrame, conditional_after: Optional[ndarray] = No same order as the training data. conditional_after: iterable, optional Must be equal is size to X.shape[0] (denoted ``n`` above). An iterable (array, list, series) of possibly non-zero values that represent how long the - subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents :math:`s` in :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. The new timeline is the remaining duration of the subject, i.e. normalized back to starting at 0. @@ -1651,7 +1821,7 @@ def predict_expectation(self, X: DataFrame, conditional_after: Optional[ndarray] same order as the training data. conditional_after: iterable, optional Must be equal is size to X.shape[0] (denoted `n` above). An iterable (array, list, series) of possibly non-zero values that represent how long the - subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents :math:`s` in :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. The new timeline is the remaining duration of the subject, i.e. normalized back to starting at 0. @@ -1667,7 +1837,7 @@ def predict_expectation(self, X: DataFrame, conditional_after: Optional[ndarray] predict_percentile """ - subjects = _get_index(X) + subjects = utils._get_index(X) v = self.predict_survival_function(X, conditional_after=conditional_after)[subjects] return pd.Series(trapz(v.values.T, v.index), index=subjects) @@ -1774,7 +1944,7 @@ def plot(self, columns=None, hazard_ratios=False, ax=None, **errorbar_kwargs): errorbar_kwargs.setdefault("elinewidth", 1.25) errorbar_kwargs.setdefault("capsize", 3) - z = inv_normal_cdf(1 - self.alpha / 2) + z = utils.inv_normal_cdf(1 - self.alpha / 2) user_supplied_columns = True if columns is None: @@ -1875,7 +2045,7 @@ def plot_covariate_groups(self, covariates, values, plot_baseline=True, **kwargs """ from matplotlib import pyplot as plt - covariates = _to_list(covariates) + covariates = utils._to_list(covariates) n_covariates = len(covariates) values = np.asarray(values) if len(values.shape) == 1: @@ -1888,7 +2058,7 @@ def plot_covariate_groups(self, covariates, values, plot_baseline=True, **kwargs if covariate not in self.params_.index: raise KeyError("covariate `%s` is not present in the original dataset" % covariate) - drawstyle = "steps-post" if self.baseline_estimation_method == "breslow" else "default" + drawstyle = "steps-post" set_kwargs_drawstyle(kwargs, drawstyle) if self.strata is None: @@ -1913,7 +2083,7 @@ def plot_covariate_groups(self, covariates, values, plot_baseline=True, **kwargs ax = plt.figure().add_subplot(1, 1, 1) x_bar = self._norm_mean.to_frame().T - for name, value in zip(_to_list(self.strata), _to_tuple(stratum)): + for name, value in zip(utils._to_list(self.strata), utils._to_tuple(stratum)): x_bar[name] = value X = pd.concat([x_bar] * values.shape[0]) @@ -1963,10 +2133,6 @@ def score(self, df: pd.DataFrame, scoring_method: str = "log_likelihood") -> flo cph.score(rossi_test) """ - - if self.baseline_estimation_method != "breslow": - raise NotImplementedError("Only breslow implemented atm.") - df = df.copy() if self.strata: @@ -1983,7 +2149,7 @@ def score(self, df: pd.DataFrame, scoring_method: str = "log_likelihood") -> flo if scoring_method == "log_likelihood": - df = normalize(df, self._norm_mean.values, 1.0) + df = utils.normalize(df, self._norm_mean.values, 1.0) decision = _BatchVsSingle().decide(self._batch_mode, T.nunique(), *df.shape) get_gradients = getattr(self, "_get_efron_values_%s" % decision) @@ -2044,34 +2210,223 @@ def concordance_index_(self) -> float: return self.concordance_index_ return self._concordance_index_ - def compute_followup_hazard_ratios(self, training_df: DataFrame, followup_times: Iterable) -> DataFrame: + +class ParametricSplinePHFitter(ParametricRegressionFitter, SplineFitterMixin, ProportionalHazardMixin): + r""" + Proportional hazard model with cubic splines model for the baseline hazard. + + .. math:: h(t|x) = h_0(t) \exp((x - \overline{x})' \beta) + + where + + .. math:: h_0(t) = \exp{\left( \phi_0 + \phi_1\log{t} + \sum_{j=2}^N \phi_j v_j\(\log{t})\right)} + + where :math:`v_j` are our cubic basis functions at predetermined knots. See references for exact definition. + + References + ------------ + Royston, P., & Parmar, M. K. B. (2002). Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects. Statistics in Medicine, 21(15), 2175–2197. doi:10.1002/sim.1203  + """ + + _scipy_fit_method = "SLSQP" + _scipy_fit_options = {"maxiter": 1000, "iprint": 100} + + _KNOWN_MODEL = True + _FAST_MEDIAN_PREDICT = False + + cluster_col = None + strata = None + + def __init__(self, n_baseline_knots=1, *args, **kwargs): + assert n_baseline_knots is not None, "n_baseline_knots must be a positive integer. Set in class instantiation" + self.n_baseline_knots = n_baseline_knots + self._fitted_parameter_names = ["beta_"] + ["phi%d_" % i for i in range(1, self.n_baseline_knots + 2)] + super(ParametricSplinePHFitter, self).__init__(*args, **kwargs) + + def _set_knots(self, T, E): + self.knots = np.percentile(T[E.astype(bool).values], np.linspace(5, 95, self.n_baseline_knots + 2)) + return + + def _pre_fit_model(self, Ts, E, df): + self._set_knots(Ts[0], E) + + def _create_initial_point(self, Ts, E, entries, weights, Xs): + return [ + { + **{"beta_": np.zeros(len(Xs.mappings["beta_"])), "phi1_": np.array([0.05]), "phi2_": np.array([-0.05])}, + **{"phi%d_" % i: np.array([0.0]) for i in range(3, self.n_baseline_knots + 2)}, + } + ] + + def _cumulative_hazard(self, params, T, Xs): + lT = anp.log(T) + H = safe_exp(anp.dot(Xs["beta_"], params["beta_"]) + params["phi1_"] * lT) + + for i in range(2, self.n_baseline_knots + 2): + H *= safe_exp( + params["phi%d_" % i] * self.basis(lT, anp.log(self.knots[i - 1]), anp.log(self.knots[0]), anp.log(self.knots[-1])) + ) + return H + + @property + def baseline_hazard_(self): + return self.baseline_hazard_at_times() + + @property + def baseline_survival_(self): + return self.baseline_survival_at_times() + + @property + def baseline_cumulative_hazard_(self): + return self.baseline_cumulative_hazard_at_times() + + def baseline_hazard_at_times(self, times=None): """ - Recompute the hazard ratio at different follow-up times (lifelines handles accounting for updated censoring and updated durations). - This is useful because we need to remember that the hazard ratio is actually a weighted-average of period-specific hazard ratios. + Predict the baseline hazard at times (Defaults to observed durations) + """ + times = utils.coalesce(times, self.timeline) + v = self.predict_hazard(self._norm_mean, times=times) + v.columns = ["baseline hazard"] + return v + + def baseline_survival_at_times(self, times=None): + """ + Predict the baseline survival at times (Defaults to observed durations) + """ + times = utils.coalesce(times, self.timeline) + v = self.predict_survival_function(self._norm_mean, times=times) + v.columns = ["baseline survival"] + return v + + def baseline_cumulative_hazard_at_times(self, times=None): + """ + Predict the baseline cumulative hazard at times (Defaults to observed durations) + """ + times = utils.coalesce(times, self.timeline) + v = self.predict_cumulative_hazard(self._norm_mean, times=times) + v.columns = ["baseline cumulative hazard"] + return v + + def predict_cumulative_hazard(self, df, *, times=None, conditional_after=None): + """ + Predict the cumulative hazard for individuals, given their covariates. Parameters ---------- - training_df: pd.DataFrame - The same dataframe used to train the model - followup_times: Iterable - a list/array of follow-up times to recompute the hazard ratio at. + df: DataFrame + a (n,d) DataFrame. If a DataFrame, columns + can be in any order. + times: iterable, optional + an iterable (array, list, series) of increasing times to predict the cumulative hazard at. Default + is the set of all durations in the training dataset (observed and unobserved). + conditional_after: iterable, optional + Must be equal is size to (df.shape[0],) (`n` above). An iterable (array, list, series) of possibly non-zero values that represent how long the + subject has already lived for. Ex: if :math:`T` is the unknown event time, then this represents + :math:`T | T > s`. This is useful for knowing the *remaining* hazard/survival of censored subjects. + The new timeline is the remaining duration of the subject, i.e. normalized back to starting at 0. + Returns + ------- + DataFrame + the cumulative hazards of individuals over the timeline """ - results = {} - for t in sorted(followup_times): - assert t <= training_df[self.duration_col].max(), "all follow-up times must be less than max observed duration" - df = training_df.copy() - # if we "rollback" the df to time t, who is dead and who is censored - df[self.event_col] = (df[self.duration_col] <= t) & df[self.event_col] - df[self.duration_col] = np.minimum(df[self.duration_col], t) + if isinstance(df, pd.Series): + df = df.to_frame().T - model = self.__class__( - penalizer=self.penalizer, - l1_ratio=self.l1_ratio, - strata=self.strata, - baseline_estimation_method=self.baseline_estimation_method, - ).fit(df, self.duration_col, self.event_col, weights_col=self.weights_col, cluster_col=self.cluster_col) - results[t] = model.hazard_ratios_ - return DataFrame(results).T + df = self._filter_dataframe_to_covariates(df).copy().astype(float) + + # this is needed for these models + df["_intercept"] = 1 + + times = utils.coalesce(times, self.timeline) + times = np.atleast_1d(times).astype(float) + + n = df.shape[0] + Xs = self._create_Xs_dict(df) + + params_dict = {parameter_name: self.params_.loc[parameter_name].values for parameter_name in self._fitted_parameter_names} + + columns = utils._get_index(df) + if conditional_after is None: + return pd.DataFrame(self._cumulative_hazard(params_dict, np.tile(times, (n, 1)).T, Xs), index=times, columns=columns) + else: + conditional_after = np.asarray(conditional_after).reshape((n, 1)) + times_to_evaluate_at = (conditional_after + np.tile(times, (n, 1))).T + return pd.DataFrame( + np.clip( + self._cumulative_hazard(params_dict, times_to_evaluate_at, Xs) + - self._cumulative_hazard(params_dict, conditional_after, Xs), + 0, + np.inf, + ), + index=times, + columns=columns, + ) + + def predict_hazard(self, df, *, times=None): + """ + Predict the hazard for individuals, given their covariates. + + Parameters + ---------- + + df: DataFrame + a (n,d) DataFrame. If a DataFrame, columns + can be in any order. + times: iterable, optional + an iterable (array, list, series) of increasing times to predict the cumulative hazard at. Default + is the set of all durations in the training dataset (observed and unobserved). + conditional_after: + Not implemented yet. + + Returns + ------- + DataFrame + the hazards of individuals over the timeline + + """ + if isinstance(df, pd.Series): + df = df.to_frame().T + + df = self._filter_dataframe_to_covariates(df).copy().astype(float) + df["_intercept"] = 1 + times = utils.coalesce(times, self.timeline) + times = np.atleast_1d(times).astype(float) + + n = df.shape[0] + Xs = self._create_Xs_dict(df) + + params_dict = {parameter_name: self.params_.loc[parameter_name].values for parameter_name in self._fitted_parameter_names} + + return pd.DataFrame(self._hazard(params_dict, np.tile(times, (n, 1)).T, Xs), index=times, columns=df.index) + + +class _BatchVsSingle: + + BATCH = "batch" + SINGLE = "single" + + def decide(self, batch_mode: Optional[bool], n_unique: int, n_total: int, n_vars: int) -> str: + frac_dups = n_unique / n_total + if batch_mode or ( + # https://github.com/CamDavidsonPilon/lifelines/issues/591 for original issue. + # new values from from perf/batch_vs_single script. + (batch_mode is None) + and ( + ( + 6.153_952e-01 + + -3.927_241e-06 * n_total + + 2.544_118e-11 * n_total ** 2 + + 2.071_377e00 * frac_dups + + -9.724_922e-01 * frac_dups ** 2 + + 9.138_711e-06 * n_total * frac_dups + + -5.617_844e-03 * n_vars + + -4.402_736e-08 * n_vars * n_total + ) + < 1 + ) + ): + return self.BATCH + return self.SINGLE diff --git a/lifelines/fitters/log_logistic_fitter.py b/lifelines/fitters/log_logistic_fitter.py index b9158f361..7e5f9a1b0 100644 --- a/lifelines/fitters/log_logistic_fitter.py +++ b/lifelines/fitters/log_logistic_fitter.py @@ -91,9 +91,15 @@ def _create_initial_point(self, Ts, E, *args): if CensoringType.is_right_censoring(self): T = Ts[0] elif CensoringType.is_left_censoring(self): - T = Ts[1] + T = np.clip(0.0001, np.inf, Ts[1]) elif CensoringType.is_interval_censoring(self): - T = Ts[1] - Ts[0] + if E.sum() > 0: + # Ts[1] can contain infs, so ignore this data + okay_data = Ts[1] < 1e10 + T = Ts[1] + T = T[okay_data] + else: + T = np.array([1.0]) return np.array([np.median(T), 1.0]) @property diff --git a/lifelines/fitters/log_normal_fitter.py b/lifelines/fitters/log_normal_fitter.py index a6ace72cc..6764f25fb 100644 --- a/lifelines/fitters/log_normal_fitter.py +++ b/lifelines/fitters/log_normal_fitter.py @@ -63,7 +63,7 @@ class LogNormalFitter(KnownModelParametricUnivariateFitter): sigma_: float _fitted_parameter_names = ["mu_", "sigma_"] _bounds = [(None, None), (0, None)] - _compare_to_values = np.array([1.0, 1.0]) + _compare_to_values = np.array([0.0, 1.0]) def _create_initial_point(self, Ts, E, *args): if CensoringType.is_right_censoring(self): @@ -71,7 +71,10 @@ def _create_initial_point(self, Ts, E, *args): elif CensoringType.is_left_censoring(self): log_T = np.log(Ts[1]) elif CensoringType.is_interval_censoring(self): - log_T = np.log(Ts[1]) + if E.sum() > 0: + log_T = np.log(Ts[1][E.astype(bool)]) + else: + log_T = np.array([0]) return np.array([np.median(log_T), 1.0]) @property diff --git a/lifelines/fitters/piecewise_exponential_regression_fitter.py b/lifelines/fitters/piecewise_exponential_regression_fitter.py index 5efca2634..2e98af556 100644 --- a/lifelines/fitters/piecewise_exponential_regression_fitter.py +++ b/lifelines/fitters/piecewise_exponential_regression_fitter.py @@ -35,6 +35,7 @@ class PiecewiseExponentialRegressionFitter(ParametricRegressionFitter): paper replication `here `_ """ + _FAST_MEDIAN_PREDICT = True # about 50% faster than BFGS _scipy_fit_method = "SLSQP" diff --git a/lifelines/fitters/spline_fitter.py b/lifelines/fitters/spline_fitter.py index 4fb2cf0a7..5c2e3f292 100644 --- a/lifelines/fitters/spline_fitter.py +++ b/lifelines/fitters/spline_fitter.py @@ -11,9 +11,7 @@ class SplineFitter(KnownModelParametricUnivariateFitter, SplineFitterMixin): r""" Model the cumulative hazard using :math:`N` cubic splines. This offers great flexibility and smoothness of the cumulative hazard. - .. math:: - - H(t) = \exp{\left( \phi_0 + \phi_1\log{t} + \sum_{j=2}^N \phi_j v_j\(\log{t})\right)} + .. math:: H(t) = \exp{\left( \phi_0 + \phi_1\log{t} + \sum_{j=2}^N \phi_j v_j\(\log{t})\right)} where :math:`v_j` are our cubic basis functions at predetermined knots. See references for exact definition. diff --git a/lifelines/statistics.py b/lifelines/statistics.py index 2acaa4808..7aa094122 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -29,7 +29,6 @@ "proportional_hazard_test", "power_under_cph", "sample_size_necessary_under_cph", - "somers_d", ] diff --git a/lifelines/tests/test_estimation.py b/lifelines/tests/test_estimation.py index 39cfb0c61..3e291e491 100644 --- a/lifelines/tests/test_estimation.py +++ b/lifelines/tests/test_estimation.py @@ -43,6 +43,7 @@ ) from lifelines.fitters import BaseFitter, ParametricUnivariateFitter, ParametricRegressionFitter +from lifelines.fitters.coxph_fitter import SemiParametricPHFitter from lifelines import ( WeibullFitter, @@ -123,6 +124,7 @@ def __init__(self, *args, **kwargs): class CustomRegressionModelTesting(ParametricRegressionFitter): + _FAST_MEDIAN_PREDICT = True _fitted_parameter_names = ["lambda_", "beta_", "rho_"] def __init__(self, **kwargs): @@ -709,6 +711,13 @@ class TestLogNormalFitter: def lnf(self): return LogNormalFitter() + def test_lognormal_model_has_sensible_interval_censored_initial_values_for_data_with_lots_of_infs(self, lnf): + left = [1, 0, 2, 5, 4] + right = [np.inf, np.inf, np.inf, 5, 6] + lnf.fit_interval_censoring(left, right) + assert lnf._initial_values[0] < 10 + assert lnf._initial_values[1] < 10 + def test_fit(self, lnf): T = np.exp(np.random.randn(100000)) E = np.ones_like(T) @@ -812,6 +821,13 @@ class TestLogLogisticFitter: def llf(self): return LogLogisticFitter() + def test_loglogistic_model_has_sensible_interval_censored_initial_values_for_data_with_lots_of_infs(self, llf): + left = [1, 0, 2, 5, 4] + right = [np.inf, np.inf, np.inf, 5, 6] + llf.fit_interval_censoring(left, right) + assert llf._initial_values[0] < 10 + assert llf._initial_values[1] < 10 + def test_loglogistic_model_does_not_except_negative_or_zero_values(self, llf): T = [0, 1, 2, 4, 5] @@ -2642,6 +2658,101 @@ def test_aft_weibull_can_do_interval_prediction(self, aft): aft.print_summary() +class TestCoxPHFitter_SemiParametric: + @pytest.fixture + def cph(self): + return SemiParametricPHFitter() + + def test_single_efron_computed_by_hand_examples(self, data_nus, cph): + X = data_nus["x"][:, None] + T = data_nus["t"] + E = data_nus["E"] + weights = np.ones_like(T) + + # Enforce numpy arrays + X = np.array(X) + T = np.array(T) + E = np.array(E) + + # Want as bools + E = E.astype(bool) + + # tests from http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf + beta = np.array([0]) + + l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) + l = -l + + assert np.abs(u[0] - -2.51) < 0.05 + assert np.abs(l[0][0] - 77.13) < 0.05 + beta = beta + u / l[0] + assert np.abs(beta - -0.0326) < 0.05 + + l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) + l = -l + + assert np.abs(l[0][0] - 72.83) < 0.05 + assert np.abs(u[0] - -0.069) < 0.05 + beta = beta + u / l[0] + assert np.abs(beta - -0.0325) < 0.01 + + l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) + l = -l + + assert np.abs(l[0][0] - 72.70) < 0.01 + assert np.abs(u[0] - -0.000061) < 0.01 + beta = beta + u / l[0] + assert np.abs(beta - -0.0335) < 0.01 + + def test_batch_efron_computed_by_hand_examples(self, data_nus, cph): + X = data_nus["x"][:, None] + T = data_nus["t"] + E = data_nus["E"] + weights = np.ones_like(T) + + # Enforce numpy arrays + X = np.array(X) + T = np.array(T) + E = np.array(E) + + # Want as bools + E = E.astype(bool) + + # tests from http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf + beta = np.array([0]) + + l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) + l = -l + + assert np.abs(u[0] - -2.51) < 0.05 + assert np.abs(l[0][0] - 77.13) < 0.05 + beta = beta + u / l[0] + assert np.abs(beta - -0.0326) < 0.05 + + l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) + l = -l + + assert np.abs(l[0][0] - 72.83) < 0.05 + assert np.abs(u[0] - -0.069) < 0.05 + beta = beta + u / l[0] + assert np.abs(beta - -0.0325) < 0.01 + + l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) + l = -l + + assert np.abs(l[0][0] - 72.70) < 0.01 + assert np.abs(u[0] - -0.000061) < 0.01 + beta = beta + u / l[0] + assert np.abs(beta - -0.0335) < 0.01 + + def test_efron_newtons_method(self, data_nus, cph): + cph._batch_mode = False + newton = cph._newton_rhapson_for_efron_model + X, T, E, W = (data_nus[["x"]], data_nus["t"], data_nus["E"], pd.Series(np.ones_like(data_nus["t"]))) + + assert np.abs(newton(X, T, E, W)[0] - -0.0335) < 0.0001 + + class TestCoxPHFitter: @pytest.fixture def cph(self): @@ -2649,7 +2760,7 @@ def cph(self): @pytest.fixture def cph_spline(self): - return CoxPHFitter(baseline_estimation_method="spline") + return CoxPHFitter(baseline_estimation_method="spline", n_baseline_knots=1) def test_penalizer_can_be_an_array(self, rossi): @@ -2697,8 +2808,19 @@ def test_spline_model_can_handle_specific_outliers(self, cph_spline): cph_sp = CoxPHFitter(baseline_estimation_method="spline", n_baseline_knots=1) cph_sp.fit(test_data, duration_col="Days") + + # check survival is always decreasing assert np.all(cph_sp.baseline_survival_.diff().dropna() < 0) + def test_spline_and_breslow_models_offer_very_comparible_baseline_survivals(self, rossi): + cph_breslow = CoxPHFitter().fit(rossi, "week", "arrest") + cph_spline = CoxPHFitter(baseline_estimation_method="spline", n_baseline_knots=2).fit(rossi, "week", "arrest") + + bh_spline = cph_spline.baseline_survival_at_times() + bh_breslow = cph_breslow.baseline_survival_ + + assert (bh_breslow["baseline survival"] - bh_spline["baseline survival"]).std() < 0.005 + def test_penalty_term_is_used_in_log_likelihood_value(self, rossi): assert ( CoxPHFitter(penalizer=1e-6).fit(rossi, "week", "arrest").log_likelihood_ @@ -2706,9 +2828,15 @@ def test_penalty_term_is_used_in_log_likelihood_value(self, rossi): < CoxPHFitter(penalizer=0).fit(rossi, "week", "arrest").log_likelihood_ ) assert ( - CoxPHFitter(penalizer=1e-6, baseline_estimation_method="spline").fit(rossi, "week", "arrest").log_likelihood_ - < CoxPHFitter(penalizer=1e-8, baseline_estimation_method="spline").fit(rossi, "week", "arrest").log_likelihood_ - < CoxPHFitter(penalizer=0, baseline_estimation_method="spline").fit(rossi, "week", "arrest").log_likelihood_ + CoxPHFitter(penalizer=1e-6, baseline_estimation_method="spline", n_baseline_knots=1) + .fit(rossi, "week", "arrest") + .log_likelihood_ + < CoxPHFitter(penalizer=1e-8, baseline_estimation_method="spline", n_baseline_knots=1) + .fit(rossi, "week", "arrest") + .log_likelihood_ + < CoxPHFitter(penalizer=0, baseline_estimation_method="spline", n_baseline_knots=1) + .fit(rossi, "week", "arrest") + .log_likelihood_ ) def test_baseline_estimation_for_spline(self, rossi, cph_spline): @@ -3134,97 +3262,6 @@ def test_log_likelihood(self, data_nus, cph): cph.fit(data_nus, duration_col="t", event_col="E") assert abs(cph.log_likelihood_ - -12.7601409152) < 0.001 - def test_single_efron_computed_by_hand_examples(self, data_nus, cph): - - X = data_nus["x"][:, None] - T = data_nus["t"] - E = data_nus["E"] - weights = np.ones_like(T) - - # Enforce numpy arrays - X = np.array(X) - T = np.array(T) - E = np.array(E) - - # Want as bools - E = E.astype(bool) - - # tests from http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf - beta = np.array([0]) - - l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) - l = -l - - assert np.abs(u[0] - -2.51) < 0.05 - assert np.abs(l[0][0] - 77.13) < 0.05 - beta = beta + u / l[0] - assert np.abs(beta - -0.0326) < 0.05 - - l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) - l = -l - - assert np.abs(l[0][0] - 72.83) < 0.05 - assert np.abs(u[0] - -0.069) < 0.05 - beta = beta + u / l[0] - assert np.abs(beta - -0.0325) < 0.01 - - l, u, _ = cph._get_efron_values_single(X, T, E, weights, beta) - l = -l - - assert np.abs(l[0][0] - 72.70) < 0.01 - assert np.abs(u[0] - -0.000061) < 0.01 - beta = beta + u / l[0] - assert np.abs(beta - -0.0335) < 0.01 - - def test_batch_efron_computed_by_hand_examples(self, data_nus, cph): - - X = data_nus["x"][:, None] - T = data_nus["t"] - E = data_nus["E"] - weights = np.ones_like(T) - - # Enforce numpy arrays - X = np.array(X) - T = np.array(T) - E = np.array(E) - - # Want as bools - E = E.astype(bool) - - # tests from http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf - beta = np.array([0]) - - l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) - l = -l - - assert np.abs(u[0] - -2.51) < 0.05 - assert np.abs(l[0][0] - 77.13) < 0.05 - beta = beta + u / l[0] - assert np.abs(beta - -0.0326) < 0.05 - - l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) - l = -l - - assert np.abs(l[0][0] - 72.83) < 0.05 - assert np.abs(u[0] - -0.069) < 0.05 - beta = beta + u / l[0] - assert np.abs(beta - -0.0325) < 0.01 - - l, u, _ = cph._get_efron_values_batch(X, T, E, weights, beta) - l = -l - - assert np.abs(l[0][0] - 72.70) < 0.01 - assert np.abs(u[0] - -0.000061) < 0.01 - beta = beta + u / l[0] - assert np.abs(beta - -0.0335) < 0.01 - - def test_efron_newtons_method(self, data_nus, cph): - cph._batch_mode = False - newton = cph._newton_rhapson_for_efron_model - X, T, E, W = (data_nus[["x"]], data_nus["t"], data_nus["E"], pd.Series(np.ones_like(data_nus["t"]))) - - assert np.abs(newton(X, T, E, W)[0] - -0.0335) < 0.0001 - def test_fit_method(self, data_nus, cph): cph.fit(data_nus, duration_col="t", event_col="E") assert np.abs(cph.params_.iloc[0] - -0.0335) < 0.0001 diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 49e412c3c..40596fe07 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -1815,3 +1815,9 @@ def find_best_parametric_model( continue return best_model, best_score + + +def safe_log2(p): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"divide by zero encountered in log2") + return np.log2(p) diff --git a/lifelines/utils/printer.py b/lifelines/utils/printer.py index 5c80f7efc..65def7956 100644 --- a/lifelines/utils/printer.py +++ b/lifelines/utils/printer.py @@ -140,7 +140,7 @@ def ascii_print(self): df.to_string( float_format=utils.format_floats(decimals), formatters={ - **{c: utils.format_exp_floats(decimals) for c in columns if "exp(" in c}, + **{c: utils.format_exp_floats(decimals) for c in columns if "exp(coef)" in c}, **{utils.leading_space("p"): utils.format_p_value(decimals)}, }, columns=[c for c in utils.map_leading_space(first_row_set) if c in columns], @@ -165,4 +165,3 @@ def ascii_print(self): print("---") for string, value in self.footers: print("{} = {}".format(string, value)) - print() diff --git a/lifelines/version.py b/lifelines/version.py index f656938bc..96c176e08 100644 --- a/lifelines/version.py +++ b/lifelines/version.py @@ -1,4 +1,4 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -__version__ = "0.24.9" +__version__ = "0.24.10"